summaryrefslogtreecommitdiffstats
path: root/third_party/aom/av1/common/x86
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/av1/common/x86')
-rw-r--r--third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c228
-rw-r--r--third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c498
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c2254
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h71
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c2904
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h247
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse2.h321
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse4.c22
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse4.h72
-rw-r--r--third_party/aom/av1/common/x86/cdef_block_avx2.c357
-rw-r--r--third_party/aom/av1/common/x86/cdef_block_sse2.c40
-rw-r--r--third_party/aom/av1/common/x86/cdef_block_sse4.c40
-rw-r--r--third_party/aom/av1/common/x86/cdef_block_ssse3.c40
-rw-r--r--third_party/aom/av1/common/x86/cfl_avx2.c495
-rw-r--r--third_party/aom/av1/common/x86/cfl_simd.h246
-rw-r--r--third_party/aom/av1/common/x86/cfl_sse2.c89
-rw-r--r--third_party/aom/av1/common/x86/cfl_ssse3.c397
-rw-r--r--third_party/aom/av1/common/x86/convolve_2d_avx2.c161
-rw-r--r--third_party/aom/av1/common/x86/convolve_2d_sse2.c547
-rw-r--r--third_party/aom/av1/common/x86/convolve_avx2.c916
-rw-r--r--third_party/aom/av1/common/x86/convolve_sse2.c500
-rw-r--r--third_party/aom/av1/common/x86/filterintra_sse4.c350
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c200
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c421
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c414
-rw-r--r--third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c4239
-rw-r--r--third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c5830
-rw-r--r--third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c849
-rw-r--r--third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c381
-rw-r--r--third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h132
-rw-r--r--third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c656
-rw-r--r--third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c636
-rw-r--r--third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c245
-rw-r--r--third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c202
-rw-r--r--third_party/aom/av1/common/x86/intra_edge_sse4.c322
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_avx2.c1124
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_sse2.c606
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_ssse3.c230
-rw-r--r--third_party/aom/av1/common/x86/reconinter_avx2.c624
-rw-r--r--third_party/aom/av1/common/x86/reconinter_sse4.c154
-rw-r--r--third_party/aom/av1/common/x86/reconinter_ssse3.c120
-rw-r--r--third_party/aom/av1/common/x86/resize_ssse3.c974
-rw-r--r--third_party/aom/av1/common/x86/selfguided_avx2.c724
-rw-r--r--third_party/aom/av1/common/x86/selfguided_sse4.c662
-rw-r--r--third_party/aom/av1/common/x86/warp_plane_avx2.c1210
-rw-r--r--third_party/aom/av1/common/x86/warp_plane_sse4.c908
-rw-r--r--third_party/aom/av1/common/x86/wiener_convolve_avx2.c242
-rw-r--r--third_party/aom/av1/common/x86/wiener_convolve_sse2.c199
48 files changed, 33099 insertions, 0 deletions
diff --git a/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c
new file mode 100644
index 0000000000..8aa14696f6
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const int16_t *x_filters, int x0_qn,
+ int x_step_qn) {
+ assert(UPSCALE_NORMATIVE_TAPS == 8);
+
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ const uint8_t *src_y;
+ uint8_t *dst_y;
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+ const int x_filter_idx0 =
+ ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx1 =
+ ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx2 =
+ ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx3 =
+ ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+ assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+ const int16_t *const x_filter0 =
+ &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter1 =
+ &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter2 =
+ &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter3 =
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+ const __m128i fil0_16 = xx_loadu_128(x_filter0);
+ const __m128i fil1_16 = xx_loadu_128(x_filter1);
+ const __m128i fil2_16 = xx_loadu_128(x_filter2);
+ const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+ src_y = src;
+ dst_y = dst;
+ for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+ const uint8_t *const src_x0 =
+ &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x1 =
+ &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 8-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_8 = xx_loadl_64(src_x0);
+ const __m128i src1_8 = xx_loadl_64(src_x1);
+ const __m128i src2_8 = xx_loadl_64(src_x2);
+ const __m128i src3_8 = xx_loadl_64(src_x3);
+
+ // Now zero-extend up to 16-bit precision, i.e.
+ // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
+ const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
+ const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
+ const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
+ const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+ // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+ const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+ const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+ const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+ const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+ // Reduce horizontally and add, i.e.
+ // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+ const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+ const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+ const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+ // Divide down by (1 << FILTER_BITS), rounding to nearest.
+ const __m128i shifted_32 =
+ _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+ // Pack 32-bit values into 16-bit values, i.e.
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+ const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+ // Pack 16-bit values into 8-bit values, i.e.
+ // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
+ // -> [ 0 0 0 0 0 0 DC BA ]
+ const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);
+
+ // Write to the output
+ xx_storel_32(&dst_y[x], shifted_8);
+ }
+ }
+}
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ int x0_qn, int x_step_qn, int bd) {
+ assert(UPSCALE_NORMATIVE_TAPS == 8);
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1);
+
+ const uint16_t *src_y;
+ uint16_t *dst_y;
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+ const int x_filter_idx0 =
+ ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx1 =
+ ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx2 =
+ ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx3 =
+ ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+ assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+ const int16_t *const x_filter0 =
+ &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter1 =
+ &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter2 =
+ &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter3 =
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+ const __m128i fil0_16 = xx_loadu_128(x_filter0);
+ const __m128i fil1_16 = xx_loadu_128(x_filter1);
+ const __m128i fil2_16 = xx_loadu_128(x_filter2);
+ const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+ src_y = src;
+ dst_y = dst;
+ for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+ const uint16_t *const src_x0 =
+ &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x1 =
+ &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 16-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_16 = xx_loadu_128(src_x0);
+ const __m128i src1_16 = xx_loadu_128(src_x1);
+ const __m128i src2_16 = xx_loadu_128(src_x2);
+ const __m128i src3_16 = xx_loadu_128(src_x3);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+ // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+ const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+ const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+ const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+ const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+ // Reduce horizontally and add, i.e.
+ // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+ const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+ const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+ const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+ // Divide down by (1 << FILTER_BITS), rounding to nearest.
+ const __m128i shifted_32 =
+ _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+ // Pack 32-bit values into 16-bit values, i.e.
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+ const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+ // Clip the values at (1 << bd) - 1
+ const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);
+
+ // Write to the output
+ xx_storel_64(&dst_y[x], clipped_16);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
new file mode 100644
index 0000000000..8e293b5bb1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+// A specialised version of hfilter, the horizontal filter for
+// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
+static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
+ int h, int subpel_x_qn, int x_step_qn,
+ const InterpFilterParams *filter_params, int round) {
+ const int bd = 8;
+ const int ntaps = 8;
+
+ src -= ntaps / 2 - 1;
+
+ int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
+ const __m128i round_add = _mm_set1_epi32(round_add32);
+ const __m128i round_shift = _mm_cvtsi32_si128(round);
+
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+ // Load the filter coefficients
+ const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
+ const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
+
+ int y;
+ for (y = 0; y <= h - 4; y += 4) {
+ const uint8_t *const src0 = src_col + y * src_stride;
+ const uint8_t *const src1 = src0 + 1 * src_stride;
+ const uint8_t *const src2 = src0 + 2 * src_stride;
+ const uint8_t *const src3 = src0 + 3 * src_stride;
+
+ // Load up source data. This is 8-bit input data; each load is just
+ // loading the lower half of the register and gets 8 pixels
+ const __m128i data08 = _mm_loadl_epi64((__m128i *)src0);
+ const __m128i data18 = _mm_loadl_epi64((__m128i *)src1);
+ const __m128i data28 = _mm_loadl_epi64((__m128i *)src2);
+ const __m128i data38 = _mm_loadl_epi64((__m128i *)src3);
+
+ // Now zero-extend up to 16-bit precision by interleaving with
+ // zeros. Drop the upper half of each register (which just had zeros)
+ const __m128i data0lo = _mm_unpacklo_epi8(data08, zero);
+ const __m128i data1lo = _mm_unpacklo_epi8(data18, zero);
+ const __m128i data2lo = _mm_unpacklo_epi8(data28, zero);
+ const __m128i data3lo = _mm_unpacklo_epi8(data38, zero);
+
+ // Multiply by coefficients
+ const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
+ const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
+ const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
+ const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
+
+ // Reduce horizontally and add
+ const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
+ const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
+ const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
+
+ // Divide down by (1 << round), rounding to nearest.
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+
+ shifted = _mm_packus_epi32(shifted, shifted);
+ // Write transposed to the output
+ _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
+ }
+ for (; y < h; ++y) {
+ const uint8_t *const src_row = src_col + y * src_stride;
+
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < ntaps; ++k) {
+ sum += filter[k] * src_row[k];
+ }
+
+ dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
+ }
+ }
+}
+
+static __m128i convolve_16_8(const int16_t *src, __m128i coeff) {
+ __m128i data = _mm_loadu_si128((__m128i *)src);
+ return _mm_madd_epi16(data, coeff);
+}
+
+// A specialised version of vfilter, the vertical filter for
+// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
+static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h, int subpel_y_qn,
+ int y_step_qn, const InterpFilterParams *filter_params,
+ const ConvolveParams *conv_params, int bd) {
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int ntaps = 8;
+
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ const __m128i sub = _mm_set1_epi16(sub32);
+
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+ const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1));
+ const __m128i round_shift_add =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16((short)w0);
+ const __m128i wt1 = _mm_set1_epi16((short)w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ int y_qn = subpel_y_qn;
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+ const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
+ int x;
+ for (x = 0; x <= w - 4; x += 4) {
+ const int16_t *const src0 = src_y + x * src_stride;
+ const int16_t *const src1 = src0 + 1 * src_stride;
+ const int16_t *const src2 = src0 + 2 * src_stride;
+ const int16_t *const src3 = src0 + 3 * src_stride;
+
+ // Load the source data for the three rows, adding the three registers of
+ // convolved products to one as we go (conv0..conv3) to avoid the
+ // register pressure getting too high.
+ const __m128i conv0 = convolve_16_8(src0, coeff0716);
+ const __m128i conv1 = convolve_16_8(src1, coeff0716);
+ const __m128i conv2 = convolve_16_8(src2, coeff0716);
+ const __m128i conv3 = convolve_16_8(src3, coeff0716);
+
+ // Now reduce horizontally to get one lane for each result
+ const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
+ const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
+ __m128i conv = _mm_hadd_epi32(conv01, conv23);
+
+ conv = _mm_add_epi32(conv, res_add_const);
+ // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+ uint8_t *dst_x = dst + y * dst_stride + x;
+ __m128i result;
+ __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+
+ if (conv_params->is_compound) {
+ CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+ if (conv_params->do_average) {
+ const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
+ if (conv_params->use_dist_wtd_comp_avg) {
+ const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
+ const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ shifted_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1);
+ }
+ const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+ result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+ const __m128i result_8 = _mm_packus_epi16(result, result);
+ *(int *)dst_x = _mm_cvtsi128_si32(result_8);
+ } else {
+ _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+ }
+ } else {
+ const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+ result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+ const __m128i result_8 = _mm_packus_epi16(result, result);
+ *(int *)dst_x = _mm_cvtsi128_si32(result_8);
+ }
+ }
+ for (; x < w; ++x) {
+ const int16_t *src_x = src_y + x * src_stride;
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - sub32;
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ } else {
+ dst16[y * dst16_stride + x] = res;
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ }
+ }
+ }
+}
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params) {
+ int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+ int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+ filter_params_y->taps;
+
+ const int xtaps = filter_params_x->taps;
+ const int ytaps = filter_params_y->taps;
+ const int fo_vert = ytaps / 2 - 1;
+ assert((xtaps == 8) && (ytaps == 8));
+ (void)xtaps;
+
+ // horizontal filter
+ hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
+ x_step_qn, filter_params_x, conv_params->round_0);
+
+ // vertical filter (input is transposed)
+ vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn,
+ filter_params_y, conv_params, 8);
+}
+
+// A specialised version of hfilter, the horizontal filter for
+// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
+// filters.
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
+ int w, int h, int subpel_x_qn, int x_step_qn,
+ const InterpFilterParams *filter_params, int round,
+ int bd) {
+ const int ntaps = 8;
+
+ src -= ntaps / 2 - 1;
+
+ int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
+ const __m128i round_add = _mm_set1_epi32(round_add32);
+ const __m128i round_shift = _mm_cvtsi32_si128(round);
+
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+ // Load the filter coefficients
+ const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
+
+ int y;
+ for (y = 0; y <= h - 4; y += 4) {
+ const uint16_t *const src0 = src_col + y * src_stride;
+ const uint16_t *const src1 = src0 + 1 * src_stride;
+ const uint16_t *const src2 = src0 + 2 * src_stride;
+ const uint16_t *const src3 = src0 + 3 * src_stride;
+
+ // Load up source data. This is 16-bit input data, so each load gets the 8
+ // pixels we need.
+ const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
+ const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
+ const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
+ const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
+
+ // Multiply by coefficients
+ const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
+ const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
+ const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
+ const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
+
+ // Reduce horizontally and add
+ const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
+ const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
+ const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
+
+ // Divide down by (1 << round), rounding to nearest.
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+
+ shifted = _mm_packus_epi32(shifted, shifted);
+ // Write transposed to the output
+ _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
+ }
+ for (; y < h; ++y) {
+ const uint16_t *const src_row = src_col + y * src_stride;
+
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < ntaps; ++k) {
+ sum += filter[k] * src_row[k];
+ }
+
+ dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
+ }
+ }
+}
+// A specialised version of vfilter, the vertical filter for
+// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
+// filters.
+static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
+ int dst_stride, int w, int h, int subpel_y_qn,
+ int y_step_qn,
+ const InterpFilterParams *filter_params,
+ const ConvolveParams *conv_params, int bd) {
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int ntaps = 8;
+
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ const __m128i sub = _mm_set1_epi32(sub32);
+
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ const __m128i clip_pixel_ =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+ const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1));
+ const __m128i round_shift_add =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+ __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+
+ int y_qn = subpel_y_qn;
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+ const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
+ int x;
+ for (x = 0; x <= w - 4; x += 4) {
+ const int16_t *const src0 = src_y + x * src_stride;
+ const int16_t *const src1 = src0 + 1 * src_stride;
+ const int16_t *const src2 = src0 + 2 * src_stride;
+ const int16_t *const src3 = src0 + 3 * src_stride;
+
+ // Load the source data for the three rows, adding the three registers of
+ // convolved products to one as we go (conv0..conv3) to avoid the
+ // register pressure getting too high.
+ const __m128i conv0 = convolve_16_8(src0, coeff0716);
+ const __m128i conv1 = convolve_16_8(src1, coeff0716);
+ const __m128i conv2 = convolve_16_8(src2, coeff0716);
+ const __m128i conv3 = convolve_16_8(src3, coeff0716);
+
+ // Now reduce horizontally to get one lane for each result
+ const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
+ const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
+ __m128i conv = _mm_hadd_epi32(conv01, conv23);
+ conv = _mm_add_epi32(conv, res_add_const);
+
+ // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+ uint16_t *dst_x = dst + y * dst_stride + x;
+
+ __m128i result;
+ if (conv_params->is_compound) {
+ CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+ if (conv_params->do_average) {
+ __m128i p_32 =
+ _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
+
+ if (conv_params->use_dist_wtd_comp_avg) {
+ shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+ _mm_mullo_epi32(shifted, wt1));
+ shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
+ } else {
+ shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1);
+ }
+ result = _mm_sub_epi32(shifted, sub);
+ result = _mm_sra_epi32(_mm_add_epi32(result, round_bits_const),
+ round_bits_shift);
+
+ result = _mm_packus_epi32(result, result);
+ result = _mm_min_epi16(result, clip_pixel_);
+ _mm_storel_epi64((__m128i *)dst_x, result);
+ } else {
+ __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+ _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+ }
+ } else {
+ result = _mm_sub_epi32(shifted, sub);
+ result = _mm_sra_epi16(_mm_add_epi32(result, bits_const), bits_shift);
+ result = _mm_packus_epi32(result, result);
+ result = _mm_min_epi16(result, clip_pixel_);
+ _mm_storel_epi64((__m128i *)dst_x, result);
+ }
+ }
+
+ for (; x < w; ++x) {
+ const int16_t *src_x = src_y + x * src_stride;
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ } else {
+ dst16[y * dst16_stride + x] = res;
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_2d_scale_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params, int bd) {
+ // TODO(yaowu): Move this out of stack
+ DECLARE_ALIGNED(16, int16_t,
+ tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+ int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+ filter_params_y->taps;
+ const int xtaps = filter_params_x->taps;
+ const int ytaps = filter_params_y->taps;
+ const int fo_vert = ytaps / 2 - 1;
+
+ memset(tmp, 0, sizeof(tmp));
+ assert((xtaps == 8) && (ytaps == 8));
+ (void)xtaps;
+
+ // horizontal filter
+ highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
+ subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0,
+ bd);
+
+ // vertical filter (input is transposed)
+ highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
+ filter_params_y, conv_params, bd);
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
new file mode 100644
index 0000000000..0afd42b170
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -0,0 +1,2254 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/av1_inv_txfm_avx2.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
+static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+ 4 * 5793 };
+
+static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+}
+
+static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+}
+
+static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
+ btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
+ btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
+ btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
+ btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
+ btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
+ btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
+ btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
+ btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
+}
+
+static void idct16_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+ __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+ __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+ __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+ __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+ __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+ __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+ __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[1] = input[8];
+ x1[2] = input[4];
+ x1[3] = input[12];
+ x1[4] = input[2];
+ x1[5] = input[10];
+ x1[6] = input[6];
+ x1[7] = input[14];
+ x1[8] = input[1];
+ x1[9] = input[9];
+ x1[10] = input[5];
+ x1[11] = input[13];
+ x1[12] = input[3];
+ x1[13] = input[11];
+ x1[14] = input[7];
+ x1[15] = input[15];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
+ INV_COS_BIT);
+
+ // stage 3
+ btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
+ INV_COS_BIT);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
+ INV_COS_BIT);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
+ INV_COS_BIT);
+
+ idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
+ idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
+ idct16_stage7_avx2(output, x1);
+}
+
+static void idct16_low8_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[2] = input[4];
+ x1[4] = input[2];
+ x1[6] = input[6];
+ x1[8] = input[1];
+ x1[10] = input[5];
+ x1[12] = input[3];
+ x1[14] = input[7];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
+ btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
+ btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
+ btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
+ btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
+ INV_COS_BIT);
+
+ idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
+ idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
+ idct16_stage7_avx2(output, x1);
+}
+
+static void idct16_low1_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m256i x1[2];
+ x1[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
+
+ // stage 5
+ // stage 6
+ output[0] = x1[0];
+ output[1] = x1[1];
+ output[2] = x1[1];
+ output[3] = x1[0];
+ output[4] = x1[0];
+ output[5] = x1[1];
+ output[6] = x1[1];
+ output[7] = x1[0];
+ output[8] = x1[0];
+ output[9] = x1[1];
+ output[10] = x1[1];
+ output[11] = x1[0];
+ output[12] = x1[0];
+ output[13] = x1[1];
+ output[14] = x1[1];
+ output[15] = x1[0];
+}
+
+static INLINE void iadst16_stage3_avx2(__m256i *x) {
+ btf_16_adds_subs_avx2(&x[0], &x[8]);
+ btf_16_adds_subs_avx2(&x[1], &x[9]);
+ btf_16_adds_subs_avx2(&x[2], &x[10]);
+ btf_16_adds_subs_avx2(&x[3], &x[11]);
+ btf_16_adds_subs_avx2(&x[4], &x[12]);
+ btf_16_adds_subs_avx2(&x[5], &x[13]);
+ btf_16_adds_subs_avx2(&x[6], &x[14]);
+ btf_16_adds_subs_avx2(&x[7], &x[15]);
+}
+
+static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+ const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
+}
+
+static INLINE void iadst16_stage5_avx2(__m256i *x) {
+ btf_16_adds_subs_avx2(&x[0], &x[4]);
+ btf_16_adds_subs_avx2(&x[1], &x[5]);
+ btf_16_adds_subs_avx2(&x[2], &x[6]);
+ btf_16_adds_subs_avx2(&x[3], &x[7]);
+ btf_16_adds_subs_avx2(&x[8], &x[12]);
+ btf_16_adds_subs_avx2(&x[9], &x[13]);
+ btf_16_adds_subs_avx2(&x[10], &x[14]);
+ btf_16_adds_subs_avx2(&x[11], &x[15]);
+}
+
+static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
+}
+
+static INLINE void iadst16_stage7_avx2(__m256i *x) {
+ btf_16_adds_subs_avx2(&x[0], &x[2]);
+ btf_16_adds_subs_avx2(&x[1], &x[3]);
+ btf_16_adds_subs_avx2(&x[4], &x[6]);
+ btf_16_adds_subs_avx2(&x[5], &x[7]);
+ btf_16_adds_subs_avx2(&x[8], &x[10]);
+ btf_16_adds_subs_avx2(&x[9], &x[11]);
+ btf_16_adds_subs_avx2(&x[12], &x[14]);
+ btf_16_adds_subs_avx2(&x[13], &x[15]);
+}
+
+static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
+}
+
+static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
+ const __m256i __zero = _mm256_setzero_si256();
+ output[0] = x1[0];
+ output[1] = _mm256_subs_epi16(__zero, x1[8]);
+ output[2] = x1[12];
+ output[3] = _mm256_subs_epi16(__zero, x1[4]);
+ output[4] = x1[6];
+ output[5] = _mm256_subs_epi16(__zero, x1[14]);
+ output[6] = x1[10];
+ output[7] = _mm256_subs_epi16(__zero, x1[2]);
+ output[8] = x1[3];
+ output[9] = _mm256_subs_epi16(__zero, x1[11]);
+ output[10] = x1[15];
+ output[11] = _mm256_subs_epi16(__zero, x1[7]);
+ output[12] = x1[5];
+ output[13] = _mm256_subs_epi16(__zero, x1[13]);
+ output[14] = x1[9];
+ output[15] = _mm256_subs_epi16(__zero, x1[1]);
+}
+
+static void iadst16_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+ __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+ __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+ __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+ __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+ __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+ __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+ __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+ __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+ __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+ __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+ __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+ __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+ __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+ __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+ __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[15];
+ x1[1] = input[0];
+ x1[2] = input[13];
+ x1[3] = input[2];
+ x1[4] = input[11];
+ x1[5] = input[4];
+ x1[6] = input[9];
+ x1[7] = input[6];
+ x1[8] = input[7];
+ x1[9] = input[8];
+ x1[10] = input[5];
+ x1[11] = input[10];
+ x1[12] = input[3];
+ x1[13] = input[12];
+ x1[14] = input[1];
+ x1[15] = input[14];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r,
+ INV_COS_BIT);
+
+ iadst16_stage3_avx2(x1);
+ iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
+ iadst16_stage5_avx2(x1);
+ iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
+ iadst16_stage7_avx2(x1);
+ iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
+ iadst16_stage9_avx2(output, x1);
+}
+
+static void iadst16_low8_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m256i x1[16];
+ x1[1] = input[0];
+ x1[3] = input[2];
+ x1[5] = input[4];
+ x1[7] = input[6];
+ x1[8] = input[7];
+ x1[10] = input[5];
+ x1[12] = input[3];
+ x1[14] = input[1];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+ btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
+ btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
+ btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
+ btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
+ btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
+ btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
+ btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
+
+ iadst16_stage3_avx2(x1);
+ iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
+ iadst16_stage5_avx2(x1);
+ iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
+ iadst16_stage7_avx2(x1);
+ iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
+ iadst16_stage9_avx2(output, x1);
+}
+
+static void iadst16_low1_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[1] = input[0];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+
+ // stage 3
+ x1[8] = x1[0];
+ x1[9] = x1[1];
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r,
+ INV_COS_BIT);
+
+ // stage 5
+ x1[4] = x1[0];
+ x1[5] = x1[1];
+
+ x1[12] = x1[8];
+ x1[13] = x1[9];
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r,
+ INV_COS_BIT);
+
+ // stage 7
+ x1[2] = x1[0];
+ x1[3] = x1[1];
+ x1[6] = x1[4];
+ x1[7] = x1[5];
+ x1[10] = x1[8];
+ x1[11] = x1[9];
+ x1[14] = x1[12];
+ x1[15] = x1[13];
+
+ iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
+ iadst16_stage9_avx2(output, x1);
+}
+
+static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
+ btf_16_adds_subs_avx2(&x[16], &x[17]);
+ btf_16_adds_subs_avx2(&x[19], &x[18]);
+ btf_16_adds_subs_avx2(&x[20], &x[21]);
+ btf_16_adds_subs_avx2(&x[23], &x[22]);
+ btf_16_adds_subs_avx2(&x[24], &x[25]);
+ btf_16_adds_subs_avx2(&x[27], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[29]);
+ btf_16_adds_subs_avx2(&x[31], &x[30]);
+}
+
+static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
+}
+
+static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[16], &x[19]);
+ btf_16_adds_subs_avx2(&x[17], &x[18]);
+ btf_16_adds_subs_avx2(&x[23], &x[20]);
+ btf_16_adds_subs_avx2(&x[22], &x[21]);
+ btf_16_adds_subs_avx2(&x[24], &x[27]);
+ btf_16_adds_subs_avx2(&x[25], &x[26]);
+ btf_16_adds_subs_avx2(&x[31], &x[28]);
+ btf_16_adds_subs_avx2(&x[30], &x[29]);
+}
+
+static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
+}
+
+static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[16], &x[23]);
+ btf_16_adds_subs_avx2(&x[17], &x[22]);
+ btf_16_adds_subs_avx2(&x[18], &x[21]);
+ btf_16_adds_subs_avx2(&x[19], &x[20]);
+ btf_16_adds_subs_avx2(&x[31], &x[24]);
+ btf_16_adds_subs_avx2(&x[30], &x[25]);
+ btf_16_adds_subs_avx2(&x[29], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[27]);
+}
+
+static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(&x[0], &x[15]);
+ btf_16_adds_subs_avx2(&x[1], &x[14]);
+ btf_16_adds_subs_avx2(&x[2], &x[13]);
+ btf_16_adds_subs_avx2(&x[3], &x[12]);
+ btf_16_adds_subs_avx2(&x[4], &x[11]);
+ btf_16_adds_subs_avx2(&x[5], &x[10]);
+ btf_16_adds_subs_avx2(&x[6], &x[9]);
+ btf_16_adds_subs_avx2(&x[7], &x[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
+}
+
+static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
+ btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
+ btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
+ btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
+ btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
+ btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
+ btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
+ btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
+ btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
+ btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
+ btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
+ btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
+ btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
+ btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
+ btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
+ btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
+ btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
+}
+
+static void idct32_low1_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m256i x[2];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ output[0] = x[0];
+ output[31] = x[0];
+ output[1] = x[1];
+ output[30] = x[1];
+ output[2] = x[1];
+ output[29] = x[1];
+ output[3] = x[0];
+ output[28] = x[0];
+ output[4] = x[0];
+ output[27] = x[0];
+ output[5] = x[1];
+ output[26] = x[1];
+ output[6] = x[1];
+ output[25] = x[1];
+ output[7] = x[0];
+ output[24] = x[0];
+ output[8] = x[0];
+ output[23] = x[0];
+ output[9] = x[1];
+ output[22] = x[1];
+ output[10] = x[1];
+ output[21] = x[1];
+ output[11] = x[0];
+ output[20] = x[0];
+ output[12] = x[0];
+ output[19] = x[0];
+ output[13] = x[1];
+ output[18] = x[1];
+ output[14] = x[1];
+ output[17] = x[1];
+ output[15] = x[0];
+ output[16] = x[0];
+}
+
+static void idct32_low8_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m256i x[32];
+ x[0] = input[0];
+ x[4] = input[4];
+ x[8] = input[2];
+ x[12] = input[6];
+ x[16] = input[1];
+ x[20] = input[5];
+ x[24] = input[3];
+ x[28] = input[7];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 5
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
+ // stage 6
+ x[3] = x[0];
+ x[2] = x[1];
+ idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
+
+ idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
+ idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
+ idct32_stage9_avx2(output, x);
+}
+
+static void idct32_low16_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m256i x[32];
+ x[0] = input[0];
+ x[2] = input[8];
+ x[4] = input[4];
+ x[6] = input[12];
+ x[8] = input[2];
+ x[10] = input[10];
+ x[12] = input[6];
+ x[14] = input[14];
+ x[16] = input[1];
+ x[18] = input[9];
+ x[20] = input[5];
+ x[22] = input[13];
+ x[24] = input[3];
+ x[26] = input[11];
+ x[28] = input[7];
+ x[30] = input[15];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ idct32_high16_stage3_avx2(x);
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_avx2(&x[8], &x[9]);
+ btf_16_adds_subs_avx2(&x[11], &x[10]);
+ btf_16_adds_subs_avx2(&x[12], &x[13]);
+ btf_16_adds_subs_avx2(&x[15], &x[14]);
+ idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 5
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_avx2(&x[4], &x[5]);
+ btf_16_adds_subs_avx2(&x[7], &x[6]);
+ idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
+
+ btf_16_adds_subs_avx2(&x[0], &x[3]);
+ btf_16_adds_subs_avx2(&x[1], &x[2]);
+ idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
+
+ idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
+ idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
+ idct32_stage9_avx2(output, x);
+}
+
+static void idct32_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+ __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+ __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+ __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+ __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+ __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+ __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+ __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+ __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+ __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+ __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+ __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+ __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+ __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+ __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+ __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+ __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+ __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+ __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+ __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+ __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+ __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+ __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+ __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ __m256i x1[32];
+ x1[0] = input[0];
+ x1[1] = input[16];
+ x1[2] = input[8];
+ x1[3] = input[24];
+ x1[4] = input[4];
+ x1[5] = input[20];
+ x1[6] = input[12];
+ x1[7] = input[28];
+ x1[8] = input[2];
+ x1[9] = input[18];
+ x1[10] = input[10];
+ x1[11] = input[26];
+ x1[12] = input[6];
+ x1[13] = input[22];
+ x1[14] = input[14];
+ x1[15] = input[30];
+ x1[16] = input[1];
+ x1[17] = input[17];
+ x1[18] = input[9];
+ x1[19] = input[25];
+ x1[20] = input[5];
+ x1[21] = input[21];
+ x1[22] = input[13];
+ x1[23] = input[29];
+ x1[24] = input[3];
+ x1[25] = input[19];
+ x1[26] = input[11];
+ x1[27] = input[27];
+ x1[28] = input[7];
+ x1[29] = input[23];
+ x1[30] = input[15];
+ x1[31] = input[31];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r,
+ INV_COS_BIT);
+
+ // stage 3
+ btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
+ INV_COS_BIT);
+ idct32_high16_stage3_avx2(x1);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
+ INV_COS_BIT);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
+
+ // stage 5
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
+ INV_COS_BIT);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
+
+ // stage 6
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
+
+ idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT);
+ idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
+ idct32_stage9_avx2(output, x1);
+}
+
+static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
+ const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
+ const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
+}
+
+static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[32], &x[35]);
+ btf_16_adds_subs_avx2(&x[33], &x[34]);
+ btf_16_adds_subs_avx2(&x[39], &x[36]);
+ btf_16_adds_subs_avx2(&x[38], &x[37]);
+ btf_16_adds_subs_avx2(&x[40], &x[43]);
+ btf_16_adds_subs_avx2(&x[41], &x[42]);
+ btf_16_adds_subs_avx2(&x[47], &x[44]);
+ btf_16_adds_subs_avx2(&x[46], &x[45]);
+ btf_16_adds_subs_avx2(&x[48], &x[51]);
+ btf_16_adds_subs_avx2(&x[49], &x[50]);
+ btf_16_adds_subs_avx2(&x[55], &x[52]);
+ btf_16_adds_subs_avx2(&x[54], &x[53]);
+ btf_16_adds_subs_avx2(&x[56], &x[59]);
+ btf_16_adds_subs_avx2(&x[57], &x[58]);
+ btf_16_adds_subs_avx2(&x[63], &x[60]);
+ btf_16_adds_subs_avx2(&x[62], &x[61]);
+}
+
+static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
+}
+
+static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ btf_16_adds_subs_avx2(&x[16], &x[19]);
+ btf_16_adds_subs_avx2(&x[17], &x[18]);
+ btf_16_adds_subs_avx2(&x[23], &x[20]);
+ btf_16_adds_subs_avx2(&x[22], &x[21]);
+ btf_16_adds_subs_avx2(&x[24], &x[27]);
+ btf_16_adds_subs_avx2(&x[25], &x[26]);
+ btf_16_adds_subs_avx2(&x[31], &x[28]);
+ btf_16_adds_subs_avx2(&x[30], &x[29]);
+ idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
+}
+
+static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[32], &x[39]);
+ btf_16_adds_subs_avx2(&x[33], &x[38]);
+ btf_16_adds_subs_avx2(&x[34], &x[37]);
+ btf_16_adds_subs_avx2(&x[35], &x[36]);
+ btf_16_adds_subs_avx2(&x[47], &x[40]);
+ btf_16_adds_subs_avx2(&x[46], &x[41]);
+ btf_16_adds_subs_avx2(&x[45], &x[42]);
+ btf_16_adds_subs_avx2(&x[44], &x[43]);
+ btf_16_adds_subs_avx2(&x[48], &x[55]);
+ btf_16_adds_subs_avx2(&x[49], &x[54]);
+ btf_16_adds_subs_avx2(&x[50], &x[53]);
+ btf_16_adds_subs_avx2(&x[51], &x[52]);
+ btf_16_adds_subs_avx2(&x[63], &x[56]);
+ btf_16_adds_subs_avx2(&x[62], &x[57]);
+ btf_16_adds_subs_avx2(&x[61], &x[58]);
+ btf_16_adds_subs_avx2(&x[60], &x[59]);
+}
+
+static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ btf_16_adds_subs_avx2(&x[16], &x[23]);
+ btf_16_adds_subs_avx2(&x[17], &x[22]);
+ btf_16_adds_subs_avx2(&x[18], &x[21]);
+ btf_16_adds_subs_avx2(&x[19], &x[20]);
+ btf_16_adds_subs_avx2(&x[31], &x[24]);
+ btf_16_adds_subs_avx2(&x[30], &x[25]);
+ btf_16_adds_subs_avx2(&x[29], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[27]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
+}
+
+static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(&x[0], &x[15]);
+ btf_16_adds_subs_avx2(&x[1], &x[14]);
+ btf_16_adds_subs_avx2(&x[2], &x[13]);
+ btf_16_adds_subs_avx2(&x[3], &x[12]);
+ btf_16_adds_subs_avx2(&x[4], &x[11]);
+ btf_16_adds_subs_avx2(&x[5], &x[10]);
+ btf_16_adds_subs_avx2(&x[6], &x[9]);
+ btf_16_adds_subs_avx2(&x[7], &x[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[32], &x[47]);
+ btf_16_adds_subs_avx2(&x[33], &x[46]);
+ btf_16_adds_subs_avx2(&x[34], &x[45]);
+ btf_16_adds_subs_avx2(&x[35], &x[44]);
+ btf_16_adds_subs_avx2(&x[36], &x[43]);
+ btf_16_adds_subs_avx2(&x[37], &x[42]);
+ btf_16_adds_subs_avx2(&x[38], &x[41]);
+ btf_16_adds_subs_avx2(&x[39], &x[40]);
+ btf_16_adds_subs_avx2(&x[63], &x[48]);
+ btf_16_adds_subs_avx2(&x[62], &x[49]);
+ btf_16_adds_subs_avx2(&x[61], &x[50]);
+ btf_16_adds_subs_avx2(&x[60], &x[51]);
+ btf_16_adds_subs_avx2(&x[59], &x[52]);
+ btf_16_adds_subs_avx2(&x[58], &x[53]);
+ btf_16_adds_subs_avx2(&x[57], &x[54]);
+ btf_16_adds_subs_avx2(&x[56], &x[55]);
+}
+
+static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(&x[0], &x[31]);
+ btf_16_adds_subs_avx2(&x[1], &x[30]);
+ btf_16_adds_subs_avx2(&x[2], &x[29]);
+ btf_16_adds_subs_avx2(&x[3], &x[28]);
+ btf_16_adds_subs_avx2(&x[4], &x[27]);
+ btf_16_adds_subs_avx2(&x[5], &x[26]);
+ btf_16_adds_subs_avx2(&x[6], &x[25]);
+ btf_16_adds_subs_avx2(&x[7], &x[24]);
+ btf_16_adds_subs_avx2(&x[8], &x[23]);
+ btf_16_adds_subs_avx2(&x[9], &x[22]);
+ btf_16_adds_subs_avx2(&x[10], &x[21]);
+ btf_16_adds_subs_avx2(&x[11], &x[20]);
+ btf_16_adds_subs_avx2(&x[12], &x[19]);
+ btf_16_adds_subs_avx2(&x[13], &x[18]);
+ btf_16_adds_subs_avx2(&x[14], &x[17]);
+ btf_16_adds_subs_avx2(&x[15], &x[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
+}
+
+static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
+ btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
+ btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
+ btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
+ btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
+ btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
+ btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
+ btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
+ btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
+ btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
+ btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
+ btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
+ btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
+ btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
+ btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
+ btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
+ btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
+ btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
+ btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
+ btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
+ btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
+ btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
+ btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
+ btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
+ btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
+ btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
+ btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
+ btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
+ btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
+ btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
+ btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
+ btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
+ btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
+}
+
+static void idct64_low1_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m256i x[32];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 7
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ output[0] = x[0];
+ output[63] = x[0];
+ output[1] = x[1];
+ output[62] = x[1];
+ output[2] = x[1];
+ output[61] = x[1];
+ output[3] = x[0];
+ output[60] = x[0];
+ output[4] = x[0];
+ output[59] = x[0];
+ output[5] = x[1];
+ output[58] = x[1];
+ output[6] = x[1];
+ output[57] = x[1];
+ output[7] = x[0];
+ output[56] = x[0];
+ output[8] = x[0];
+ output[55] = x[0];
+ output[9] = x[1];
+ output[54] = x[1];
+ output[10] = x[1];
+ output[53] = x[1];
+ output[11] = x[0];
+ output[52] = x[0];
+ output[12] = x[0];
+ output[51] = x[0];
+ output[13] = x[1];
+ output[50] = x[1];
+ output[14] = x[1];
+ output[49] = x[1];
+ output[15] = x[0];
+ output[48] = x[0];
+ output[16] = x[0];
+ output[47] = x[0];
+ output[17] = x[1];
+ output[46] = x[1];
+ output[18] = x[1];
+ output[45] = x[1];
+ output[19] = x[0];
+ output[44] = x[0];
+ output[20] = x[0];
+ output[43] = x[0];
+ output[21] = x[1];
+ output[42] = x[1];
+ output[22] = x[1];
+ output[41] = x[1];
+ output[23] = x[0];
+ output[40] = x[0];
+ output[24] = x[0];
+ output[39] = x[0];
+ output[25] = x[1];
+ output[38] = x[1];
+ output[26] = x[1];
+ output[37] = x[1];
+ output[27] = x[0];
+ output[36] = x[0];
+ output[28] = x[0];
+ output[35] = x[0];
+ output[29] = x[1];
+ output[34] = x[1];
+ output[30] = x[1];
+ output[33] = x[1];
+ output[31] = x[0];
+ output[32] = x[0];
+}
+
+static void idct64_low8_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m256i x[64];
+ x[0] = input[0];
+ x[8] = input[4];
+ x[16] = input[2];
+ x[24] = input[6];
+ x[32] = input[1];
+ x[40] = input[5];
+ x[48] = input[3];
+ x[56] = input[7];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ x[17] = x[16];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[30] = x[31];
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r,
+ INV_COS_BIT);
+
+ // stage 5
+ x[9] = x[8];
+ x[14] = x[15];
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r,
+ INV_COS_BIT);
+ x[35] = x[32];
+ x[34] = x[33];
+ x[36] = x[39];
+ x[37] = x[38];
+ x[43] = x[40];
+ x[42] = x[41];
+ x[44] = x[47];
+ x[45] = x[46];
+ x[51] = x[48];
+ x[50] = x[49];
+ x[52] = x[55];
+ x[53] = x[54];
+ x[59] = x[56];
+ x[58] = x[57];
+ x[60] = x[63];
+ x[61] = x[62];
+
+ // stage 6
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
+ x[19] = x[16];
+ x[18] = x[17];
+ x[20] = x[23];
+ x[21] = x[22];
+ x[27] = x[24];
+ x[26] = x[25];
+ x[28] = x[31];
+ x[29] = x[30];
+ idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ x[11] = x[8];
+ x[10] = x[9];
+ x[12] = x[15];
+ x[13] = x[14];
+ idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 8
+ x[7] = x[0];
+ x[6] = x[1];
+ x[5] = x[2];
+ x[4] = x[3];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
+ INV_COS_BIT);
+ idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
+ idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
+ idct64_stage11_avx2(output, x);
+}
+
+static void idct64_low16_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m256i x[64];
+ x[0] = input[0];
+ x[4] = input[8];
+ x[8] = input[4];
+ x[12] = input[12];
+ x[16] = input[2];
+ x[20] = input[10];
+ x[24] = input[6];
+ x[28] = input[14];
+ x[32] = input[1];
+ x[36] = input[9];
+ x[40] = input[5];
+ x[44] = input[13];
+ x[48] = input[3];
+ x[52] = input[11];
+ x[56] = input[7];
+ x[60] = input[15];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[34] = x[35];
+ x[37] = x[36];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[42] = x[43];
+ x[45] = x[44];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[50] = x[51];
+ x[53] = x[52];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[58] = x[59];
+ x[61] = x[60];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+ idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 5
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 6
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
+ INV_COS_BIT);
+ idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 8
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
+ INV_COS_BIT);
+ idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
+ idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
+ idct64_stage11_avx2(output, x);
+}
+
+static void idct64_low32_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m256i x[64];
+ x[0] = input[0];
+ x[2] = input[16];
+ x[4] = input[8];
+ x[6] = input[24];
+ x[8] = input[4];
+ x[10] = input[20];
+ x[12] = input[12];
+ x[14] = input[28];
+ x[16] = input[2];
+ x[18] = input[18];
+ x[20] = input[10];
+ x[22] = input[26];
+ x[24] = input[6];
+ x[26] = input[22];
+ x[28] = input[14];
+ x[30] = input[30];
+ x[32] = input[1];
+ x[34] = input[17];
+ x[36] = input[9];
+ x[38] = input[25];
+ x[40] = input[5];
+ x[42] = input[21];
+ x[44] = input[13];
+ x[46] = input[29];
+ x[48] = input[3];
+ x[50] = input[19];
+ x[52] = input[11];
+ x[54] = input[27];
+ x[56] = input[7];
+ x[58] = input[23];
+ x[60] = input[15];
+ x[62] = input[31];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
+ btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
+ btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
+ btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
+ btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
+ btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
+ btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ btf_16_adds_subs_avx2(&x[32], &x[33]);
+ btf_16_adds_subs_avx2(&x[35], &x[34]);
+ btf_16_adds_subs_avx2(&x[36], &x[37]);
+ btf_16_adds_subs_avx2(&x[39], &x[38]);
+ btf_16_adds_subs_avx2(&x[40], &x[41]);
+ btf_16_adds_subs_avx2(&x[43], &x[42]);
+ btf_16_adds_subs_avx2(&x[44], &x[45]);
+ btf_16_adds_subs_avx2(&x[47], &x[46]);
+ btf_16_adds_subs_avx2(&x[48], &x[49]);
+ btf_16_adds_subs_avx2(&x[51], &x[50]);
+ btf_16_adds_subs_avx2(&x[52], &x[53]);
+ btf_16_adds_subs_avx2(&x[55], &x[54]);
+ btf_16_adds_subs_avx2(&x[56], &x[57]);
+ btf_16_adds_subs_avx2(&x[59], &x[58]);
+ btf_16_adds_subs_avx2(&x[60], &x[61]);
+ btf_16_adds_subs_avx2(&x[63], &x[62]);
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ btf_16_adds_subs_avx2(&x[16], &x[17]);
+ btf_16_adds_subs_avx2(&x[19], &x[18]);
+ btf_16_adds_subs_avx2(&x[20], &x[21]);
+ btf_16_adds_subs_avx2(&x[23], &x[22]);
+ btf_16_adds_subs_avx2(&x[24], &x[25]);
+ btf_16_adds_subs_avx2(&x[27], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[29]);
+ btf_16_adds_subs_avx2(&x[31], &x[30]);
+ idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 5
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_avx2(&x[8], &x[9]);
+ btf_16_adds_subs_avx2(&x[11], &x[10]);
+ btf_16_adds_subs_avx2(&x[12], &x[13]);
+ btf_16_adds_subs_avx2(&x[15], &x[14]);
+ idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 6
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_avx2(&x[4], &x[5]);
+ btf_16_adds_subs_avx2(&x[7], &x[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
+ INV_COS_BIT);
+ idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 7
+ btf_16_adds_subs_avx2(&x[0], &x[3]);
+ btf_16_adds_subs_avx2(&x[1], &x[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 8
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
+ INV_COS_BIT);
+ idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 9~11
+ idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
+ idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
+ idct64_stage11_avx2(output, x);
+}
+
+typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output);
+
+// 1D functions process 16 pixels at one time.
+static const transform_1d_avx2
+ lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ {
+ { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
+ { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2,
+ idct64_low32_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+// only process w >= 16 h >= 16
+static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m256i buf1[64 * 16];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div16 = txfm_size_col >> 4;
+ const int buf_size_nonzero_w = ((eobx + 16) >> 4) << 4;
+ const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
+ const int input_stride = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_avx2 row_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_avx2 col_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
+ for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
+ __m256i buf0[64];
+ load_buffer_32bit_to_16bit_w16_avx2(input + 16 * i, input_stride, buf0,
+ buf_size_nonzero_w);
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_avx2(buf0, buf0, buf_size_nonzero_w); // rect special code
+ }
+ row_txfm(buf0, buf0);
+ for (int j = 0; j < txfm_size_col; ++j) {
+ buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
+ }
+
+ __m256i *buf1_cur = buf1 + (i << 4);
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ __m256i temp[16];
+ flip_buf_avx2(buf0 + 16 * j, temp, 16);
+ int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
+ transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
+ }
+ }
+ }
+ const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
+ for (int i = 0; i < buf_size_w_div16; i++) {
+ __m256i *buf1_cur = buf1 + i * txfm_size_row;
+ col_txfm(buf1_cur, buf1_cur);
+ for (int j = 0; j < txfm_size_row; ++j) {
+ buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
+ }
+ }
+ for (int i = 0; i < buf_size_w_div16; i++) {
+ lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
+ stride, ud_flip, txfm_size_row);
+ }
+}
+
+static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
+ int stride, int shift, int height,
+ int txw_idx, int rect_type) {
+ const int32_t *input_row = input;
+ const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
+ const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+ (1 << (NewSqrt2Bits - shift - 1)));
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
+ if (rect_type != 1 && rect_type != -1) {
+ for (int i = 0; i < height; ++i) {
+ const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
+ input_row += stride;
+ __m256i lo = _mm256_unpacklo_epi16(src, one);
+ __m256i hi = _mm256_unpackhi_epi16(src, one);
+ lo = _mm256_madd_epi16(lo, scale__r);
+ hi = _mm256_madd_epi16(hi, scale__r);
+ lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm256_packs_epi32(lo, hi);
+ }
+ } else {
+ const __m256i rect_scale =
+ _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
+ for (int i = 0; i < height; ++i) {
+ __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
+ src = _mm256_mulhrs_epi16(src, rect_scale);
+ input_row += stride;
+ __m256i lo = _mm256_unpacklo_epi16(src, one);
+ __m256i hi = _mm256_unpackhi_epi16(src, one);
+ lo = _mm256_madd_epi16(lo, scale__r);
+ hi = _mm256_madd_epi16(hi, scale__r);
+ lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm256_packs_epi32(lo, hi);
+ }
+ }
+}
+
+static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
+ __m256i *buf, int shift, int height,
+ int txh_idx) {
+ const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
+ const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
+ const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
+ for (int h = 0; h < height; ++h) {
+ __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
+ __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
+ lo = _mm256_madd_epi16(lo, scale_coeff);
+ hi = _mm256_madd_epi16(hi, scale_coeff);
+ lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
+ hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
+ lo = _mm256_add_epi32(lo, shift__r);
+ hi = _mm256_add_epi32(hi, shift__r);
+ lo = _mm256_srai_epi32(lo, -shift);
+ hi = _mm256_srai_epi32(hi, -shift);
+ const __m256i x = _mm256_packs_epi32(lo, hi);
+ write_recon_w16_avx2(x, output);
+ output += stride;
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_SIZE tx_size,
+ int32_t eob) {
+ (void)eob;
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int col_max = AOMMIN(32, txfm_size_col);
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int input_stride = row_max;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ __m256i buf[32];
+
+ for (int i = 0; i < (col_max >> 4); ++i) {
+ for (int j = 0; j < (row_max >> 4); j++) {
+ iidentity_row_16xn_avx2(buf, input + j * 16 + i * 16 * input_stride,
+ row_max, shift[0], 16, txw_idx, rect_type);
+ transpose_16bit_16x16_avx2(buf, buf);
+ iidentity_col_16xn_avx2(output + i * 16 + j * 16 * stride, stride, buf,
+ shift[1], 16, txh_idx);
+ }
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int txfm_size_row_notzero = AOMMIN(32, txfm_size_row);
+ const int input_stride = txfm_size_row_notzero;
+ const int buf_size_w_div16 = (eobx + 16) >> 4;
+ const int buf_size_h_div16 = (eoby + 16) >> 4;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_avx2 col_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_w_div16; i++) {
+ __m256i buf0[64];
+ for (int j = 0; j < buf_size_h_div16; j++) {
+ __m256i *buf0_cur = buf0 + j * 16;
+ const int32_t *input_cur = input + i * 16 * input_stride + j * 16;
+ iidentity_row_16xn_avx2(buf0_cur, input_cur, input_stride, shift[0], 16,
+ txw_idx, rect_type);
+ transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
+ }
+ col_txfm(buf0, buf0);
+ __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
+ int k = ud_flip ? (txfm_size_row - 1) : 0;
+ const int step = ud_flip ? -1 : 1;
+ for (int j = 0; j < txfm_size_row; ++j, k += step) {
+ __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
+ write_recon_w16_avx2(res, output + (i << 4) + j * stride);
+ }
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m256i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div16 = txfm_size_col >> 4;
+ const int buf_size_h_div16 = (eoby + 16) >> 4;
+ const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
+ const int input_stride = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_avx2 row_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+
+ assert(row_txfm != NULL);
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_h_div16; i++) {
+ __m256i buf0[64];
+ load_buffer_32bit_to_16bit_w16_avx2(input + i * 16, input_stride, buf0,
+ buf_size_nonzero_w);
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_avx2(buf0, buf0, buf_size_nonzero_w); // rect special code
+ }
+ row_txfm(buf0, buf0);
+ round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
+ __m256i *_buf1 = buf1;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ __m256i temp[16];
+ flip_buf_avx2(buf0 + 16 * j, temp, 16);
+ transpose_16bit_16x16_avx2(temp,
+ _buf1 + 16 * (buf_size_w_div16 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
+ }
+ }
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
+ buf1 + j * 16, shift[1], 16, txh_idx);
+ }
+ }
+}
+
+static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_8x8_arr[2][2] = {
+ { av1_idct8_low1_ssse3, av1_idct8_sse2 },
+ { av1_iadst8_low1_ssse3, av1_iadst8_sse2 }
+};
+
+static INLINE void load_buffer_avx2(const int32_t *in, int stride,
+ __m128i *out) {
+ const __m256i a = _mm256_load_si256((const __m256i *)in);
+ const __m256i b = _mm256_load_si256((const __m256i *)(in + stride * 1));
+ const __m256i c = _mm256_load_si256((const __m256i *)(in + stride * 2));
+ const __m256i d = _mm256_load_si256((const __m256i *)(in + stride * 3));
+ const __m256i e = _mm256_load_si256((const __m256i *)(in + stride * 4));
+ const __m256i f = _mm256_load_si256((const __m256i *)(in + stride * 5));
+ const __m256i g = _mm256_load_si256((const __m256i *)(in + stride * 6));
+ const __m256i h = _mm256_load_si256((const __m256i *)(in + stride * 7));
+
+ // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
+ const __m256i ab_16bit = _mm256_packs_epi32(a, b);
+ // c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7
+ const __m256i cd_16bit = _mm256_packs_epi32(c, d);
+ // e0 e1 e2 e3 f0 f1 f2 f3 e4 e5 e6 e7 f4 f5 f6 f7
+ const __m256i ef_16bit = _mm256_packs_epi32(e, f);
+ // g0 g1 g2 g3 h0 h1 h2 h3 g4 g5 g6 g7 h4 h5 h6 h7
+ const __m256i gh_16bit = _mm256_packs_epi32(g, h);
+
+ // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
+ const __m256i ab = _mm256_permute4x64_epi64(ab_16bit, 0xd8);
+ // c0 c1 c2 c3 c4 c5 c6 c7 d0 d1 d2 d3 d4 d5 d6 d7
+ const __m256i cd = _mm256_permute4x64_epi64(cd_16bit, 0xd8);
+ // e0 e1 e2 e3 e4 e5 e6 e7 f0 f1 f2 f3 f4 f5 f6 f7
+ const __m256i ef = _mm256_permute4x64_epi64(ef_16bit, 0xd8);
+ // g0 g1 g2 g3 g4 g5 g6 g7 h0 h1 h2 h3 h4 h5 h6 h7
+ const __m256i gh = _mm256_permute4x64_epi64(gh_16bit, 0xd8);
+
+ out[0] = _mm256_castsi256_si128(ab);
+ out[1] = _mm256_extractf128_si256(ab, 1);
+ out[2] = _mm256_castsi256_si128(cd);
+ out[3] = _mm256_extractf128_si256(cd, 1);
+ out[4] = _mm256_castsi256_si128(ef);
+ out[5] = _mm256_extractf128_si256(ef, 1);
+ out[6] = _mm256_castsi256_si128(gh);
+ out[7] = _mm256_extractf128_si256(gh, 1);
+}
+
+static INLINE void round_and_transpose_avx2(const __m128i *const in,
+ __m128i *const out, int bit,
+ int *lr_flip) {
+ __m256i buf_temp[4];
+ const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
+ int j = *lr_flip ? 7 : 0;
+ const int step = *lr_flip ? -1 : 1;
+
+ // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
+ buf_temp[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
+ in[j + 4 * step], 1);
+ j += step;
+ // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
+ buf_temp[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
+ in[j + 4 * step], 1);
+ j += step;
+ // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
+ buf_temp[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
+ in[j + 4 * step], 1);
+ j += step;
+ // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
+ buf_temp[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
+ in[j + 4 * step], 1);
+
+ // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
+ buf_temp[0] = _mm256_mulhrs_epi16(buf_temp[0], scale);
+ // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
+ buf_temp[1] = _mm256_mulhrs_epi16(buf_temp[1], scale);
+ // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
+ buf_temp[2] = _mm256_mulhrs_epi16(buf_temp[2], scale);
+ // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
+ buf_temp[3] = _mm256_mulhrs_epi16(buf_temp[3], scale);
+
+ // 70 60 71 61 72 62 73 63 | 30 20 31 21 32 22 33 23
+ const __m256i unpcklo0 = _mm256_unpacklo_epi16(buf_temp[0], buf_temp[1]);
+ // 74 64 75 65 76 66 77 67 | 34 24 35 25 36 26 37 27
+ const __m256i unpckhi0 = _mm256_unpackhi_epi16(buf_temp[0], buf_temp[1]);
+ // 50 40 51 41 52 42 53 43 | 10 00 11 01 12 02 13 03
+ const __m256i unpcklo1 = _mm256_unpacklo_epi16(buf_temp[2], buf_temp[3]);
+ // 54 44 55 45 56 46 57 47 | 14 04 15 05 16 06 17 07
+ const __m256i unpckhi1 = _mm256_unpackhi_epi16(buf_temp[2], buf_temp[3]);
+
+ // 70 60 50 40 71 61 51 41 | 30 20 10 00 31 21 11 01
+ const __m256i unpcklo00 = _mm256_unpacklo_epi32(unpcklo0, unpcklo1);
+ // 72 62 52 42 73 63 53 43 | 32 22 12 02 33 23 13 03
+ const __m256i unpckhi00 = _mm256_unpackhi_epi32(unpcklo0, unpcklo1);
+ // 74 64 54 44 75 65 55 45 | 34 24 14 04 35 25 15 05
+ const __m256i unpcklo01 = _mm256_unpacklo_epi32(unpckhi0, unpckhi1);
+ // 76 66 56 46 77 67 57 47 | 36 26 16 06 37 27 17 07
+ const __m256i unpckhi01 = _mm256_unpackhi_epi32(unpckhi0, unpckhi1);
+
+ // 70 60 50 40 30 20 10 00 | 71 61 51 41 31 21 11 01
+ const __m256i reg_00 = _mm256_permute4x64_epi64(unpcklo00, 0xd8);
+ // 72 62 52 42 32 22 12 02 | 73 63 53 43 33 23 13 03
+ const __m256i reg_01 = _mm256_permute4x64_epi64(unpckhi00, 0xd8);
+ // 74 64 54 44 34 24 14 04 | 75 65 55 45 35 25 15 05
+ const __m256i reg_10 = _mm256_permute4x64_epi64(unpcklo01, 0xd8);
+ // 76 66 56 46 36 26 16 06 | 77 67 57 47 37 27 17 07
+ const __m256i reg_11 = _mm256_permute4x64_epi64(unpckhi01, 0xd8);
+
+ // 70 60 50 40 30 20 10 00
+ out[0] = _mm256_castsi256_si128(reg_00);
+ // 71 61 51 41 31 21 11 01
+ out[1] = _mm256_extracti128_si256(reg_00, 1);
+ // 72 62 52 42 32 22 12 02
+ out[2] = _mm256_castsi256_si128(reg_01);
+ // 73 63 53 43 33 23 13 03
+ out[3] = _mm256_extracti128_si256(reg_01, 1);
+ // 74 64 54 44 34 24 14 04
+ out[4] = _mm256_castsi256_si128(reg_10);
+ // 75 65 55 45 35 25 15 05
+ out[5] = _mm256_extracti128_si256(reg_10, 1);
+ // 76 66 56 46 36 26 16 06
+ out[6] = _mm256_castsi256_si128(reg_11);
+ // 77 67 57 47 37 27 17 07
+ out[7] = _mm256_extracti128_si256(reg_11, 1);
+}
+
+static INLINE void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit,
+ uint8_t *output,
+ int stride, int flipud) {
+ __m256i in_256[4], v_256[4];
+ int j = flipud ? 7 : 0;
+ const int step = flipud ? -1 : 1;
+ const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
+ const __m256i zero = _mm256_setzero_si256();
+ // in[0], in[1]
+ in_256[0] =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
+ j += 2 * step;
+ // in[2], in[3]
+ in_256[1] =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
+ j += 2 * step;
+ // in[4], in[5]
+ in_256[2] =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
+ j += 2 * step;
+ // in[6], in[7]
+ in_256[3] =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
+
+ // i00 i01 i02 i03 i04 i05 i06 i07 i10 i11 i12 i13 i14 i15 i16 i17
+ in_256[0] = _mm256_mulhrs_epi16(in_256[0], scale);
+ // i20 i21 i22 i23 i24 i25 i26 i27 i30 i31 i32 i33 i34 i35 i36 i37
+ in_256[1] = _mm256_mulhrs_epi16(in_256[1], scale);
+ // i40 i41 i42 i43 i44 i45 i46 i47 i50 i51 i52 i53 i54 i55 i56 i57
+ in_256[2] = _mm256_mulhrs_epi16(in_256[2], scale);
+ // i60 i61 i62 i63 i64 i65 i66 i67 i70 i71 i72 i73 i74 i75 i76 i77
+ in_256[3] = _mm256_mulhrs_epi16(in_256[3], scale);
+
+ const __m128i v0 = _mm_loadl_epi64((__m128i const *)(output));
+ const __m128i v1 = _mm_loadl_epi64((__m128i const *)(output + stride));
+ const __m128i v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
+ const __m128i v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
+ const __m128i v4 = _mm_loadl_epi64((__m128i const *)(output + 4 * stride));
+ const __m128i v5 = _mm_loadl_epi64((__m128i const *)(output + 5 * stride));
+ const __m128i v6 = _mm_loadl_epi64((__m128i const *)(output + 6 * stride));
+ const __m128i v7 = _mm_loadl_epi64((__m128i const *)(output + 7 * stride));
+
+ v_256[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(v0), v1, 1);
+ v_256[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(v2), v3, 1);
+ v_256[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(v4), v5, 1);
+ v_256[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(v6), v7, 1);
+
+ const __m256i unpcklo0 = _mm256_unpacklo_epi8(v_256[0], zero);
+ const __m256i unpcklo1 = _mm256_unpacklo_epi8(v_256[1], zero);
+ const __m256i unpcklo2 = _mm256_unpacklo_epi8(v_256[2], zero);
+ const __m256i unpcklo3 = _mm256_unpacklo_epi8(v_256[3], zero);
+ // 00 01 10 11
+ const __m256i x0 = _mm256_adds_epi16(in_256[0], unpcklo0);
+ // 20 21 30 31
+ const __m256i x1 = _mm256_adds_epi16(in_256[1], unpcklo1);
+ // 40 41 50 51
+ const __m256i x2 = _mm256_adds_epi16(in_256[2], unpcklo2);
+ // 60 61 70 71
+ const __m256i x3 = _mm256_adds_epi16(in_256[3], unpcklo3);
+
+ // 00 01 20 21 10 11 30 31
+ const __m256i res_0123 = _mm256_packus_epi16(x0, x1);
+ // 40 41 60 61 50 51 70 71
+ const __m256i res_4567 = _mm256_packus_epi16(x2, x3);
+
+ // 00 01 20 21
+ const __m128i res_02 = _mm256_castsi256_si128(res_0123);
+ // 10 11 30 31
+ const __m128i res_13 = _mm256_extracti128_si256(res_0123, 1);
+ // 40 41 60 61
+ const __m128i res_46 = _mm256_castsi256_si128(res_4567);
+ // 50 51 70 71
+ const __m128i res_57 = _mm256_extracti128_si256(res_4567, 1);
+
+ // 00 01
+ _mm_storel_epi64((__m128i *)(output), res_02);
+ // 10 11
+ _mm_storel_epi64((__m128i *)(output + stride), res_13);
+ // 20 21
+ _mm_storel_epi64((__m128i *)(output + 2 * stride),
+ _mm_unpackhi_epi64(res_02, res_02));
+ // 30 31
+ _mm_storel_epi64((__m128i *)(output + 3 * stride),
+ _mm_unpackhi_epi64(res_13, res_13));
+ // 40 41
+ _mm_storel_epi64((__m128i *)(output + 4 * stride), res_46);
+ // 50 51
+ _mm_storel_epi64((__m128i *)(output + 5 * stride), res_57);
+ // 60 61
+ _mm_storel_epi64((__m128i *)(output + 6 * stride),
+ _mm_unpackhi_epi64(res_46, res_46));
+ // 70 71
+ _mm_storel_epi64((__m128i *)(output + 7 * stride),
+ _mm_unpackhi_epi64(res_57, res_57));
+}
+
+// AVX2 implementation has the advantage when combined multiple operations
+// together.
+static INLINE void lowbd_inv_txfm2d_8x8_no_identity_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m128i buf1[8];
+ const int input_stride = 8;
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ assert(hitx_1d_tab[tx_type] < 2);
+ assert(vitx_1d_tab[tx_type] < 2);
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_zeros_8x8_arr[hitx_1d_tab[tx_type]][eob != 1];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_zeros_8x8_arr[vitx_1d_tab[tx_type]][eob != 1];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ __m128i buf0[8];
+ __m128i *buf0_cur = buf0;
+ load_buffer_avx2(input, input_stride, buf0_cur);
+ row_txfm(buf0, buf0);
+
+ assert(shift[0] < 0);
+ __m128i *_buf1 = buf1;
+ round_and_transpose_avx2(buf0, _buf1, shift[0], &lr_flip);
+ assert(shift[1] < 0);
+ col_txfm(buf1, buf1);
+ round_shift_lowbd_write_buffer_avx2(buf1, shift[1], output, stride, ud_flip);
+}
+
+// AVX2 implementation of 8x8 inverse transform. Observed that coding AVX2 for
+// tx_type with identity in either of the direction has no advantage.
+static void lowbd_inv_txfm2d_add_8x8_avx2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ switch (tx_type) {
+ case IDTX:
+ av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
+
+ break;
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ default:
+ lowbd_inv_txfm2d_8x8_no_identity_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ }
+}
+
+// for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64
+static INLINE void lowbd_inv_txfm2d_add_universe_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)eob;
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ case ADST_ADST: // ADST in both directions
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case IDTX:
+ lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
+ break;
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ default:
+ av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ }
+}
+
+void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ switch (tx_size) {
+ case TX_4X4:
+ case TX_4X8:
+ case TX_8X4:
+ case TX_8X16:
+ case TX_16X8:
+ case TX_4X16:
+ case TX_16X4:
+ case TX_8X32:
+ case TX_32X8:
+ av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_8X8:
+ lowbd_inv_txfm2d_add_8x8_avx2(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_16X16:
+ case TX_32X32:
+ case TX_64X64:
+ case TX_16X32:
+ case TX_32X16:
+ case TX_32X64:
+ case TX_64X32:
+ case TX_16X64:
+ case TX_64X16:
+ default:
+ lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+
+void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+ const TxfmParam *txfm_param) {
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (!txfm_param->lossless) {
+ av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
+ txfm_param->tx_size, txfm_param->eob);
+ } else {
+ av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
new file mode 100644
index 0000000000..a09dea389f
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// half input is zero
+#define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \
+ do { \
+ const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
+ const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
+ const __m256i _in = in; \
+ out0 = _mm256_mulhrs_epi16(_in, _w0); \
+ out1 = _mm256_mulhrs_epi16(_in, _w1); \
+ } while (0)
+
+static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
+ int size) {
+ const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8);
+ for (int i = 0; i < size; ++i) {
+ output[i] = _mm256_mulhrs_epi16(input[i], scale);
+ }
+}
+
+static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) {
+ __m128i pred = _mm_loadu_si128((__m128i const *)(output));
+ __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res);
+ __m128i y = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168));
+ _mm_storeu_si128((__m128i *)(output), y);
+}
+
+static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output,
+ int stride, int flipud,
+ int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ write_recon_w16_avx2(in[j], output + i * stride);
+ }
+}
+
+void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
new file mode 100644
index 0000000000..79a6064c3e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -0,0 +1,2904 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
+static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+ 4 * 5793 };
+
+// TODO(binpengsmail@gmail.com): replace some for loop with do {} while
+
+static void idct4_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x[4];
+ x[0] = input[0];
+ x[1] = input[2];
+ x[2] = input[1];
+ x[3] = input[3];
+
+ // stage 2
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+
+ // stage 3
+ btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
+ btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
+}
+
+static void idct4_w4_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x[4];
+ x[0] = input[0];
+ x[1] = input[2];
+ x[2] = input[1];
+ x[3] = input[3];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+
+ // stage 3
+ btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
+ btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
+}
+
+void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m128i x[2];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 4
+ // stage 5
+ output[0] = x[0];
+ output[7] = x[0];
+ output[1] = x[1];
+ output[6] = x[1];
+ output[2] = x[1];
+ output[5] = x[1];
+ output[3] = x[0];
+ output[4] = x[0];
+}
+
+void av1_idct8_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[0] = input[0];
+ x[1] = input[4];
+ x[2] = input[2];
+ x[3] = input[6];
+ x[4] = input[1];
+ x[5] = input[5];
+ x[6] = input[3];
+ x[7] = input[7];
+
+ // stage 2
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+
+ // stage 3
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+
+ // stage 4
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+
+ // stage 5
+ btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
+ btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
+ btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
+ btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
+}
+
+static void idct8_w4_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[0] = input[0];
+ x[1] = input[4];
+ x[2] = input[2];
+ x[3] = input[6];
+ x[4] = input[1];
+ x[5] = input[5];
+ x[6] = input[3];
+ x[7] = input[7];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+
+ // stage 3
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+
+ // stage 4
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+
+ // stage 5
+ btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
+ btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
+ btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
+ btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
+}
+
+static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+}
+
+static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+}
+
+static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
+ btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
+ btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
+ btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
+ btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
+ btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
+ btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
+ btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
+ btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
+}
+
+static void idct16_low1_ssse3(const __m128i *input, __m128i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m128i x[2];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 5
+ // stage 6
+ // stage 7
+ output[0] = x[0];
+ output[15] = x[0];
+ output[1] = x[1];
+ output[14] = x[1];
+ output[2] = x[1];
+ output[13] = x[1];
+ output[3] = x[0];
+ output[12] = x[0];
+ output[4] = x[0];
+ output[11] = x[0];
+ output[5] = x[1];
+ output[10] = x[1];
+ output[6] = x[1];
+ output[9] = x[1];
+ output[7] = x[0];
+ output[8] = x[0];
+}
+
+static void idct16_low8_ssse3(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[0];
+ x[2] = input[4];
+ x[4] = input[2];
+ x[6] = input[6];
+ x[8] = input[1];
+ x[10] = input[5];
+ x[12] = input[3];
+ x[14] = input[7];
+
+ // stage 2
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+
+ // stage 3
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage7_sse2(output, x);
+}
+
+static void idct16_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[0];
+ x[1] = input[8];
+ x[2] = input[4];
+ x[3] = input[12];
+ x[4] = input[2];
+ x[5] = input[10];
+ x[6] = input[6];
+ x[7] = input[14];
+ x[8] = input[1];
+ x[9] = input[9];
+ x[10] = input[5];
+ x[11] = input[13];
+ x[12] = input[3];
+ x[13] = input[11];
+ x[14] = input[7];
+ x[15] = input[15];
+
+ // stage 2
+ btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+
+ // stage 3
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ // stage 5~7
+ idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage7_sse2(output, x);
+}
+
+static void idct16_w4_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[0];
+ x[1] = input[8];
+ x[2] = input[4];
+ x[3] = input[12];
+ x[4] = input[2];
+ x[5] = input[10];
+ x[6] = input[6];
+ x[7] = input[14];
+ x[8] = input[1];
+ x[9] = input[9];
+ x[10] = input[5];
+ x[11] = input[13];
+ x[12] = input[3];
+ x[13] = input[11];
+ x[14] = input[7];
+ x[15] = input[15];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+
+ // stage 3
+ btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ // stage 5
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+
+ // stage 6
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+
+ // stage 7
+ idct16_stage7_sse2(output, x);
+}
+
+static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
+ btf_16_adds_subs_sse2(x[16], x[17]);
+ btf_16_subs_adds_sse2(x[19], x[18]);
+ btf_16_adds_subs_sse2(x[20], x[21]);
+ btf_16_subs_adds_sse2(x[23], x[22]);
+ btf_16_adds_subs_sse2(x[24], x[25]);
+ btf_16_subs_adds_sse2(x[27], x[26]);
+ btf_16_adds_subs_sse2(x[28], x[29]);
+ btf_16_subs_adds_sse2(x[31], x[30]);
+}
+
+static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+}
+
+static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ btf_16_adds_subs_sse2(x[16], x[19]);
+ btf_16_adds_subs_sse2(x[17], x[18]);
+ btf_16_subs_adds_sse2(x[23], x[20]);
+ btf_16_subs_adds_sse2(x[22], x[21]);
+ btf_16_adds_subs_sse2(x[24], x[27]);
+ btf_16_adds_subs_sse2(x[25], x[26]);
+ btf_16_subs_adds_sse2(x[31], x[28]);
+ btf_16_subs_adds_sse2(x[30], x[29]);
+}
+
+static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+}
+
+static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ btf_16_adds_subs_sse2(x[16], x[23]);
+ btf_16_adds_subs_sse2(x[17], x[22]);
+ btf_16_adds_subs_sse2(x[18], x[21]);
+ btf_16_adds_subs_sse2(x[19], x[20]);
+ btf_16_subs_adds_sse2(x[31], x[24]);
+ btf_16_subs_adds_sse2(x[30], x[25]);
+ btf_16_subs_adds_sse2(x[29], x[26]);
+ btf_16_subs_adds_sse2(x[28], x[27]);
+}
+
+static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[15]);
+ btf_16_adds_subs_sse2(x[1], x[14]);
+ btf_16_adds_subs_sse2(x[2], x[13]);
+ btf_16_adds_subs_sse2(x[3], x[12]);
+ btf_16_adds_subs_sse2(x[4], x[11]);
+ btf_16_adds_subs_sse2(x[5], x[10]);
+ btf_16_adds_subs_sse2(x[6], x[9]);
+ btf_16_adds_subs_sse2(x[7], x[8]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+}
+
+static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
+ btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
+ btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
+ btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
+ btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
+ btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
+ btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
+ btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
+ btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
+ btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
+ btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
+ btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
+ btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
+ btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
+ btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
+ btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
+ btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
+}
+
+static void idct32_low1_ssse3(const __m128i *input, __m128i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m128i x[2];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ output[0] = x[0];
+ output[31] = x[0];
+ output[1] = x[1];
+ output[30] = x[1];
+ output[2] = x[1];
+ output[29] = x[1];
+ output[3] = x[0];
+ output[28] = x[0];
+ output[4] = x[0];
+ output[27] = x[0];
+ output[5] = x[1];
+ output[26] = x[1];
+ output[6] = x[1];
+ output[25] = x[1];
+ output[7] = x[0];
+ output[24] = x[0];
+ output[8] = x[0];
+ output[23] = x[0];
+ output[9] = x[1];
+ output[22] = x[1];
+ output[10] = x[1];
+ output[21] = x[1];
+ output[11] = x[0];
+ output[20] = x[0];
+ output[12] = x[0];
+ output[19] = x[0];
+ output[13] = x[1];
+ output[18] = x[1];
+ output[14] = x[1];
+ output[17] = x[1];
+ output[15] = x[0];
+ output[16] = x[0];
+}
+
+static void idct32_low8_ssse3(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m128i x[32];
+ x[0] = input[0];
+ x[4] = input[4];
+ x[8] = input[2];
+ x[12] = input[6];
+ x[16] = input[1];
+ x[20] = input[5];
+ x[24] = input[3];
+ x[28] = input[7];
+
+ // stage 2
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+
+ // stage 4
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+ // stage 6
+ x[3] = x[0];
+ x[2] = x[1];
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_sse2(output, x);
+}
+
+static void idct32_low16_ssse3(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m128i x[32];
+ x[0] = input[0];
+ x[2] = input[8];
+ x[4] = input[4];
+ x[6] = input[12];
+ x[8] = input[2];
+ x[10] = input[10];
+ x[12] = input[6];
+ x[14] = input[14];
+ x[16] = input[1];
+ x[18] = input[9];
+ x[20] = input[5];
+ x[22] = input[13];
+ x[24] = input[3];
+ x[26] = input[11];
+ x[28] = input[7];
+ x[30] = input[15];
+
+ // stage 2
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ idct32_high16_stage3_sse2(x);
+
+ // stage 4
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_sse2(output, x);
+}
+
+static void idct32_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x[32];
+ x[0] = input[0];
+ x[1] = input[16];
+ x[2] = input[8];
+ x[3] = input[24];
+ x[4] = input[4];
+ x[5] = input[20];
+ x[6] = input[12];
+ x[7] = input[28];
+ x[8] = input[2];
+ x[9] = input[18];
+ x[10] = input[10];
+ x[11] = input[26];
+ x[12] = input[6];
+ x[13] = input[22];
+ x[14] = input[14];
+ x[15] = input[30];
+ x[16] = input[1];
+ x[17] = input[17];
+ x[18] = input[9];
+ x[19] = input[25];
+ x[20] = input[5];
+ x[21] = input[21];
+ x[22] = input[13];
+ x[23] = input[29];
+ x[24] = input[3];
+ x[25] = input[19];
+ x[26] = input[11];
+ x[27] = input[27];
+ x[28] = input[7];
+ x[29] = input[23];
+ x[30] = input[15];
+ x[31] = input[31];
+
+ // stage 2
+ btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
+ btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+ idct32_high16_stage3_sse2(x);
+
+ // stage 4
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_adds_subs_sse2(x[7], x[6]);
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7~8
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_sse2(output, x);
+}
+
+static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+ const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+ const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+ btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
+ btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+}
+
+static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ btf_16_adds_subs_sse2(x[32], x[35]);
+ btf_16_adds_subs_sse2(x[33], x[34]);
+ btf_16_subs_adds_sse2(x[39], x[36]);
+ btf_16_subs_adds_sse2(x[38], x[37]);
+ btf_16_adds_subs_sse2(x[40], x[43]);
+ btf_16_adds_subs_sse2(x[41], x[42]);
+ btf_16_subs_adds_sse2(x[47], x[44]);
+ btf_16_subs_adds_sse2(x[46], x[45]);
+ btf_16_adds_subs_sse2(x[48], x[51]);
+ btf_16_adds_subs_sse2(x[49], x[50]);
+ btf_16_subs_adds_sse2(x[55], x[52]);
+ btf_16_subs_adds_sse2(x[54], x[53]);
+ btf_16_adds_subs_sse2(x[56], x[59]);
+ btf_16_adds_subs_sse2(x[57], x[58]);
+ btf_16_subs_adds_sse2(x[63], x[60]);
+ btf_16_subs_adds_sse2(x[62], x[61]);
+}
+
+static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
+}
+
+static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ btf_16_adds_subs_sse2(x[16], x[19]);
+ btf_16_adds_subs_sse2(x[17], x[18]);
+ btf_16_subs_adds_sse2(x[23], x[20]);
+ btf_16_subs_adds_sse2(x[22], x[21]);
+ btf_16_adds_subs_sse2(x[24], x[27]);
+ btf_16_adds_subs_sse2(x[25], x[26]);
+ btf_16_subs_adds_sse2(x[31], x[28]);
+ btf_16_subs_adds_sse2(x[30], x[29]);
+ idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
+}
+
+static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+ btf_16_adds_subs_sse2(x[32], x[39]);
+ btf_16_adds_subs_sse2(x[33], x[38]);
+ btf_16_adds_subs_sse2(x[34], x[37]);
+ btf_16_adds_subs_sse2(x[35], x[36]);
+ btf_16_subs_adds_sse2(x[47], x[40]);
+ btf_16_subs_adds_sse2(x[46], x[41]);
+ btf_16_subs_adds_sse2(x[45], x[42]);
+ btf_16_subs_adds_sse2(x[44], x[43]);
+ btf_16_adds_subs_sse2(x[48], x[55]);
+ btf_16_adds_subs_sse2(x[49], x[54]);
+ btf_16_adds_subs_sse2(x[50], x[53]);
+ btf_16_adds_subs_sse2(x[51], x[52]);
+ btf_16_subs_adds_sse2(x[63], x[56]);
+ btf_16_subs_adds_sse2(x[62], x[57]);
+ btf_16_subs_adds_sse2(x[61], x[58]);
+ btf_16_subs_adds_sse2(x[60], x[59]);
+}
+
+static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_adds_subs_sse2(x[16], x[23]);
+ btf_16_adds_subs_sse2(x[17], x[22]);
+ btf_16_adds_subs_sse2(x[18], x[21]);
+ btf_16_adds_subs_sse2(x[19], x[20]);
+ btf_16_subs_adds_sse2(x[31], x[24]);
+ btf_16_subs_adds_sse2(x[30], x[25]);
+ btf_16_subs_adds_sse2(x[29], x[26]);
+ btf_16_subs_adds_sse2(x[28], x[27]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
+}
+
+static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[15]);
+ btf_16_adds_subs_sse2(x[1], x[14]);
+ btf_16_adds_subs_sse2(x[2], x[13]);
+ btf_16_adds_subs_sse2(x[3], x[12]);
+ btf_16_adds_subs_sse2(x[4], x[11]);
+ btf_16_adds_subs_sse2(x[5], x[10]);
+ btf_16_adds_subs_sse2(x[6], x[9]);
+ btf_16_adds_subs_sse2(x[7], x[8]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+ btf_16_adds_subs_sse2(x[32], x[47]);
+ btf_16_adds_subs_sse2(x[33], x[46]);
+ btf_16_adds_subs_sse2(x[34], x[45]);
+ btf_16_adds_subs_sse2(x[35], x[44]);
+ btf_16_adds_subs_sse2(x[36], x[43]);
+ btf_16_adds_subs_sse2(x[37], x[42]);
+ btf_16_adds_subs_sse2(x[38], x[41]);
+ btf_16_adds_subs_sse2(x[39], x[40]);
+ btf_16_subs_adds_sse2(x[63], x[48]);
+ btf_16_subs_adds_sse2(x[62], x[49]);
+ btf_16_subs_adds_sse2(x[61], x[50]);
+ btf_16_subs_adds_sse2(x[60], x[51]);
+ btf_16_subs_adds_sse2(x[59], x[52]);
+ btf_16_subs_adds_sse2(x[58], x[53]);
+ btf_16_subs_adds_sse2(x[57], x[54]);
+ btf_16_subs_adds_sse2(x[56], x[55]);
+}
+
+static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[31]);
+ btf_16_adds_subs_sse2(x[1], x[30]);
+ btf_16_adds_subs_sse2(x[2], x[29]);
+ btf_16_adds_subs_sse2(x[3], x[28]);
+ btf_16_adds_subs_sse2(x[4], x[27]);
+ btf_16_adds_subs_sse2(x[5], x[26]);
+ btf_16_adds_subs_sse2(x[6], x[25]);
+ btf_16_adds_subs_sse2(x[7], x[24]);
+ btf_16_adds_subs_sse2(x[8], x[23]);
+ btf_16_adds_subs_sse2(x[9], x[22]);
+ btf_16_adds_subs_sse2(x[10], x[21]);
+ btf_16_adds_subs_sse2(x[11], x[20]);
+ btf_16_adds_subs_sse2(x[12], x[19]);
+ btf_16_adds_subs_sse2(x[13], x[18]);
+ btf_16_adds_subs_sse2(x[14], x[17]);
+ btf_16_adds_subs_sse2(x[15], x[16]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
+}
+
+static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
+ btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
+ btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
+ btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
+ btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
+ btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
+ btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
+ btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
+ btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
+ btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
+ btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
+ btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
+ btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
+ btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
+ btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
+ btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
+ btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
+ btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
+ btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
+ btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
+ btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
+ btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
+ btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
+ btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
+ btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
+ btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
+ btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
+ btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
+ btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
+ btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
+ btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
+ btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
+ btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
+}
+
+static void idct64_low1_ssse3(const __m128i *input, __m128i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m128i x[32];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 7
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ output[0] = x[0];
+ output[63] = x[0];
+ output[1] = x[1];
+ output[62] = x[1];
+ output[2] = x[1];
+ output[61] = x[1];
+ output[3] = x[0];
+ output[60] = x[0];
+ output[4] = x[0];
+ output[59] = x[0];
+ output[5] = x[1];
+ output[58] = x[1];
+ output[6] = x[1];
+ output[57] = x[1];
+ output[7] = x[0];
+ output[56] = x[0];
+ output[8] = x[0];
+ output[55] = x[0];
+ output[9] = x[1];
+ output[54] = x[1];
+ output[10] = x[1];
+ output[53] = x[1];
+ output[11] = x[0];
+ output[52] = x[0];
+ output[12] = x[0];
+ output[51] = x[0];
+ output[13] = x[1];
+ output[50] = x[1];
+ output[14] = x[1];
+ output[49] = x[1];
+ output[15] = x[0];
+ output[48] = x[0];
+ output[16] = x[0];
+ output[47] = x[0];
+ output[17] = x[1];
+ output[46] = x[1];
+ output[18] = x[1];
+ output[45] = x[1];
+ output[19] = x[0];
+ output[44] = x[0];
+ output[20] = x[0];
+ output[43] = x[0];
+ output[21] = x[1];
+ output[42] = x[1];
+ output[22] = x[1];
+ output[41] = x[1];
+ output[23] = x[0];
+ output[40] = x[0];
+ output[24] = x[0];
+ output[39] = x[0];
+ output[25] = x[1];
+ output[38] = x[1];
+ output[26] = x[1];
+ output[37] = x[1];
+ output[27] = x[0];
+ output[36] = x[0];
+ output[28] = x[0];
+ output[35] = x[0];
+ output[29] = x[1];
+ output[34] = x[1];
+ output[30] = x[1];
+ output[33] = x[1];
+ output[31] = x[0];
+ output[32] = x[0];
+}
+
+static void idct64_low8_ssse3(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[64];
+ x[0] = input[0];
+ x[8] = input[4];
+ x[16] = input[2];
+ x[24] = input[6];
+ x[32] = input[1];
+ x[40] = input[5];
+ x[48] = input[3];
+ x[56] = input[7];
+
+ // stage 2
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ x[17] = x[16];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[30] = x[31];
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+
+ // stage 5
+ x[9] = x[8];
+ x[14] = x[15];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ x[35] = x[32];
+ x[34] = x[33];
+ x[36] = x[39];
+ x[37] = x[38];
+ x[43] = x[40];
+ x[42] = x[41];
+ x[44] = x[47];
+ x[45] = x[46];
+ x[51] = x[48];
+ x[50] = x[49];
+ x[52] = x[55];
+ x[53] = x[54];
+ x[59] = x[56];
+ x[58] = x[57];
+ x[60] = x[63];
+ x[61] = x[62];
+
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ x[19] = x[16];
+ x[18] = x[17];
+ x[20] = x[23];
+ x[21] = x[22];
+ x[27] = x[24];
+ x[26] = x[25];
+ x[28] = x[31];
+ x[29] = x[30];
+ idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ x[11] = x[8];
+ x[10] = x[9];
+ x[12] = x[15];
+ x[13] = x[14];
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ x[7] = x[0];
+ x[6] = x[1];
+ x[5] = x[2];
+ x[4] = x[3];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+static void idct64_low16_ssse3(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[64];
+ x[0] = input[0];
+ x[4] = input[8];
+ x[8] = input[4];
+ x[12] = input[12];
+ x[16] = input[2];
+ x[20] = input[10];
+ x[24] = input[6];
+ x[28] = input[14];
+ x[32] = input[1];
+ x[36] = input[9];
+ x[40] = input[5];
+ x[44] = input[13];
+ x[48] = input[3];
+ x[52] = input[11];
+ x[56] = input[7];
+ x[60] = input[15];
+
+ // stage 2
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[34] = x[35];
+ x[37] = x[36];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[42] = x[43];
+ x[45] = x[44];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[50] = x[51];
+ x[53] = x[52];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[58] = x[59];
+ x[61] = x[60];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+ idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+static void idct64_low32_ssse3(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[64];
+ x[0] = input[0];
+ x[2] = input[16];
+ x[4] = input[8];
+ x[6] = input[24];
+ x[8] = input[4];
+ x[10] = input[20];
+ x[12] = input[12];
+ x[14] = input[28];
+ x[16] = input[2];
+ x[18] = input[18];
+ x[20] = input[10];
+ x[22] = input[26];
+ x[24] = input[6];
+ x[26] = input[22];
+ x[28] = input[14];
+ x[30] = input[30];
+ x[32] = input[1];
+ x[34] = input[17];
+ x[36] = input[9];
+ x[38] = input[25];
+ x[40] = input[5];
+ x[42] = input[21];
+ x[44] = input[13];
+ x[46] = input[29];
+ x[48] = input[3];
+ x[50] = input[19];
+ x[52] = input[11];
+ x[54] = input[27];
+ x[56] = input[7];
+ x[58] = input[23];
+ x[60] = input[15];
+ x[62] = input[31];
+
+ // stage 2
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
+ btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
+ btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
+ btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
+ btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
+ btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
+ btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ btf_16_adds_subs_sse2(x[32], x[33]);
+ btf_16_subs_adds_sse2(x[35], x[34]);
+ btf_16_adds_subs_sse2(x[36], x[37]);
+ btf_16_subs_adds_sse2(x[39], x[38]);
+ btf_16_adds_subs_sse2(x[40], x[41]);
+ btf_16_subs_adds_sse2(x[43], x[42]);
+ btf_16_adds_subs_sse2(x[44], x[45]);
+ btf_16_subs_adds_sse2(x[47], x[46]);
+ btf_16_adds_subs_sse2(x[48], x[49]);
+ btf_16_subs_adds_sse2(x[51], x[50]);
+ btf_16_adds_subs_sse2(x[52], x[53]);
+ btf_16_subs_adds_sse2(x[55], x[54]);
+ btf_16_adds_subs_sse2(x[56], x[57]);
+ btf_16_subs_adds_sse2(x[59], x[58]);
+ btf_16_adds_subs_sse2(x[60], x[61]);
+ btf_16_subs_adds_sse2(x[63], x[62]);
+
+ // stage 4
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ btf_16_adds_subs_sse2(x[16], x[17]);
+ btf_16_subs_adds_sse2(x[19], x[18]);
+ btf_16_adds_subs_sse2(x[20], x[21]);
+ btf_16_subs_adds_sse2(x[23], x[22]);
+ btf_16_adds_subs_sse2(x[24], x[25]);
+ btf_16_subs_adds_sse2(x[27], x[26]);
+ btf_16_adds_subs_sse2(x[28], x[29]);
+ btf_16_subs_adds_sse2(x[31], x[30]);
+ idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 9~11
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+static void iadst4_sse2(const __m128i *input, __m128i *output) {
+ const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
+ const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
+ const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
+ const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
+ const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
+ const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
+ const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
+ const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
+ __m128i x0[4];
+ x0[0] = input[0];
+ x0[1] = input[1];
+ x0[2] = input[2];
+ x0[3] = input[3];
+
+ __m128i u[4];
+ u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
+ u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
+ u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
+ u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
+
+ __m128i x1[16];
+ x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4
+ x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
+ x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1
+ x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
+ x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2
+ x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
+ x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4
+ x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
+ x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3
+ x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
+ x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03); // x2*sin3
+ x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
+ x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2
+ x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
+ x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01); // -x1*sin3 - x3*sin1
+ x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);
+
+ __m128i x2[8];
+ x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
+ x2[1] = _mm_add_epi32(x1[1], x1[5]);
+ x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
+ x2[3] = _mm_add_epi32(x1[3], x1[7]);
+ x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 -x2*sin3 +x3*sin3
+ x2[5] = _mm_add_epi32(x1[9], x1[11]);
+ x2[6] = _mm_add_epi32(x1[12], x1[14]); // x0*sin1 +x2*sin4 +x0*sin2 -x2*sin1
+ x2[7] = _mm_add_epi32(x1[13], x1[15]);
+
+ const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ for (int i = 0; i < 4; ++i) {
+ __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
+ __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
+ out0 = _mm_srai_epi32(out0, INV_COS_BIT);
+ out1 = _mm_srai_epi32(out1, INV_COS_BIT);
+ output[i] = _mm_packs_epi32(out0, out1);
+ }
+}
+
+static void iadst4_w4_sse2(const __m128i *input, __m128i *output) {
+ const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
+ const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
+ const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
+ const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
+ const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
+ const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
+ const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
+ const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
+ __m128i x0[4];
+ x0[0] = input[0];
+ x0[1] = input[1];
+ x0[2] = input[2];
+ x0[3] = input[3];
+
+ __m128i u[2];
+ u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
+ u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);
+
+ __m128i x1[8];
+ x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4
+ x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1
+ x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2
+ x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4
+ x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3
+ x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x2*sin3
+ x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2
+ x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1
+
+ __m128i x2[4];
+ x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
+ x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
+ x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3
+ x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
+
+ const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ for (int i = 0; i < 4; ++i) {
+ __m128i out0 = _mm_add_epi32(x2[i], rounding);
+ out0 = _mm_srai_epi32(out0, INV_COS_BIT);
+ output[i] = _mm_packs_epi32(out0, out0);
+ }
+}
+
+void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[1] = input[0];
+
+ // stage 2
+ btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);
+
+ // stage 3
+ x[4] = x[0];
+ x[5] = x[1];
+
+ // stage 4
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+
+ // stage 5
+ x[2] = x[0];
+ x[3] = x[1];
+ x[6] = x[4];
+ x[7] = x[5];
+
+ // stage 6
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+void av1_iadst8_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[0] = input[7];
+ x[1] = input[0];
+ x[2] = input[5];
+ x[3] = input[2];
+ x[4] = input[3];
+ x[5] = input[4];
+ x[6] = input[1];
+ x[7] = input[6];
+
+ // stage 2
+ btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
+
+ // stage 3
+ btf_16_adds_subs_sse2(x[0], x[4]);
+ btf_16_adds_subs_sse2(x[1], x[5]);
+ btf_16_adds_subs_sse2(x[2], x[6]);
+ btf_16_adds_subs_sse2(x[3], x[7]);
+
+ // stage 4
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+
+ // stage 5
+ btf_16_adds_subs_sse2(x[0], x[2]);
+ btf_16_adds_subs_sse2(x[1], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[6]);
+ btf_16_adds_subs_sse2(x[5], x[7]);
+
+ // stage 6
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+static void iadst8_w4_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[0] = input[7];
+ x[1] = input[0];
+ x[2] = input[5];
+ x[3] = input[2];
+ x[4] = input[3];
+ x[5] = input[4];
+ x[6] = input[1];
+ x[7] = input[6];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
+
+ // stage 3
+ btf_16_adds_subs_sse2(x[0], x[4]);
+ btf_16_adds_subs_sse2(x[1], x[5]);
+ btf_16_adds_subs_sse2(x[2], x[6]);
+ btf_16_adds_subs_sse2(x[3], x[7]);
+
+ // stage 4
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+
+ // stage 5
+ btf_16_adds_subs_sse2(x[0], x[2]);
+ btf_16_adds_subs_sse2(x[1], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[6]);
+ btf_16_adds_subs_sse2(x[5], x[7]);
+
+ // stage 6
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+static INLINE void iadst16_stage3_ssse3(__m128i *x) {
+ btf_16_adds_subs_sse2(x[0], x[8]);
+ btf_16_adds_subs_sse2(x[1], x[9]);
+ btf_16_adds_subs_sse2(x[2], x[10]);
+ btf_16_adds_subs_sse2(x[3], x[11]);
+ btf_16_adds_subs_sse2(x[4], x[12]);
+ btf_16_adds_subs_sse2(x[5], x[13]);
+ btf_16_adds_subs_sse2(x[6], x[14]);
+ btf_16_adds_subs_sse2(x[7], x[15]);
+}
+
+static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+ btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
+ btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
+ btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage5_ssse3(__m128i *x) {
+ btf_16_adds_subs_sse2(x[0], x[4]);
+ btf_16_adds_subs_sse2(x[1], x[5]);
+ btf_16_adds_subs_sse2(x[2], x[6]);
+ btf_16_adds_subs_sse2(x[3], x[7]);
+ btf_16_adds_subs_sse2(x[8], x[12]);
+ btf_16_adds_subs_sse2(x[9], x[13]);
+ btf_16_adds_subs_sse2(x[10], x[14]);
+ btf_16_adds_subs_sse2(x[11], x[15]);
+}
+
+static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage7_ssse3(__m128i *x) {
+ btf_16_adds_subs_sse2(x[0], x[2]);
+ btf_16_adds_subs_sse2(x[1], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[6]);
+ btf_16_adds_subs_sse2(x[5], x[7]);
+ btf_16_adds_subs_sse2(x[8], x[10]);
+ btf_16_adds_subs_sse2(x[9], x[11]);
+ btf_16_adds_subs_sse2(x[12], x[14]);
+ btf_16_adds_subs_sse2(x[13], x[15]);
+}
+
+static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
+ const __m128i __zero = _mm_setzero_si128();
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[8]);
+ output[2] = x[12];
+ output[3] = _mm_subs_epi16(__zero, x[4]);
+ output[4] = x[6];
+ output[5] = _mm_subs_epi16(__zero, x[14]);
+ output[6] = x[10];
+ output[7] = _mm_subs_epi16(__zero, x[2]);
+ output[8] = x[3];
+ output[9] = _mm_subs_epi16(__zero, x[11]);
+ output[10] = x[15];
+ output[11] = _mm_subs_epi16(__zero, x[7]);
+ output[12] = x[5];
+ output[13] = _mm_subs_epi16(__zero, x[13]);
+ output[14] = x[9];
+ output[15] = _mm_subs_epi16(__zero, x[1]);
+}
+
+static void iadst16_low1_ssse3(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+
+ // stage 1
+ __m128i x[16];
+ x[1] = input[0];
+
+ // stage 2
+ btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
+
+ // stage 3
+ x[8] = x[0];
+ x[9] = x[1];
+
+ // stage 4
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+
+ // stage 5
+ x[4] = x[0];
+ x[5] = x[1];
+ x[12] = x[8];
+ x[13] = x[9];
+
+ // stage 6
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+
+ // stage 7
+ x[2] = x[0];
+ x[3] = x[1];
+ x[6] = x[4];
+ x[7] = x[5];
+ x[10] = x[8];
+ x[11] = x[9];
+ x[14] = x[12];
+ x[15] = x[13];
+
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage9_ssse3(output, x);
+}
+
+static void iadst16_low8_ssse3(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m128i x[16];
+ x[1] = input[0];
+ x[3] = input[2];
+ x[5] = input[4];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[10] = input[5];
+ x[12] = input[3];
+ x[14] = input[1];
+
+ // stage 2
+ btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
+ btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
+ btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
+ btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
+ btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
+ btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
+ btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
+ btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);
+
+ // stage 3
+ iadst16_stage3_ssse3(x);
+ iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage5_ssse3(x);
+ iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage7_ssse3(x);
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage9_ssse3(output, x);
+}
+static void iadst16_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[15];
+ x[1] = input[0];
+ x[2] = input[13];
+ x[3] = input[2];
+ x[4] = input[11];
+ x[5] = input[4];
+ x[6] = input[9];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[9] = input[8];
+ x[10] = input[5];
+ x[11] = input[10];
+ x[12] = input[3];
+ x[13] = input[12];
+ x[14] = input[1];
+ x[15] = input[14];
+
+ // stage 2
+ btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
+ btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
+ btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
+ btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
+ btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
+
+ // stage 3~9
+ iadst16_stage3_ssse3(x);
+ iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage5_ssse3(x);
+ iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage7_ssse3(x);
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage9_ssse3(output, x);
+}
+
+static void iadst16_w4_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[15];
+ x[1] = input[0];
+ x[2] = input[13];
+ x[3] = input[2];
+ x[4] = input[11];
+ x[5] = input[4];
+ x[6] = input[9];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[9] = input[8];
+ x[10] = input[5];
+ x[11] = input[10];
+ x[12] = input[3];
+ x[13] = input[12];
+ x[14] = input[1];
+ x[15] = input[14];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
+ btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
+
+ // stage 3
+ iadst16_stage3_ssse3(x);
+
+ // stage 4
+ btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+ btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+
+ // stage 5
+ iadst16_stage5_ssse3(x);
+
+ // stage 6
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+
+ // stage 7
+ iadst16_stage7_ssse3(x);
+
+ // stage 8
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
+
+ // stage 9
+ iadst16_stage9_ssse3(output, x);
+}
+
+static void iidentity4_ssse3(const __m128i *input, __m128i *output) {
+ const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
+ const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
+ for (int i = 0; i < 4; ++i) {
+ __m128i x = _mm_mulhrs_epi16(input[i], scale);
+ output[i] = _mm_adds_epi16(x, input[i]);
+ }
+}
+
+static void iidentity8_sse2(const __m128i *input, __m128i *output) {
+ for (int i = 0; i < 8; ++i) {
+ output[i] = _mm_adds_epi16(input[i], input[i]);
+ }
+}
+
+static void iidentity16_ssse3(const __m128i *input, __m128i *output) {
+ const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
+ const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
+ for (int i = 0; i < 16; ++i) {
+ __m128i x = _mm_mulhrs_epi16(input[i], scale);
+ __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
+ output[i] = _mm_adds_epi16(x, srcx2);
+ }
+}
+
+static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
+ __m128i res) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
+ return _mm_packus_epi16(x0, x0);
+}
+
+static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
+ int stride, int flipud,
+ const int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i, j += step) {
+ const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride)));
+ __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
+ u = _mm_packus_epi16(u, zero);
+ *((int *)(output + i * stride)) = _mm_cvtsi128_si32(u);
+ }
+}
+
+static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
+ int stride, int flipud,
+ const int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
+ const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
+ _mm_storel_epi64((__m128i *)(output + i * stride), u);
+ }
+}
+
+// 1D functions process process 8 pixels at one time.
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { idct4_sse2, iadst4_sse2, iidentity4_ssse3 },
+ { av1_idct8_sse2, av1_iadst8_sse2, iidentity8_sse2 },
+ { idct16_sse2, iadst16_sse2, iidentity16_ssse3 },
+ { idct32_sse2, NULL, NULL },
+ { idct64_low32_ssse3, NULL, NULL },
+ };
+
+// functions for blocks with eob at DC and within
+// topleft 8x8, 16x16, 32x32 corner
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { idct4_sse2, idct4_sse2, NULL, NULL },
+ { iadst4_sse2, iadst4_sse2, NULL, NULL },
+ { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL },
+ },
+ { { av1_idct8_low1_ssse3, av1_idct8_sse2, NULL, NULL },
+ { av1_iadst8_low1_ssse3, av1_iadst8_sse2, NULL, NULL },
+ { iidentity8_sse2, iidentity8_sse2, NULL, NULL } },
+ {
+ { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL },
+ { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3,
+ idct32_sse2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3,
+ idct64_low32_ssse3 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+// 1D functions process process 4 pixels at one time.
+// used in 4x4, 4x8, 4x16, 8x4, 16x4
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 },
+ { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 },
+ { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 },
+ { NULL, NULL, NULL },
+ { NULL, NULL, NULL },
+ };
+
+static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
+ int stride, int shift, int height,
+ int txw_idx, int rect_type) {
+ const int32_t *input_row = input;
+ const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
+ const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+ (1 << (NewSqrt2Bits - shift - 1)));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
+ if (rect_type != 1 && rect_type != -1) {
+ for (int i = 0; i < height; ++i) {
+ const __m128i src = load_32bit_to_16bit(input_row);
+ input_row += stride;
+ __m128i lo = _mm_unpacklo_epi16(src, one);
+ __m128i hi = _mm_unpackhi_epi16(src, one);
+ lo = _mm_madd_epi16(lo, scale_rounding);
+ hi = _mm_madd_epi16(hi, scale_rounding);
+ lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm_packs_epi32(lo, hi);
+ }
+ } else {
+ const __m128i rect_scale =
+ _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
+ for (int i = 0; i < height; ++i) {
+ __m128i src = load_32bit_to_16bit(input_row);
+ src = _mm_mulhrs_epi16(src, rect_scale);
+ input_row += stride;
+ __m128i lo = _mm_unpacklo_epi16(src, one);
+ __m128i hi = _mm_unpackhi_epi16(src, one);
+ lo = _mm_madd_epi16(lo, scale_rounding);
+ hi = _mm_madd_epi16(hi, scale_rounding);
+ lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm_packs_epi32(lo, hi);
+ }
+ }
+}
+
+static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
+ __m128i *buf, int shift, int height,
+ int txh_idx) {
+ const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
+ const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
+ const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
+ const __m128i zero = _mm_setzero_si128();
+ for (int h = 0; h < height; ++h) {
+ __m128i lo = _mm_unpacklo_epi16(buf[h], one);
+ __m128i hi = _mm_unpackhi_epi16(buf[h], one);
+ lo = _mm_madd_epi16(lo, scale_coeff);
+ hi = _mm_madd_epi16(hi, scale_coeff);
+ lo = _mm_srai_epi32(lo, NewSqrt2Bits);
+ hi = _mm_srai_epi32(hi, NewSqrt2Bits);
+ lo = _mm_add_epi32(lo, shift_rounding);
+ hi = _mm_add_epi32(hi, shift_rounding);
+ lo = _mm_srai_epi32(lo, -shift);
+ hi = _mm_srai_epi32(hi, -shift);
+ __m128i x = _mm_packs_epi32(lo, hi);
+
+ const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
+ x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
+ const __m128i u = _mm_packus_epi16(x, x);
+ _mm_storel_epi64((__m128i *)(output), u);
+ output += stride;
+ }
+}
+
+void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_SIZE tx_size) {
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int col_max = AOMMIN(32, txfm_size_col);
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int input_stride = row_max;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ for (int i = 0; i < (col_max >> 3); ++i) {
+ for (int j = 0; j < (row_max >> 3); j++) {
+ __m128i buf[8];
+ iidentity_row_8xn_ssse3(buf, input + j * 8 + i * 8 * input_stride,
+ row_max, shift[0], 8, txw_idx, rect_type);
+ transpose_16bit_8x8(buf, buf);
+ iidentity_col_8xn_ssse3(output + i * 8 + j * 8 * stride, stride, buf,
+ shift[1], 8, txh_idx);
+ }
+ }
+}
+
+static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[4];
+ const TX_SIZE tx_size = TX_4X4;
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
+ row_txfm(buf, buf);
+ if (lr_flip) {
+ __m128i temp[4];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_4x4(temp, buf);
+ } else {
+ transpose_16bit_4x4(buf, buf);
+ }
+ col_txfm(buf, buf);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
+ __m128i res0, __m128i res1) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i x0 = _mm_unpacklo_epi8(pred, zero);
+ __m128i x1 = _mm_unpackhi_epi8(pred, zero);
+ x0 = _mm_adds_epi16(res0, x0);
+ x1 = _mm_adds_epi16(res1, x1);
+ return _mm_packus_epi16(x0, x1);
+}
+
+static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
+ int stride, int flipud,
+ int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
+ __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
+ _mm_storeu_si128((__m128i *)(output + i * stride), u);
+ }
+}
+
+static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
+ int size) {
+ const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
+ for (int i = 0; i < size; ++i) {
+ output[i] = _mm_mulhrs_epi16(input[i], scale);
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m128i buf1[64 * 8];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ __m128i buf0[64];
+ load_buffer_32bit_to_16bit(input + 8 * i, input_stride, buf0,
+ buf_size_nonzero_w);
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_ssse3(buf0, buf0, buf_size_nonzero_w); // rect special code
+ }
+ row_txfm(buf0, buf0);
+ round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
+ __m128i *_buf1 = buf1 + i * 8;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ __m128i temp[8];
+ flip_buf_sse2(buf0 + 8 * j, temp, 8);
+ transpose_16bit_8x8(temp,
+ _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
+ }
+ }
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row);
+ round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
+ }
+
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
+void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+ assert(fun_idx < 5);
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+
+ assert(col_txfm != NULL);
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ __m128i buf0[64];
+ for (int j = 0; j < buf_size_h_div8; j++) {
+ __m128i *buf0_cur = buf0 + j * 8;
+ const int32_t *input_cur = input + i * 8 * input_stride + j * 8;
+ iidentity_row_8xn_ssse3(buf0_cur, input_cur, input_stride, shift[0], 8,
+ txw_idx, rect_type);
+ transpose_16bit_8x8(buf0_cur, buf0_cur);
+ }
+ col_txfm(buf0, buf0);
+ __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
+ int k = ud_flip ? (txfm_size_row - 1) : 0;
+ const int step = ud_flip ? -1 : 1;
+ uint8_t *out = output + 8 * i;
+ for (int j = 0; j < txfm_size_row; ++j, k += step) {
+ const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
+ __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
+ const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
+ _mm_storel_epi64((__m128i *)(out), u);
+ out += stride;
+ }
+ }
+}
+
+void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
+ const int buf_size_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_h_div8; i++) {
+ __m128i buf0[64];
+ load_buffer_32bit_to_16bit(input + i * 8, input_stride, buf0,
+ buf_size_nonzero_w);
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_ssse3(buf0, buf0, buf_size_nonzero_w); // rect special code
+ }
+ row_txfm(buf0, buf0);
+ round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
+ __m128i *_buf1 = buf1;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ __m128i temp[8];
+ flip_buf_sse2(buf0 + 8 * j, temp, 8);
+ transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
+ }
+ }
+
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
+ buf1 + j * 8, shift[1], 8, txh_idx);
+ }
+ }
+}
+
+// for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
+static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ switch (tx_type) {
+ case DCT_DCT:
+ lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case IDTX:
+ av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
+ break;
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ default:
+ lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+
+static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_4X8;
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit(input, txfm_size_row, buf, txfm_size_col);
+ round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
+ row_txfm(buf, buf);
+ // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
+ if (lr_flip) {
+ __m128i temp[4];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_8x4(temp, buf);
+ } else {
+ transpose_16bit_8x4(buf, buf);
+ }
+ col_txfm(buf, buf);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_8X4;
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
+ round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
+ row_txfm(buf, buf);
+ // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
+ if (lr_flip) {
+ __m128i temp[8];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_4x8(temp, buf);
+ } else {
+ transpose_16bit_4x8(buf, buf);
+ }
+ col_txfm(buf, buf);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[16];
+ const TX_SIZE tx_size = TX_4X16;
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ const int row_one_loop = 8;
+ for (int i = 0; i < 2; ++i) {
+ const int32_t *input_cur = input + i * row_one_loop;
+ __m128i *buf_cur = buf + i * row_one_loop;
+ load_buffer_32bit_to_16bit(input_cur, txfm_size_row, buf_cur,
+ txfm_size_col);
+ if (row_txfm == iidentity4_ssse3) {
+ const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1));
+ const __m128i ones = _mm_set1_epi16(1);
+ for (int j = 0; j < 4; ++j) {
+ const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones);
+ const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones);
+ const __m128i buf_32_lo =
+ _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
+ const __m128i buf_32_hi =
+ _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
+ buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
+ }
+ } else {
+ row_txfm(buf_cur, buf_cur);
+ round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
+ }
+ if (lr_flip) {
+ __m128i temp[8];
+ flip_buf_sse2(buf_cur, temp, txfm_size_col);
+ transpose_16bit_8x4(temp, buf_cur);
+ } else {
+ transpose_16bit_8x4(buf_cur, buf_cur);
+ }
+ }
+ col_txfm(buf, buf);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[16];
+ const TX_SIZE tx_size = TX_16X4;
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int row_one_loop = 8;
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
+ if (row_txfm == iidentity16_ssse3) {
+ const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1));
+ const __m128i ones = _mm_set1_epi16(1);
+ for (int j = 0; j < 16; ++j) {
+ const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones);
+ const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones);
+ const __m128i buf_32_lo =
+ _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
+ const __m128i buf_32_hi =
+ _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
+ buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
+ }
+ } else {
+ row_txfm(buf, buf);
+ round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
+ }
+ if (lr_flip) {
+ __m128i temp[16];
+ flip_buf_sse2(buf, temp, 16);
+ transpose_16bit_4x8(temp, buf);
+ transpose_16bit_4x8(temp + 8, buf + 8);
+ } else {
+ transpose_16bit_4x8(buf, buf);
+ transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf + i * row_one_loop, buf + i * row_one_loop);
+ round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
+ }
+ lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
+ lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
+}
+
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ switch (tx_size) {
+ case TX_4X4:
+ lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_4X8:
+ lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_8X4:
+ lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_4X16:
+ lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_16X4:
+ lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ default:
+ lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+ const TxfmParam *txfm_param) {
+ if (!txfm_param->lossless) {
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
+ txfm_param->tx_size, txfm_param->eob);
+
+ } else {
+ av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
new file mode 100644
index 0000000000..1873d01bc0
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h> // SSSE3
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define btf_16_ssse3(w0, w1, in, out0, out1) \
+ do { \
+ const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
+ const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
+ const __m128i _in = in; \
+ out0 = _mm_mulhrs_epi16(_in, _w0); \
+ out1 = _mm_mulhrs_epi16(_in, _w1); \
+ } while (0)
+
+#define btf_16_adds_subs_sse2(in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ in0 = _mm_adds_epi16(_in0, _in1); \
+ in1 = _mm_subs_epi16(_in0, _in1); \
+ } while (0)
+
+#define btf_16_subs_adds_sse2(in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ in1 = _mm_subs_epi16(_in0, _in1); \
+ in0 = _mm_adds_epi16(_in0, _in1); \
+ } while (0)
+
+#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ out0 = _mm_adds_epi16(_in0, _in1); \
+ out1 = _mm_subs_epi16(_in0, _in1); \
+ } while (0)
+
+static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
+ if (bit < 0) {
+ const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_mulhrs_epi16(in[i], scale);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_slli_epi16(in[i], bit);
+ }
+ }
+}
+
+// 1D itx types
+enum {
+ IDCT_1D,
+ IADST_1D,
+ IFLIPADST_1D = IADST_1D,
+ IIDENTITY_1D,
+ ITX_TYPES_1D,
+} UENUM1BYTE(ITX_TYPE_1D);
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
+ IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
+ IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
+ IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
+ IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+ IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+ IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x16_default[16]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x32_default[32]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+ 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x32_default[32]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x16_default[16]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+ 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+ 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t *,
+ av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+ NULL,
+ av1_eob_to_eobxy_8x8_default,
+ av1_eob_to_eobxy_16x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x16_default,
+ av1_eob_to_eobxy_16x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x32_default,
+ av1_eob_to_eobxy_32x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+};
+
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// Transform block width in log2 for eob (size of 64 map to 32)
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ if (eob == 1) {
+ *eobx = 0;
+ *eoby = 0;
+ return;
+ }
+
+ const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+ const int eob_row = (eob - 1) >> tx_w_log2;
+ const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
+ *eobx = eobxy & 0xFF;
+ *eoby = eobxy >> 8;
+}
+
+static int eob_fill[32] = {
+ 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ eob -= 1;
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
+ *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
+ const int temp_eoby = eob / (eobx_max + 1);
+ assert(temp_eoby < 32);
+ *eoby = eob_fill[temp_eoby];
+}
+
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ eob -= 1;
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
+ *eobx = eob_fill[eob / (eoby_max + 1)];
+ *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
+}
+
+typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output);
+
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob);
+
+void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_SIZE tx_size);
+
+void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob);
+void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob);
+
+void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output);
+
+void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
new file mode 100644
index 0000000000..129721cf05
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void btf_16_w4_sse2(
+ const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
+ const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
+ __m128i *const out0, __m128i *const out1) {
+ const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
+ const __m128i u0 = _mm_madd_epi16(t0, *w0);
+ const __m128i v0 = _mm_madd_epi16(t0, *w1);
+ const __m128i a0 = _mm_add_epi32(u0, __rounding);
+ const __m128i b0 = _mm_add_epi32(v0, __rounding);
+ const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
+ const __m128i d0 = _mm_srai_epi32(b0, cos_bit);
+
+ *out0 = _mm_packs_epi32(c0, c0);
+ *out1 = _mm_packs_epi32(d0, c0);
+}
+
+#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
+ do { \
+ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \
+ __m128i u0 = _mm_madd_epi16(t0, w0); \
+ __m128i v0 = _mm_madd_epi16(t0, w1); \
+ \
+ __m128i a0 = _mm_add_epi32(u0, __rounding); \
+ __m128i b0 = _mm_add_epi32(v0, __rounding); \
+ \
+ __m128i c0 = _mm_srai_epi32(a0, cos_bit); \
+ __m128i d0 = _mm_srai_epi32(b0, cos_bit); \
+ \
+ out0 = _mm_packs_epi32(c0, c0); \
+ out1 = _mm_packs_epi32(d0, d0); \
+ } while (0)
+
+#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
+ do { \
+ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \
+ __m128i t1 = _mm_unpackhi_epi16(in0, in1); \
+ __m128i u0 = _mm_madd_epi16(t0, w0); \
+ __m128i u1 = _mm_madd_epi16(t1, w0); \
+ __m128i v0 = _mm_madd_epi16(t0, w1); \
+ __m128i v1 = _mm_madd_epi16(t1, w1); \
+ \
+ __m128i a0 = _mm_add_epi32(u0, __rounding); \
+ __m128i a1 = _mm_add_epi32(u1, __rounding); \
+ __m128i b0 = _mm_add_epi32(v0, __rounding); \
+ __m128i b1 = _mm_add_epi32(v1, __rounding); \
+ \
+ __m128i c0 = _mm_srai_epi32(a0, cos_bit); \
+ __m128i c1 = _mm_srai_epi32(a1, cos_bit); \
+ __m128i d0 = _mm_srai_epi32(b0, cos_bit); \
+ __m128i d1 = _mm_srai_epi32(b1, cos_bit); \
+ \
+ out0 = _mm_packs_epi32(c0, c1); \
+ out1 = _mm_packs_epi32(d0, d1); \
+ } while (0)
+
+static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
+ return _mm_load_si128((const __m128i *)a);
+}
+
+static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
+ const __m128i a_low = _mm_load_si128((const __m128i *)a);
+ return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
+}
+
+static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
+ const __m128i a_low = _mm_load_si128((const __m128i *)a);
+ return _mm_packs_epi32(a_low, a_low);
+}
+
+// Store 4 16 bit values. Sign extend the values.
+static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
+ const __m128i a_lo = _mm_unpacklo_epi16(a, a);
+ const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
+ _mm_store_si128((__m128i *)b, a_1);
+}
+
+// Store 8 16 bit values. Sign extend the values.
+static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
+ const __m128i a_lo = _mm_unpacklo_epi16(a, a);
+ const __m128i a_hi = _mm_unpackhi_epi16(a, a);
+ const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
+ const __m128i a_2 = _mm_srai_epi32(a_hi, 16);
+ _mm_store_si128((__m128i *)b, a_1);
+ _mm_store_si128((__m128i *)(b + 4), a_2);
+}
+
+static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) {
+ const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
+ const __m128i b = _mm_madd_epi16(a, scale_rounding);
+ return _mm_srai_epi32(b, NewSqrt2Bits);
+}
+
+static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a,
+ int32_t *const b) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a_lo = _mm_unpacklo_epi16(a, one);
+ const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+ _mm_store_si128((__m128i *)b, b_lo);
+}
+
+static INLINE void store_rect_16bit_to_32bit(const __m128i a,
+ int32_t *const b) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a_lo = _mm_unpacklo_epi16(a, one);
+ const __m128i a_hi = _mm_unpackhi_epi16(a, one);
+ const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+ const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
+ _mm_store_si128((__m128i *)b, b_lo);
+ _mm_store_si128((__m128i *)(b + 4), b_hi);
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
+ const int stride,
+ __m128i *const out,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
+ const int stride,
+ __m128i *const out,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_16bit_to_16bit(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
+ int stride, __m128i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_32bit_to_16bit(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_32bit_to_16bit_w4(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
+ int stride, __m128i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride);
+ }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_16bit_to_32bit_w4(in[i], out + i * stride);
+ }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_16bit_to_32bit(in[i], out + i * stride);
+ }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_rect_16bit_to_32bit_w4(in[i], out + i * stride);
+ }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_rect_16bit_to_32bit(in[i], out + i * stride);
+ }
+}
+
+static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
+ uint16_t *out,
+ const int stride) {
+ for (int i = 0; i < 8; ++i) {
+ _mm_store_si128((__m128i *)(out + i * stride), in[i]);
+ }
+}
+
+static INLINE void round_shift_16bit(__m128i *in, int size, int bit) {
+ if (bit < 0) {
+ bit = -bit;
+ __m128i rounding = _mm_set1_epi16(1 << (bit - 1));
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_adds_epi16(in[i], rounding);
+ in[i] = _mm_srai_epi16(in[i], bit);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_slli_epi16(in[i], bit);
+ }
+ }
+}
+
+static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+
+void av1_iadst8_sse2(const __m128i *input, __m128i *output);
+
+void av1_idct8_sse2(const __m128i *input, __m128i *output);
+
+typedef struct {
+ transform_1d_sse2 col, row; // vertical and horizontal
+} transform_2d_sse2;
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
new file mode 100644
index 0000000000..1894efdc10
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+// This function assumes `arr` is 16-byte aligned.
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) {
+ __m128i *const vec = (__m128i *)arr;
+ const int vec_size = size >> 2;
+ av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit);
+}
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
new file mode 100644
index 0000000000..387dfd6bb3
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
+
+#include <smmintrin.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) {
+ __m128i tmp, round;
+ round = _mm_set1_epi32(1 << (bit - 1));
+ tmp = _mm_add_epi32(vec, round);
+ return _mm_srai_epi32(tmp, bit);
+}
+
+static INLINE void av1_round_shift_array_32_sse4_1(const __m128i *input,
+ __m128i *output,
+ const int size,
+ const int bit) {
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = av1_round_shift_32_sse4_1(input[i], bit);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm_slli_epi32(input[i], -bit);
+ }
+ }
+}
+
+static INLINE void av1_round_shift_rect_array_32_sse4_1(const __m128i *input,
+ __m128i *output,
+ const int size,
+ const int bit,
+ const int val) {
+ const __m128i sqrt2 = _mm_set1_epi32(val);
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit);
+ const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
+ output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ const __m128i r0 = _mm_slli_epi32(input[i], -bit);
+ const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
+ output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
+ }
+ }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/cdef_block_avx2.c b/third_party/aom/av1/common/x86/cdef_block_avx2.c
new file mode 100644
index 0000000000..1ec4b6c332
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cdef_block_avx2.c
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_avx2
+#include "av1/common/cdef_block_simd.h"
+
+// Mask used to shuffle the elements present in 256bit register.
+const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504,
+ 0x0f0e0100, 0x0b0a0d0c, 0x07060908,
+ 0x03020504, 0x0f0e0100 };
+
+/* partial A is a 16-bit vector of the form:
+[x8 - - x1 | x16 - - x9] and partial B has the form:
+[0 y1 - y7 | 0 y9 - y15].
+This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
+(x7^2+y2^7)*C7 + (x8^2+0^2)*C8 on each 128-bit lane. Here the C1..C8 constants
+are in const1 and const2. */
+static INLINE __m256i fold_mul_and_sum_avx2(__m256i *partiala,
+ __m256i *partialb,
+ const __m256i *const1,
+ const __m256i *const2) {
+ __m256i tmp;
+ /* Reverse partial B. */
+ *partialb = _mm256_shuffle_epi8(
+ *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit));
+
+ /* Interleave the x and y values of identical indices and pair x8 with 0. */
+ tmp = *partiala;
+ *partiala = _mm256_unpacklo_epi16(*partiala, *partialb);
+ *partialb = _mm256_unpackhi_epi16(tmp, *partialb);
+
+ /* Square and add the corresponding x and y values. */
+ *partiala = _mm256_madd_epi16(*partiala, *partiala);
+ *partialb = _mm256_madd_epi16(*partialb, *partialb);
+ /* Multiply by constant. */
+ *partiala = _mm256_mullo_epi32(*partiala, *const1);
+ *partialb = _mm256_mullo_epi32(*partialb, *const2);
+ /* Sum all results. */
+ *partiala = _mm256_add_epi32(*partiala, *partialb);
+ return *partiala;
+}
+
+static INLINE __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2,
+ __m256i *x3) {
+ const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1);
+ const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3);
+ const __m256i t2 = _mm256_unpackhi_epi32(*x0, *x1);
+ const __m256i t3 = _mm256_unpackhi_epi32(*x2, *x3);
+
+ *x0 = _mm256_unpacklo_epi64(t0, t1);
+ *x1 = _mm256_unpackhi_epi64(t0, t1);
+ *x2 = _mm256_unpacklo_epi64(t2, t3);
+ *x3 = _mm256_unpackhi_epi64(t2, t3);
+ return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1),
+ _mm256_add_epi32(*x2, *x3));
+}
+
+/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
+to compute the remaining directions. */
+static INLINE __m256i compute_directions_avx2(__m256i *lines,
+ int32_t cost_frist_8x8[4],
+ int32_t cost_second_8x8[4]) {
+ __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
+ __m256i partial6;
+ __m256i tmp;
+ /* Partial sums for lines 0 and 1. */
+ partial4a = _mm256_slli_si256(lines[0], 14);
+ partial4b = _mm256_srli_si256(lines[0], 2);
+ partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12));
+ partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4));
+ tmp = _mm256_add_epi16(lines[0], lines[1]);
+ partial5a = _mm256_slli_si256(tmp, 10);
+ partial5b = _mm256_srli_si256(tmp, 6);
+ partial7a = _mm256_slli_si256(tmp, 4);
+ partial7b = _mm256_srli_si256(tmp, 12);
+ partial6 = tmp;
+
+ /* Partial sums for lines 2 and 3. */
+ partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10));
+ partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6));
+ partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8));
+ partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8));
+ tmp = _mm256_add_epi16(lines[2], lines[3]);
+ partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8));
+ partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8));
+ partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6));
+ partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10));
+ partial6 = _mm256_add_epi16(partial6, tmp);
+
+ /* Partial sums for lines 4 and 5. */
+ partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6));
+ partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10));
+ partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4));
+ partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12));
+ tmp = _mm256_add_epi16(lines[4], lines[5]);
+ partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6));
+ partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10));
+ partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8));
+ partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8));
+ partial6 = _mm256_add_epi16(partial6, tmp);
+
+ /* Partial sums for lines 6 and 7. */
+ partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2));
+ partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14));
+ partial4a = _mm256_add_epi16(partial4a, lines[7]);
+ tmp = _mm256_add_epi16(lines[6], lines[7]);
+ partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4));
+ partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12));
+ partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10));
+ partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6));
+ partial6 = _mm256_add_epi16(partial6, tmp);
+
+ const __m256i const_reg_1 =
+ _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840);
+ const __m256i const_reg_2 =
+ _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168);
+ const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0);
+ const __m256i const_reg_4 =
+ _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140);
+
+ /* Compute costs in terms of partial sums. */
+ partial4a =
+ fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2);
+ partial7a =
+ fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4);
+ partial5a =
+ fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4);
+ partial6 = _mm256_madd_epi16(partial6, partial6);
+ partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105));
+
+ partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a);
+ _mm_storeu_si128((__m128i *)cost_frist_8x8,
+ _mm256_castsi256_si128(partial4a));
+ _mm_storeu_si128((__m128i *)cost_second_8x8,
+ _mm256_extractf128_si256(partial4a, 1));
+
+ return partial4a;
+}
+
+/* transpose and reverse the order of the lines -- equivalent to a 90-degree
+counter-clockwise rotation of the pixels. */
+static INLINE void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) {
+ const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]);
+ const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]);
+ const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+ const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+ const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]);
+ const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]);
+ const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+ const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+ const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+ const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+ const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+ const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+ const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+ const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+
+ res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1);
+ res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1);
+ res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3);
+ res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3);
+ res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5);
+ res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5);
+ res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7);
+ res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2,
+ int stride, int32_t *var_out_1st,
+ int32_t *var_out_2nd, int coeff_shift,
+ int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+ int32_t cost_first_8x8[8];
+ int32_t cost_second_8x8[8];
+ // Used to store the best cost for 2 8x8's.
+ int32_t best_cost[2] = { 0 };
+ // Best direction for 2 8x8's.
+ int best_dir[2] = { 0 };
+
+ const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift);
+ const __m256i const_128_reg = _mm256_set1_epi16(128);
+ __m256i lines[8];
+ for (int i = 0; i < 8; i++) {
+ const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]);
+ const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]);
+
+ lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1);
+ lines[i] = _mm256_sub_epi16(
+ _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg);
+ }
+
+ /* Compute "mostly vertical" directions. */
+ const __m256i dir47 =
+ compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4);
+
+ /* Transpose and reverse the order of the lines. */
+ array_reverse_transpose_8x8_avx2(lines, lines);
+
+ /* Compute "mostly horizontal" directions. */
+ const __m256i dir03 =
+ compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8);
+
+ __m256i max = _mm256_max_epi32(dir03, dir47);
+ max =
+ _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8),
+ _mm256_slli_si256(max, 16 - (8))));
+ max =
+ _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4),
+ _mm256_slli_si256(max, 16 - (4))));
+
+ const __m128i first_8x8_output = _mm256_castsi256_si128(max);
+ const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1);
+ const __m128i cmpeg_res_00 =
+ _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47));
+ const __m128i cmpeg_res_01 =
+ _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03));
+ const __m128i cmpeg_res_10 =
+ _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1));
+ const __m128i cmpeg_res_11 =
+ _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1));
+ const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00);
+ const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10);
+
+ best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max));
+ best_cost[1] = _mm_cvtsi128_si32(second_8x8_output);
+ best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8));
+ best_dir[0] =
+ get_msb(best_dir[0] ^ (best_dir[0] - 1)); // Count trailing zeros
+ best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8));
+ best_dir[1] =
+ get_msb(best_dir[1] ^ (best_dir[1] - 1)); // Count trailing zeros
+
+ /* Difference between the optimal variance and the variance along the
+ orthogonal direction. Again, the sum(x^2) terms cancel out. */
+ *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7];
+ *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7];
+
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var_out_1st >>= 10;
+ *var_out_2nd >>= 10;
+ *out_dir_1st_8x8 = best_dir[0];
+ *out_dir_2nd_8x8 = best_dir[1];
+}
+
+void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride,
+ const uint8_t *src, int sstride,
+ int width, int height) {
+ int j = 0;
+ int remaining_width = width;
+ assert(height % 2 == 0);
+ assert(height > 0);
+ assert(width > 0);
+
+ // Process multiple 32 pixels at a time.
+ if (remaining_width > 31) {
+ int i = 0;
+ do {
+ j = 0;
+ do {
+ __m128i row00 =
+ _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]);
+ __m128i row01 = _mm_loadu_si128(
+ (const __m128i *)&src[(i + 0) * sstride + (j + 16)]);
+ __m128i row10 =
+ _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]);
+ __m128i row11 = _mm_loadu_si128(
+ (const __m128i *)&src[(i + 1) * sstride + (j + 16)]);
+ _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)],
+ _mm256_cvtepu8_epi16(row00));
+ _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)],
+ _mm256_cvtepu8_epi16(row01));
+ _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)],
+ _mm256_cvtepu8_epi16(row10));
+ _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)],
+ _mm256_cvtepu8_epi16(row11));
+ j += 32;
+ } while (j <= width - 32);
+ i += 2;
+ } while (i < height);
+ remaining_width = width & 31;
+ }
+
+ // Process 16 pixels at a time.
+ if (remaining_width > 15) {
+ int i = 0;
+ do {
+ __m128i row0 =
+ _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
+ __m128i row1 =
+ _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
+ _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
+ _mm256_cvtepu8_epi16(row0));
+ _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
+ _mm256_cvtepu8_epi16(row1));
+ i += 2;
+ } while (i < height);
+ remaining_width = width & 15;
+ j += 16;
+ }
+
+ // Process 8 pixels at a time.
+ if (remaining_width > 7) {
+ int i = 0;
+ do {
+ __m128i row0 =
+ _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
+ __m128i row1 =
+ _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
+ _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
+ _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
+ _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
+ _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
+ i += 2;
+ } while (i < height);
+ remaining_width = width & 7;
+ j += 8;
+ }
+
+ // Process 4 pixels at a time.
+ if (remaining_width > 3) {
+ int i = 0;
+ do {
+ __m128i row0 =
+ _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j]));
+ __m128i row1 =
+ _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j]));
+ _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
+ _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
+ _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
+ _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
+ i += 2;
+ } while (i < height);
+ remaining_width = width & 3;
+ j += 4;
+ }
+
+ // Process the remaining pixels.
+ if (remaining_width) {
+ for (int i = 0; i < height; i++) {
+ for (int k = j; k < width; k++) {
+ dst[i * dstride + k] = src[i * sstride + k];
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/cdef_block_sse2.c b/third_party/aom/av1/common/x86/cdef_block_sse2.c
new file mode 100644
index 0000000000..5ab7ffa2ff
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cdef_block_sse2.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse2
+#include "av1/common/cdef_block_simd.h"
+
+void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2,
+ int stride, int32_t *var_out_1st,
+ int32_t *var_out_2nd, int coeff_shift,
+ int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+ // Process first 8x8.
+ *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
+
+ // Process second 8x8.
+ *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
+}
+
+void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride,
+ const uint8_t *src, int sstride,
+ int width, int height) {
+ int j = 0;
+ for (int i = 0; i < height; i++) {
+ for (j = 0; j < (width & ~0x7); j += 8) {
+ v64 row = v64_load_unaligned(&src[i * sstride + j]);
+ v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+ }
+ for (; j < width; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/cdef_block_sse4.c b/third_party/aom/av1/common/x86/cdef_block_sse4.c
new file mode 100644
index 0000000000..344c1e47c9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cdef_block_sse4.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse4_1
+#include "av1/common/cdef_block_simd.h"
+
+void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2,
+ int stride, int32_t *var_out_1st,
+ int32_t *var_out_2nd, int coeff_shift,
+ int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+ // Process first 8x8.
+ *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
+
+ // Process second 8x8.
+ *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
+}
+
+void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride,
+ const uint8_t *src, int sstride,
+ int width, int height) {
+ int j = 0;
+ for (int i = 0; i < height; i++) {
+ for (j = 0; j < (width & ~0x7); j += 8) {
+ v64 row = v64_load_unaligned(&src[i * sstride + j]);
+ v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+ }
+ for (; j < width; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/cdef_block_ssse3.c b/third_party/aom/av1/common/x86/cdef_block_ssse3.c
new file mode 100644
index 0000000000..0fb36eb6e0
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cdef_block_ssse3.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_ssse3
+#include "av1/common/cdef_block_simd.h"
+
+void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2,
+ int stride, int32_t *var_out_1st,
+ int32_t *var_out_2nd, int coeff_shift,
+ int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+ // Process first 8x8.
+ *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
+
+ // Process second 8x8.
+ *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
+}
+
+void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride,
+ const uint8_t *src, int sstride,
+ int width, int height) {
+ int j;
+ for (int i = 0; i < height; i++) {
+ for (j = 0; j < (width & ~0x7); j += 8) {
+ v64 row = v64_load_unaligned(&src[i * sstride + j]);
+ v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+ }
+ for (; j < width; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/cfl_avx2.c b/third_party/aom/av1/common/x86/cfl_avx2.c
new file mode 100644
index 0000000000..e1e187c4a6
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_avx2.c
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \
+ cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \
+ TX_SIZE tx_size) { \
+ static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
+ cfl_subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \
+ cfl_subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \
+ cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \
+ cfl_subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \
+ NULL, /* 64x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \
+ cfl_subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \
+ cfl_subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \
+ cfl_subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \
+ cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \
+ cfl_subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \
+ NULL, /* 32x64 (invalid CFL size) */ \
+ NULL, /* 64x32 (invalid CFL size) */ \
+ cfl_subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \
+ cfl_subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \
+ cfl_subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \
+ cfl_subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \
+ NULL, /* 16x64 (invalid CFL size) */ \
+ NULL, /* 64x16 (invalid CFL size) */ \
+ }; \
+ return subfn_##sub[tx_size]; \
+ }
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
+ */
+static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const __m256i twos = _mm256_set1_epi8(2); // Thirty two twos
+ const int luma_stride = input_stride << 1;
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
+
+ __m256i top_16x16 = _mm256_maddubs_epi16(top, twos);
+ __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos);
+ __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16);
+
+ _mm256_storeu_si256(row, sum_16x16);
+
+ input += luma_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd)
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const __m256i fours = _mm256_set1_epi8(4); // Thirty two fours
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
+ _mm256_storeu_si256(row, top_16x16);
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd)
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only
+ * performed on block of width 32.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ const __m256i zeros = _mm256_setzero_si256();
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0));
+
+ __m256i row_lo = _mm256_unpacklo_epi8(top, zeros);
+ row_lo = _mm256_slli_epi16(row_lo, 3);
+ __m256i row_hi = _mm256_unpackhi_epi8(top, zeros);
+ row_hi = _mm256_slli_epi16(row_hi, 3);
+
+ _mm256_storeu_si256(row, row_lo);
+ _mm256_storeu_si256(row + 1, row_hi);
+
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
+ */
+static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const int luma_stride = input_stride << 1;
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
+ __m256i sum = _mm256_add_epi16(top, bot);
+
+ __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+ __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride));
+ __m256i sum_1 = _mm256_add_epi16(top_1, bot_1);
+
+ __m256i hsum = _mm256_hadd_epi16(sum, sum_1);
+ hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
+ hsum = _mm256_add_epi16(hsum, hsum);
+
+ _mm256_storeu_si256(row, hsum);
+
+ input += luma_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd)
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ */
+static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+ __m256i hsum = _mm256_hadd_epi16(top, top_1);
+ hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
+ hsum = _mm256_slli_epi16(hsum, 2);
+
+ _mm256_storeu_si256(row, hsum);
+
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd)
+
+static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+ _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3));
+ _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3));
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd)
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
+ __m256i alpha_sign, __m256i dc_q0) {
+ __m256i ac_q3 = _mm256_loadu_si256(input);
+ __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
+ __m256i scaled_luma_q0 =
+ _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm256_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
+ uint8_t *dst, int dst_stride,
+ int alpha_q3, int width, int height) {
+ (void)width;
+ const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+ const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
+ const __m256i dc_q0 = _mm256_set1_epi16(*dst);
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+
+ do {
+ __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ res = _mm256_packus_epi16(res, next);
+ res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
+ _mm256_storeu_si256((__m256i *)dst, res);
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_PREDICT_X(avx2, 32, 8, lbd)
+CFL_PREDICT_X(avx2, 32, 16, lbd)
+CFL_PREDICT_X(avx2, 32, 32, lbd)
+
+cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
+ static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
+ cfl_predict_lbd_4x4_ssse3, /* 4x4 */
+ cfl_predict_lbd_8x8_ssse3, /* 8x8 */
+ cfl_predict_lbd_16x16_ssse3, /* 16x16 */
+ cfl_predict_lbd_32x32_avx2, /* 32x32 */
+ NULL, /* 64x64 (invalid CFL size) */
+ cfl_predict_lbd_4x8_ssse3, /* 4x8 */
+ cfl_predict_lbd_8x4_ssse3, /* 8x4 */
+ cfl_predict_lbd_8x16_ssse3, /* 8x16 */
+ cfl_predict_lbd_16x8_ssse3, /* 16x8 */
+ cfl_predict_lbd_16x32_ssse3, /* 16x32 */
+ cfl_predict_lbd_32x16_avx2, /* 32x16 */
+ NULL, /* 32x64 (invalid CFL size) */
+ NULL, /* 64x32 (invalid CFL size) */
+ cfl_predict_lbd_4x16_ssse3, /* 4x16 */
+ cfl_predict_lbd_16x4_ssse3, /* 16x4 */
+ cfl_predict_lbd_8x32_ssse3, /* 8x32 */
+ cfl_predict_lbd_32x8_avx2, /* 32x8 */
+ NULL, /* 16x64 (invalid CFL size) */
+ NULL, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+ // function pointer array out of bounds.
+ return pred[tx_size % TX_SIZES_ALL];
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static __m256i highbd_max_epi16(int bd) {
+ const __m256i neg_one = _mm256_set1_epi16(-1);
+ // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
+ return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one);
+}
+
+static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
+ return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
+}
+
+static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3,
+ uint16_t *dst, int dst_stride,
+ int alpha_q3, int bd, int width,
+ int height) {
+ // Use SSSE3 version for smaller widths
+ assert(width == 16 || width == 32);
+ const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+ const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
+ const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst);
+ const __m256i max = highbd_max_epi16(bd);
+
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ _mm256_storeu_si256((__m256i *)dst,
+ highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
+ if (width == 32) {
+ const __m256i res_1 =
+ predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ _mm256_storeu_si256(
+ (__m256i *)(dst + 16),
+ highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max));
+ }
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_PREDICT_X(avx2, 16, 4, hbd)
+CFL_PREDICT_X(avx2, 16, 8, hbd)
+CFL_PREDICT_X(avx2, 16, 16, hbd)
+CFL_PREDICT_X(avx2, 16, 32, hbd)
+CFL_PREDICT_X(avx2, 32, 8, hbd)
+CFL_PREDICT_X(avx2, 32, 16, hbd)
+CFL_PREDICT_X(avx2, 32, 32, hbd)
+
+cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
+ static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = {
+ cfl_predict_hbd_4x4_ssse3, /* 4x4 */
+ cfl_predict_hbd_8x8_ssse3, /* 8x8 */
+ cfl_predict_hbd_16x16_avx2, /* 16x16 */
+ cfl_predict_hbd_32x32_avx2, /* 32x32 */
+ NULL, /* 64x64 (invalid CFL size) */
+ cfl_predict_hbd_4x8_ssse3, /* 4x8 */
+ cfl_predict_hbd_8x4_ssse3, /* 8x4 */
+ cfl_predict_hbd_8x16_ssse3, /* 8x16 */
+ cfl_predict_hbd_16x8_avx2, /* 16x8 */
+ cfl_predict_hbd_16x32_avx2, /* 16x32 */
+ cfl_predict_hbd_32x16_avx2, /* 32x16 */
+ NULL, /* 32x64 (invalid CFL size) */
+ NULL, /* 64x32 (invalid CFL size) */
+ cfl_predict_hbd_4x16_ssse3, /* 4x16 */
+ cfl_predict_hbd_16x4_avx2, /* 16x4 */
+ cfl_predict_hbd_8x32_ssse3, /* 8x32 */
+ cfl_predict_hbd_32x8_avx2, /* 32x8 */
+ NULL, /* 16x64 (invalid CFL size) */
+ NULL, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+ // function pointer array out of bounds.
+ return pred[tx_size % TX_SIZES_ALL];
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Returns a vector where all the (32-bits) elements are the sum of all the
+// lanes in a.
+static INLINE __m256i fill_sum_epi32(__m256i a) {
+ // Given that a == [A, B, C, D, E, F, G, H]
+ a = _mm256_hadd_epi32(a, a);
+ // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H
+ // a == [A', C', A', C', E', G', E', G']
+ a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+ // a == [A', C', E', G', A', C', E', G']
+ a = _mm256_hadd_epi32(a, a);
+ // Given that A'' == A' + C' and E'' == E' + G'
+ // a == [A'', E'', A'', E'', A'', E'', A'', E'']
+ return _mm256_hadd_epi32(a, a);
+ // Given that A''' == A'' + E''
+ // a == [A''', A''', A''', A''', A''', A''', A''', A''']
+}
+
+static INLINE __m256i _mm256_addl_epi16(__m256i a) {
+ return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()),
+ _mm256_unpackhi_epi16(a, _mm256_setzero_si256()));
+}
+
+static INLINE void subtract_average_avx2(const uint16_t *src_ptr,
+ int16_t *dst_ptr, int width,
+ int height, int round_offset,
+ int num_pel_log2) {
+ // Use SSE2 version for smaller widths
+ assert(width == 16 || width == 32);
+
+ const __m256i *src = (__m256i *)src_ptr;
+ const __m256i *const end = src + height * CFL_BUF_LINE_I256;
+ // To maximize usage of the AVX2 registers, we sum two rows per loop
+ // iteration
+ const int step = 2 * CFL_BUF_LINE_I256;
+
+ __m256i sum = _mm256_setzero_si256();
+ // For width 32, we use a second sum accumulator to reduce accumulator
+ // dependencies in the loop.
+ __m256i sum2;
+ if (width == 32) sum2 = _mm256_setzero_si256();
+
+ do {
+ // Add top row to the bottom row
+ __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
+ _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
+ sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
+ if (width == 32) { /* Don't worry, this if it gets optimized out. */
+ // Add the second part of the top row to the second part of the bottom row
+ __m256i l1 =
+ _mm256_add_epi16(_mm256_loadu_si256(src + 1),
+ _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
+ sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
+ }
+ src += step;
+ } while (src < end);
+ // Combine both sum accumulators
+ if (width == 32) sum = _mm256_add_epi32(sum, sum2);
+
+ __m256i fill = fill_sum_epi32(sum);
+
+ __m256i avg_epi16 = _mm256_srli_epi32(
+ _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
+ avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);
+
+ // Store and subtract loop
+ src = (__m256i *)src_ptr;
+ __m256i *dst = (__m256i *)dst_ptr;
+ do {
+ _mm256_storeu_si256(dst,
+ _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
+ if (width == 32) {
+ _mm256_storeu_si256(
+ dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
+ }
+ src += CFL_BUF_LINE_I256;
+ dst += CFL_BUF_LINE_I256;
+ } while (src < end);
+}
+
+// Declare wrappers for AVX2 sizes
+CFL_SUB_AVG_X(avx2, 16, 4, 32, 6)
+CFL_SUB_AVG_X(avx2, 16, 8, 64, 7)
+CFL_SUB_AVG_X(avx2, 16, 16, 128, 8)
+CFL_SUB_AVG_X(avx2, 16, 32, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 8, 128, 8)
+CFL_SUB_AVG_X(avx2, 32, 16, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 32, 512, 10)
+
+// Based on the observation that for small blocks AVX2 does not outperform
+// SSE2, we call the SSE2 code for block widths 4 and 8.
+cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) {
+ static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
+ cfl_subtract_average_4x4_sse2, /* 4x4 */
+ cfl_subtract_average_8x8_sse2, /* 8x8 */
+ cfl_subtract_average_16x16_avx2, /* 16x16 */
+ cfl_subtract_average_32x32_avx2, /* 32x32 */
+ NULL, /* 64x64 (invalid CFL size) */
+ cfl_subtract_average_4x8_sse2, /* 4x8 */
+ cfl_subtract_average_8x4_sse2, /* 8x4 */
+ cfl_subtract_average_8x16_sse2, /* 8x16 */
+ cfl_subtract_average_16x8_avx2, /* 16x8 */
+ cfl_subtract_average_16x32_avx2, /* 16x32 */
+ cfl_subtract_average_32x16_avx2, /* 32x16 */
+ NULL, /* 32x64 (invalid CFL size) */
+ NULL, /* 64x32 (invalid CFL size) */
+ cfl_subtract_average_4x16_sse2, /* 4x16 */
+ cfl_subtract_average_16x4_avx2, /* 16x4 */
+ cfl_subtract_average_8x32_sse2, /* 8x32 */
+ cfl_subtract_average_32x8_avx2, /* 32x8 */
+ NULL, /* 16x64 (invalid CFL size) */
+ NULL, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
+ // index the function pointer array out of bounds.
+ return sub_avg[tx_size % TX_SIZES_ALL];
+}
diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h
new file mode 100644
index 0000000000..03ae02a922
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_simd.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_
+#define AOM_AV1_COMMON_X86_CFL_SIMD_H_
+
+#include "av1/common/blockd.h"
+
+// SSSE3 version is optimal for with == 4, we reuse them in AVX2
+void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 16, we reuse it in AVX2
+void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 4, we reuse them in AVX2
+void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 16, we reuse it in AVX2
+void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 4, we reuse them in AVX2
+void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 16, we reuse it in AVX2
+void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is faster for with == 16, we reuse it in AVX2
+void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is faster for with == 16, we reuse it in AVX2
+void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is faster for with == 16, we reuse it in AVX2
+void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// SSE2 version is optimal for with == 4, we reuse them in AVX2
+void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
+
+// SSE2 version is optimal for with == 8, we reuse them in AVX2
+void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
+
+void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_
diff --git a/third_party/aom/av1/common/x86/cfl_sse2.c b/third_party/aom/av1/common/x86/cfl_sse2.c
new file mode 100644
index 0000000000..4783fe098c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_sse2.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "av1/common/cfl.h"
+#include "config/av1_rtcd.h"
+
+static INLINE __m128i fill_sum_epi32(__m128i l0) {
+ l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
+ return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
+}
+
+static INLINE void subtract_average_sse2(const uint16_t *src_ptr,
+ int16_t *dst_ptr, int width,
+ int height, int round_offset,
+ int num_pel_log2) {
+ const __m128i zeros = _mm_setzero_si128();
+ const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
+ const __m128i *src = (__m128i *)src_ptr;
+ const __m128i *const end = src + height * CFL_BUF_LINE_I128;
+ const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
+
+ __m128i sum = zeros;
+ do {
+ __m128i l0;
+ if (width == 4) {
+ l0 = _mm_add_epi16(_mm_loadl_epi64(src),
+ _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
+ __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
+ _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpacklo_epi16(l1, zeros)));
+ } else {
+ if (width == 8) {
+ l0 = _mm_add_epi16(_mm_loadu_si128(src),
+ _mm_loadu_si128(src + CFL_BUF_LINE_I128));
+ } else {
+ l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1));
+ }
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpackhi_epi16(l0, zeros)));
+ if (width == 32) {
+ l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3));
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpackhi_epi16(l0, zeros)));
+ }
+ }
+ src += step;
+ } while (src < end);
+
+ sum = fill_sum_epi32(sum);
+
+ __m128i avg_epi16 =
+ _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
+ avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
+
+ src = (__m128i *)src_ptr;
+ __m128i *dst = (__m128i *)dst_ptr;
+ do {
+ if (width == 4) {
+ _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
+ } else {
+ _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
+ if (width > 8) {
+ _mm_storeu_si128(dst + 1,
+ _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
+ if (width == 32) {
+ _mm_storeu_si128(dst + 2,
+ _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
+ _mm_storeu_si128(dst + 3,
+ _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
+ }
+ }
+ }
+ src += CFL_BUF_LINE_I128;
+ dst += CFL_BUF_LINE_I128;
+ } while (src < end);
+}
+
+CFL_SUB_AVG_FN(sse2)
diff --git a/third_party/aom/av1/common/x86/cfl_ssse3.c b/third_party/aom/av1/common/x86/cfl_ssse3.c
new file mode 100644
index 0000000000..476b6609a9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_ssse3.c
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+// Load 32-bit integer from memory into the first element of dst.
+static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
+ return _mm_cvtsi32_si128(*((int *)mem_addr));
+}
+
+// Store 32-bit integer from the first element of a into memory.
+static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
+ *((int *)mem_addr) = _mm_cvtsi128_si32(a);
+}
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i twos = _mm_set1_epi8(2);
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
+ const int luma_stride = input_stride << 1;
+ do {
+ if (width == 4) {
+ __m128i top = _mm_loadh_epi32((__m128i *)input);
+ top = _mm_maddubs_epi16(top, twos);
+ __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
+ bot = _mm_maddubs_epi16(bot, twos);
+ const __m128i sum = _mm_add_epi16(top, bot);
+ _mm_storeh_epi32(pred_buf_m128i, sum);
+ } else if (width == 8) {
+ __m128i top = _mm_loadl_epi64((__m128i *)input);
+ top = _mm_maddubs_epi16(top, twos);
+ __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+ bot = _mm_maddubs_epi16(bot, twos);
+ const __m128i sum = _mm_add_epi16(top, bot);
+ _mm_storel_epi64(pred_buf_m128i, sum);
+ } else {
+ __m128i top = _mm_loadu_si128((__m128i *)input);
+ top = _mm_maddubs_epi16(top, twos);
+ __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+ bot = _mm_maddubs_epi16(bot, twos);
+ const __m128i sum = _mm_add_epi16(top, bot);
+ _mm_storeu_si128(pred_buf_m128i, sum);
+ if (width == 32) {
+ __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ __m128i bot_1 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
+ top_1 = _mm_maddubs_epi16(top_1, twos);
+ bot_1 = _mm_maddubs_epi16(bot_1, twos);
+ __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
+ _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
+ }
+ }
+ input += luma_stride;
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ } while (pred_buf_m128i < end);
+}
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i fours = _mm_set1_epi8(4);
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+ do {
+ if (width == 4) {
+ __m128i top = _mm_loadh_epi32((__m128i *)input);
+ top = _mm_maddubs_epi16(top, fours);
+ _mm_storeh_epi32(pred_buf_m128i, top);
+ } else if (width == 8) {
+ __m128i top = _mm_loadl_epi64((__m128i *)input);
+ top = _mm_maddubs_epi16(top, fours);
+ _mm_storel_epi64(pred_buf_m128i, top);
+ } else {
+ __m128i top = _mm_loadu_si128((__m128i *)input);
+ top = _mm_maddubs_epi16(top, fours);
+ _mm_storeu_si128(pred_buf_m128i, top);
+ if (width == 32) {
+ __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ top_1 = _mm_maddubs_epi16(top_1, fours);
+ _mm_storeu_si128(pred_buf_m128i + 1, top_1);
+ }
+ }
+ input += input_stride;
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ } while (pred_buf_m128i < end);
+}
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3).
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i zeros = _mm_setzero_si128();
+ const int luma_stride = input_stride;
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+ do {
+ if (width == 4) {
+ __m128i row = _mm_loadh_epi32((__m128i *)input);
+ row = _mm_unpacklo_epi8(row, zeros);
+ _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3));
+ } else if (width == 8) {
+ __m128i row = _mm_loadl_epi64((__m128i *)input);
+ row = _mm_unpacklo_epi8(row, zeros);
+ _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3));
+ } else {
+ __m128i row = _mm_loadu_si128((__m128i *)input);
+ const __m128i row_lo = _mm_unpacklo_epi8(row, zeros);
+ const __m128i row_hi = _mm_unpackhi_epi8(row, zeros);
+ _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3));
+ _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3));
+ if (width == 32) {
+ __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros);
+ const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros);
+ _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3));
+ _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3));
+ }
+ }
+ input += luma_stride;
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ } while (pred_buf_m128i < end);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+ const int luma_stride = input_stride << 1;
+ do {
+ if (width == 4) {
+ const __m128i top = _mm_loadl_epi64((__m128i *)input);
+ const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+ __m128i sum = _mm_add_epi16(top, bot);
+ sum = _mm_hadd_epi16(sum, sum);
+ *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
+ } else {
+ const __m128i top = _mm_loadu_si128((__m128i *)input);
+ const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+ __m128i sum = _mm_add_epi16(top, bot);
+ if (width == 8) {
+ sum = _mm_hadd_epi16(sum, sum);
+ _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+ } else {
+ const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ const __m128i bot_1 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
+ sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
+ _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+ if (width == 32) {
+ const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+ const __m128i bot_2 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
+ const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+ const __m128i bot_3 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
+ const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
+ const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
+ __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
+ _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
+ _mm_add_epi16(next_sum, next_sum));
+ }
+ }
+ }
+ input += luma_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+ do {
+ if (width == 4) {
+ const __m128i top = _mm_loadl_epi64((__m128i *)input);
+ const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
+ _mm_storeh_epi32(pred_buf_m128i, sum);
+ } else {
+ const __m128i top = _mm_loadu_si128((__m128i *)input);
+ if (width == 8) {
+ const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
+ _mm_storel_epi64(pred_buf_m128i, sum);
+ } else {
+ const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2);
+ _mm_storeu_si128(pred_buf_m128i, sum);
+ if (width == 32) {
+ const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+ const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+ const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2);
+ _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
+ }
+ }
+ }
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ input += input_stride;
+ } while (pred_buf_m128i < end);
+}
+
+static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+ do {
+ if (width == 4) {
+ const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3);
+ _mm_storel_epi64((__m128i *)pred_buf_q3, row);
+ } else {
+ const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3);
+ _mm_storeu_si128((__m128i *)pred_buf_q3, row);
+ if (width >= 16) {
+ __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ row_1 = _mm_slli_epi16(row_1, 3);
+ _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1);
+ if (width == 32) {
+ __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+ row_2 = _mm_slli_epi16(row_2, 3);
+ _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2);
+ __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+ row_3 = _mm_slli_epi16(row_3, 3);
+ _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3);
+ }
+ }
+ }
+ input += input_stride;
+ pred_buf_q3 += CFL_BUF_LINE;
+ } while (pred_buf_q3 < end);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
+
+static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
+ __m128i alpha_sign, __m128i dc_q0) {
+ __m128i ac_q3 = _mm_loadu_si128(input);
+ __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3,
+ uint8_t *dst, int dst_stride,
+ int alpha_q3, int width, int height) {
+ const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ const __m128i dc_q0 = _mm_set1_epi16(*dst);
+ __m128i *row = (__m128i *)pred_buf_q3;
+ const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
+ do {
+ __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ if (width < 16) {
+ res = _mm_packus_epi16(res, res);
+ if (width == 4)
+ _mm_storeh_epi32((__m128i *)dst, res);
+ else
+ _mm_storel_epi64((__m128i *)dst, res);
+ } else {
+ __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ res = _mm_packus_epi16(res, next);
+ _mm_storeu_si128((__m128i *)dst, res);
+ if (width == 32) {
+ res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
+ next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
+ res = _mm_packus_epi16(res, next);
+ _mm_storeu_si128((__m128i *)(dst + 16), res);
+ }
+ }
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I128) < row_end);
+}
+
+CFL_PREDICT_FN(ssse3, lbd)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE __m128i highbd_max_epi16(int bd) {
+ const __m128i neg_one = _mm_set1_epi16(-1);
+ // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
+ return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one);
+}
+
+static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) {
+ return _mm_max_epi16(_mm_min_epi16(u, max), zero);
+}
+
+static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3,
+ uint16_t *dst, int dst_stride,
+ int alpha_q3, int bd, int width,
+ int height) {
+ const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ const __m128i dc_q0 = _mm_set1_epi16(*dst);
+ const __m128i max = highbd_max_epi16(bd);
+ const __m128i zeros = _mm_setzero_si128();
+ __m128i *row = (__m128i *)pred_buf_q3;
+ const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
+ do {
+ __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ res = highbd_clamp_epi16(res, zeros, max);
+ if (width == 4) {
+ _mm_storel_epi64((__m128i *)dst, res);
+ } else {
+ _mm_storeu_si128((__m128i *)dst, res);
+ }
+ if (width >= 16) {
+ const __m128i res_1 =
+ predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ _mm_storeu_si128(((__m128i *)dst) + 1,
+ highbd_clamp_epi16(res_1, zeros, max));
+ }
+ if (width == 32) {
+ const __m128i res_2 =
+ predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
+ _mm_storeu_si128((__m128i *)(dst + 16),
+ highbd_clamp_epi16(res_2, zeros, max));
+ const __m128i res_3 =
+ predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
+ _mm_storeu_si128((__m128i *)(dst + 24),
+ highbd_clamp_epi16(res_3, zeros, max));
+ }
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I128) < row_end);
+}
+
+CFL_PREDICT_FN(ssse3, hbd)
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
new file mode 100644
index 0000000000..1b39a0a8d5
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "third_party/SVT-AV1/convolve_2d_avx2.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "av1/common/convolve.h"
+
+void av1_convolve_2d_sr_general_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ if (filter_params_x->taps > 8) {
+ const int bd = 8;
+ int im_stride = 8, i;
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ assert(conv_params->round_0 > 0);
+
+ const __m256i round_const_h12 = _mm256_set1_epi32(
+ ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m256i sum_round_v = _mm256_set1_epi32(
+ (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const __m256i round_const_v = _mm256_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
+
+ __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };
+
+ int horiz_tap = 12;
+ int vert_tap = 12;
+
+ prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
+ prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);
+
+ int im_h = h + vert_tap - 1;
+ const int fo_vert = vert_tap / 2 - 1;
+ const int fo_horiz = horiz_tap / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ for (int j = 0; j < w; j += 8) {
+ CONVOLVE_SR_HORIZONTAL_FILTER_12TAP
+ CONVOLVE_SR_VERTICAL_FILTER_12TAP
+ }
+ } else {
+ const int bd = 8;
+ int im_stride = 8, i;
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ assert(conv_params->round_0 > 0);
+
+ const __m256i round_const_h =
+ _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) +
+ (1 << (bd + FILTER_BITS - 2)));
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ const __m256i sum_round_v = _mm256_set1_epi32(
+ (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const __m256i round_const_v = _mm256_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
+
+ __m256i filt[4], coeffs_h[4], coeffs_v[4];
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
+
+ int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
+ int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
+
+ if (horiz_tap == 6)
+ prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+ else
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+
+ if (vert_tap == 6)
+ prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
+ else
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
+
+ int im_h = h + vert_tap - 1;
+ const int fo_vert = vert_tap / 2 - 1;
+ const int fo_horiz = horiz_tap / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ for (int j = 0; j < w; j += 8) {
+ if (horiz_tap == 4) {
+ CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
+ } else if (horiz_tap == 6) {
+ CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
+ } else {
+ CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
+ }
+
+ if (vert_tap == 4) {
+ CONVOLVE_SR_VERTICAL_FILTER_4TAP
+ } else if (vert_tap == 6) {
+ CONVOLVE_SR_VERTICAL_FILTER_6TAP
+ } else {
+ CONVOLVE_SR_VERTICAL_FILTER_8TAP
+ }
+ }
+ }
+}
+
+void av1_convolve_2d_sr_avx2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
+ const int32_t subpel_y_q4, ConvolveParams *conv_params) {
+ const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4);
+ const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4);
+
+ const bool use_general = (tap_x == 12 || tap_y == 12);
+ if (use_general) {
+ av1_convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y,
+ subpel_x_q4, subpel_y_q4, conv_params);
+ } else {
+ av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y,
+ subpel_x_q4, subpel_y_q4, conv_params);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
new file mode 100644
index 0000000000..1b85f37294
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "av1/common/convolve.h"
+
+void av1_convolve_2d_sr_12tap_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = w;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ assert(conv_params->round_0 > 0);
+ __m128i coeffs[6];
+
+ /* Horizontal filter */
+ {
+ prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data_2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 4)]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+ const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+ const __m128i src_4 = _mm_unpacklo_epi8(data_2, zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+ const __m128i src_6 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 2), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+ const __m128i src_8 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 4), zero);
+ const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]);
+ const __m128i src_10 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 6), zero);
+ const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]);
+
+ const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246);
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]);
+ const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]);
+ const __m128i src_5 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 1), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]);
+ const __m128i src_7 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 3), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]);
+ const __m128i src_9 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 5), zero);
+ const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]);
+ const __m128i src_11 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 7), zero);
+ const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]);
+
+ const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357);
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
+
+ const __m128i sum_round =
+ _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+ const __m128i src_8 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 8 * im_stride),
+ *(__m128i *)(data + 9 * im_stride));
+ const __m128i src_10 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 10 * im_stride),
+ *(__m128i *)(data + 11 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+ const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]);
+ const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]);
+
+ const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+ __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246);
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+ const __m128i src_9 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 8 * im_stride),
+ *(__m128i *)(data + 9 * im_stride));
+ const __m128i src_11 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 10 * im_stride),
+ *(__m128i *)(data + 11 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]);
+ const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]);
+ const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]);
+
+ const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357);
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
+ __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
+
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+ res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+ round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+
+ // Accumulate values into the destination buffer
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+
+ _mm_storel_epi64(p, res);
+ }
+ }
+ }
+}
+
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ if (filter_params_x->taps > 8) {
+ if (w < 8) {
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
+ } else {
+ av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y,
+ subpel_x_qn, subpel_y_qn, conv_params);
+ }
+ } else {
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ assert(conv_params->round_0 > 0);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i src_2 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i sum_round = _mm_set1_epi32(
+ (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
+ __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
+
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+ res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+ round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+
+ // Accumulate values into the destination buffer
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+
+ if (w == 2) {
+ *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
+ } else if (w == 4) {
+ *(int *)p = _mm_cvtsi128_si32(res);
+ } else {
+ _mm_storel_epi64(p, res);
+ }
+ }
+ }
+ }
+ }
+}
+
+void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w,
+ int h, ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ int i, j;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ assert((w % 4) == 0);
+
+ if (!(w % 16)) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+ const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
+
+ const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero);
+ const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero);
+
+ const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift);
+ const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const);
+
+ const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift);
+ const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const);
+
+ if (do_average) {
+ const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j]));
+ const __m128i data_ref_0_hi =
+ _mm_loadu_si128((__m128i *)(&dst[j + 8]));
+
+ const __m128i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_lo = convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_hi = convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 =
+ _mm_packus_epi16(round_result_lo, round_result_hi);
+
+ _mm_store_si128((__m128i *)(&dst0[j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo);
+ _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi);
+ }
+ }
+ src += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
+ const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
+
+ const __m128i res = _mm_sll_epi16(d16_0, left_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res, offset_const);
+
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+
+ if (w > 4)
+ _mm_storel_epi64((__m128i *)(&dst0[j]), res_8);
+ else
+ *(int *)(&dst0[j]) = _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[j]), res_unsigned);
+ }
+ }
+ src += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c
new file mode 100644
index 0000000000..3862bbeac1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_avx2.c
@@ -0,0 +1,916 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "third_party/SVT-AV1/convolve_avx2.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static AOM_INLINE void av1_convolve_y_sr_general_avx2(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
+ // right shift is F-1 because we are already dividing
+ // filter co-efficients by 2
+ const int right_shift_bits = (FILTER_BITS - 1);
+ __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
+ __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);
+
+ __m256i coeffs[6], s[12];
+ __m128i d[10];
+
+ int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
+
+ if (vert_tap == 6)
+ prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
+ else if (vert_tap == 12) {
+ prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
+ } else {
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
+ }
+
+ // vert_filt as 4 tap
+ if (vert_tap == 4) {
+ const int fo_vert = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (int j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+
+ s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+ d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else if (vert_tap == 6) {
+ const int fo_vert = vert_tap / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ for (int j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
+ const __m256i src_34a =
+ _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+
+ s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else if (vert_tap == 12) { // vert_tap == 12
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ const __m256i v_zero = _mm256_setzero_si256();
+ right_shift = _mm_cvtsi32_si128(FILTER_BITS);
+ right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
+
+ for (int j = 0; j < w; j += 8) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src10;
+
+ d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
+ d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
+ d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
+ d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);
+
+ const __m256i src_67a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);
+
+ const __m256i src_78a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);
+
+ const __m256i src_89a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);
+
+ src10 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
+ const __m256i src_910a =
+ _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);
+
+ const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
+ const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
+ const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
+ const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
+ const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
+ const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
+ const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
+ const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
+ const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
+ const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);
+
+ s[0] = _mm256_unpacklo_epi16(src_01, src_12);
+ s[1] = _mm256_unpacklo_epi16(src_23, src_34);
+ s[2] = _mm256_unpacklo_epi16(src_45, src_56);
+ s[3] = _mm256_unpacklo_epi16(src_67, src_78);
+ s[4] = _mm256_unpacklo_epi16(src_89, src_910);
+
+ s[6] = _mm256_unpackhi_epi16(src_01, src_12);
+ s[7] = _mm256_unpackhi_epi16(src_23, src_34);
+ s[8] = _mm256_unpackhi_epi16(src_45, src_56);
+ s[9] = _mm256_unpackhi_epi16(src_67, src_78);
+ s[10] = _mm256_unpackhi_epi16(src_89, src_910);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_1011a = _mm256_permute2x128_si256(
+ src10,
+ _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
+ 0x20);
+
+ src10 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));
+
+ const __m256i src_1112a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
+ src10, 0x20);
+
+ const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
+ const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);
+
+ s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
+ s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);
+
+ const __m256i res_lo = convolve_12taps(s, coeffs);
+
+ const __m256i res_32b_lo = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 4) {
+ const __m256i res_hi = convolve_12taps(s + 6, coeffs);
+
+ const __m256i res_32b_hi = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi, right_shift_const), right_shift);
+ __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 2) {
+ *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
+ *(int *)&dst[i * dst_stride + j + dst_stride] =
+ _mm_cvtsi128_si32(res_1);
+ } else {
+ *(uint16_t *)&dst[i * dst_stride + j] =
+ (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
+ (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+ s[3] = s[4];
+ s[4] = s[5];
+
+ s[6] = s[7];
+ s[7] = s[8];
+ s[8] = s[9];
+ s[9] = s[10];
+ s[10] = s[11];
+ }
+ }
+ } else {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ for (int j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a =
+ _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+ s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_67a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ const __m256i res_lo = convolve_lowbd(s, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t w,
+ int32_t h,
+ const InterpFilterParams *filter_params_y,
+ const int32_t subpel_y_q4) {
+ const int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
+
+ if (vert_tap == 12) {
+ av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_q4);
+ } else {
+ av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_q4);
+ }
+}
+
+static AOM_INLINE void av1_convolve_x_sr_general_avx2(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+ __m256i round_0_const =
+ _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
+ __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+ __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
+ int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+ assert(conv_params->round_0 > 0);
+
+ __m256i coeffs[6], filt[4];
+ filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ if (horiz_tap == 6)
+ prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
+ else if (horiz_tap == 12) {
+ prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
+ } else {
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
+ }
+
+ // horz_filt as 4 tap
+ if (horiz_tap == 4) {
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ if (w <= 8) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
+
+ __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ } else if (w > 2) {
+ xx_storel_32(&dst[i * dst_stride], res_0);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+ // 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
+ }
+ }
+ } else if (horiz_tap == 6) {
+ const int fo_horiz = horiz_tap / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ if (w <= 8) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
+
+ __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ } else if (w > 2) {
+ xx_storel_32(&dst[i * dst_stride], res_0);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+ // 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
+ }
+ }
+ } else if (horiz_tap == 12) { // horiz_tap == 12
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ const __m256i v_zero = _mm256_setzero_si256();
+ round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
+ round_const = _mm256_set1_epi32((1 << bits) >> 1);
+ round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
+ __m256i s[6];
+
+ if (w <= 4) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
+ // row0 0..7 row1 0..7
+ const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
+ // row0 8..F row1 8..F
+ const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
+
+ // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
+ const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
+ // row0 04 04 .. 07 07 row1 04 04 .. 07 07
+ const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
+
+ // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
+ const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
+ // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
+ const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
+
+ // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
+ s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
+ // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
+ s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
+ // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
+ s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
+ // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
+ s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
+ // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
+ s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
+ // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
+ s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
+
+ const __m256i res_lo = convolve_12taps(s, coeffs);
+
+ __m256i res_32b_lo = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
+
+ // 00 01 02 03 10 12 13 14
+ res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
+ round_shift);
+ // 8 bit conversion and saturation to uint8
+ // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
+ __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
+ // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
+ // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
+ const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
+ // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w > 2) {
+ // 00 01 02 03
+ *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0);
+ // 10 11 12 13
+ *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1);
+ } else {
+ // 00 01
+ *(uint16_t *)&dst[i * dst_stride] =
+ (uint16_t)_mm_cvtsi128_si32(res_0);
+ // 10 11
+ *(uint16_t *)&dst[i * dst_stride + dst_stride] =
+ (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; i++) {
+ for (int j = 0; j < w; j += 8) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
+ 0x20);
+ // row0 0..7 4..B
+ const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
+ // row0 8..F C..13
+ const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
+
+ // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
+ const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
+ // row0 04 04 .. 07 07 08 08 .. 0B 0B
+ const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
+
+ // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
+ const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
+ // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
+ const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
+
+ s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
+ s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
+ s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
+ s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
+ s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
+ s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
+
+ const __m256i res_lo = convolve_12taps(s, coeffs);
+
+ __m256i res_32b_lo = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
+
+ res_32b_lo = _mm256_sra_epi32(
+ _mm256_add_epi32(res_32b_lo, round_const), round_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+ const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
+ *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1);
+ }
+ }
+ }
+ } else {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ if (w <= 8) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
+
+ __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ } else if (w > 2) {
+ xx_storel_32(&dst[i * dst_stride], res_0);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+ // 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
+ }
+ }
+ }
+}
+
+void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t w,
+ int32_t h,
+ const InterpFilterParams *filter_params_x,
+ const int32_t subpel_x_q4,
+ ConvolveParams *conv_params) {
+ const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
+
+ if (horz_tap == 12) {
+ av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_q4, conv_params);
+ } else {
+ av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_q4,
+ conv_params);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
new file mode 100644
index 0000000000..012e75c1ae
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -0,0 +1,500 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "av1/common/convolve.h"
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+ const int subpel_q4,
+ __m128i *const coeffs /* [4] */) {
+ const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1
+ coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+ coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7
+}
+
+static INLINE __m128i convolve(const __m128i *const s,
+ const __m128i *const coeffs) {
+ const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
+ const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
+ const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
+ const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
+ const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
+ return d;
+}
+
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
+ ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+ ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+ ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
+ ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
+ ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
+ ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ int subpel_y_qn) {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_vert * src_stride;
+ const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
+ __m128i coeffs[6];
+
+ prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
+
+ int j = 0;
+ do {
+ __m128i s[12], src10, res_lo, res_hi;
+ __m128i res_lo_round, res_hi_round, res16, res;
+ const uint8_t *data = &src_ptr[j];
+
+ src10 = _mm_loadl_epi64((__m128i *)(data + 10 * src_stride));
+ s[0] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ s[1] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ s[2] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ s[3] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ s[4] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ s[5] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 5 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)));
+ s[6] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 6 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+ s[7] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)));
+ s[8] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 8 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)));
+ s[9] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)), src10);
+
+ int i = 0;
+ do {
+ data = &src_ptr[i * src_stride + j];
+ s[10] = _mm_unpacklo_epi8(
+ src10, _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)));
+ src10 = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride));
+ s[11] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)), src10);
+
+ res_lo = convolve_lo_y_12tap(s, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y_12tap(s, coeffs); // Filter high index pixels
+
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
+
+ res_lo = convolve_lo_y_12tap(s + 1, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y_12tap(s + 1, coeffs); // Filter high index pixels
+
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ s[6] = s[8];
+ s[7] = s[9];
+ s[8] = s[10];
+ s[9] = s[11];
+ } while (i < h);
+ j += 8;
+ } while (j < w);
+}
+
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn) {
+ if (filter_params_y->taps > 8) {
+ if (w < 8) {
+ av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_qn);
+ } else {
+ av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_qn);
+ }
+ } else {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_vert * src_stride;
+ const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
+ __m128i coeffs[4];
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
+
+ if (w <= 4) {
+ __m128i s[8], src6, res, res_round, res16;
+ int res_int;
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
+
+ do {
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
+
+ res = convolve_lo_y(s + 0, coeffs);
+ res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+ res16 = _mm_packs_epi32(res_round, res_round);
+ res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+
+ if (w == 2)
+ *(uint16_t *)dst = (uint16_t)res_int;
+ else
+ *(int *)dst = res_int;
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+
+ res = convolve_lo_y(s + 1, coeffs);
+ res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+ res16 = _mm_packs_epi32(res_round, res_round);
+ res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+
+ if (w == 2)
+ *(uint16_t *)dst = (uint16_t)res_int;
+ else
+ *(int *)dst = res_int;
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ h -= 2;
+ } while (h);
+ } else {
+ assert(!(w % 8));
+ int j = 0;
+ do {
+ __m128i s[8], src6, res_lo, res_hi;
+ __m128i res_lo_round, res_hi_round, res16, res;
+ const uint8_t *data = &src_ptr[j];
+
+ src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+ int i = 0;
+ do {
+ data = &src_ptr[i * src_stride + j];
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+ src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+ res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
+
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
+
+ res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
+
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ } while (i < h);
+ j += 8;
+ } while (j < w);
+ }
+ }
+}
+
+void av1_convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i round_0_const =
+ _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+ const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i coeffs[6];
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);
+
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ __m128i s[4];
+
+ s[0] = _mm_unpacklo_epi16(data, _mm_srli_si128(data, 1));
+ s[1] =
+ _mm_unpacklo_epi16(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+ s[2] =
+ _mm_unpacklo_epi16(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+ s[3] =
+ _mm_unpacklo_epi16(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+
+ const __m128i res32 = convolve_lo_x_12tap(s, coeffs, zero);
+
+ __m128i res32_round =
+ _mm_sra_epi32(_mm_add_epi32(res32, round_0_const), round_0_shift);
+ res32_round =
+ _mm_sra_epi32(_mm_add_epi32(res32_round, round_const), round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res32_round, zero);
+ const __m128i res = _mm_packus_epi16(res16, zero);
+
+ const int val = _mm_cvtsi128_si32(res);
+ memcpy((dst + i * dst_stride + j), &val, sizeof(val));
+ j += 4;
+ } while (j < w);
+ } while (++i < h);
+}
+
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ if (filter_params_x->taps > 8) {
+ if (w < 4) {
+ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_qn, conv_params);
+ } else {
+ av1_convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_qn, conv_params);
+ }
+ } else {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i round_0_const =
+ _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+ const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+ __m128i coeffs[4];
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
+
+ if (w <= 4) {
+ do {
+ const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+ __m128i s[4];
+
+ s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+ s[1] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+ s[2] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+ s[3] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+ const __m128i res_lo = convolve_lo_x(s, coeffs);
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+
+ int r = _mm_cvtsi128_si32(res);
+ if (w == 2)
+ *(uint16_t *)dst = (uint16_t)r;
+ else
+ *(int *)dst = r;
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--h);
+ } else {
+ assert(!(w % 8));
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ __m128i s[4];
+
+ // Filter even-index pixels
+ s[0] = data;
+ s[1] = _mm_srli_si128(data, 2);
+ s[2] = _mm_srli_si128(data, 4);
+ s[3] = _mm_srli_si128(data, 6);
+ const __m128i res_even = convolve_lo_x(s, coeffs);
+
+ // Filter odd-index pixels
+ s[0] = _mm_srli_si128(data, 1);
+ s[1] = _mm_srli_si128(data, 3);
+ s[2] = _mm_srli_si128(data, 5);
+ s[3] = _mm_srli_si128(data, 7);
+ const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+ __m128i res_lo_round = _mm_sra_epi32(
+ _mm_add_epi32(res_lo, round_0_const), round_0_shift);
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+ __m128i res_hi_round = _mm_sra_epi32(
+ _mm_add_epi32(res_hi, round_0_const), round_0_shift);
+ res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+ round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ j += 8;
+ } while (j < w);
+ } while (++i < h);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c
new file mode 100644
index 0000000000..d05bb0e15f
--- /dev/null
+++ b/third_party/aom/av1/common/x86/filterintra_sse4.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+#include <string.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+//------------------------------------------------------------------------------
+// filter_intra_predictor_sse4_1
+
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+#define DUPLICATE_FIRST_HALF 0x44
+
+// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th
+// at zero to preserve the sum.
+static INLINE void filter_4x2_sse4_1(uint8_t *dst, const ptrdiff_t stride,
+ const __m128i *pixels,
+ const __m128i *taps_0_1,
+ const __m128i *taps_2_3,
+ const __m128i *taps_4_5,
+ const __m128i *taps_6_7) {
+ const __m128i mul_0_01 = _mm_maddubs_epi16(*pixels, *taps_0_1);
+ const __m128i mul_0_23 = _mm_maddubs_epi16(*pixels, *taps_2_3);
+ // |output_half| contains 8 partial sums.
+ __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+ __m128i output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row0 =
+ _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4),
+ /* arbitrary pack arg */ output);
+ xx_storel_32(dst, output_row0);
+ const __m128i mul_1_01 = _mm_maddubs_epi16(*pixels, *taps_4_5);
+ const __m128i mul_1_23 = _mm_maddubs_epi16(*pixels, *taps_6_7);
+ output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+ output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row1 =
+ _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4),
+ /* arbitrary pack arg */ output);
+ xx_storel_32(dst + stride, output_row1);
+}
+
+// 4xH transform sizes are given special treatment because xx_loadl_64 goes out
+// of bounds and every block involves the left column. This implementation
+// loads TL from the top row for the first block, so it is not
+static INLINE void filter_4xh(uint8_t *dest, ptrdiff_t stride,
+ const uint8_t *const top_ptr,
+ const uint8_t *const left_ptr, int mode,
+ const int height) {
+ const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]);
+ const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]);
+ const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]);
+ const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]);
+ __m128i top = xx_loadl_32(top_ptr - 1);
+ __m128i pixels = _mm_insert_epi8(top, (int8_t)top_ptr[3], 4);
+ __m128i left = (height == 4 ? xx_loadl_32(left_ptr) : xx_loadl_64(left_ptr));
+ left = _mm_slli_si128(left, 5);
+
+ // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
+ // left[2], left[3], left[4], left[5], left[6], left[7]
+ pixels = _mm_or_si128(left, pixels);
+
+ // Duplicate first 8 bytes.
+ pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ dest += stride; // Move to y = 1.
+ pixels = xx_loadl_32(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
+ // left[0], left[1], ...
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
+ // byte is an unused value, which shall be multiplied by 0 when we apply the
+ // filter.
+ const int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+ // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 2.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ dest += stride; // Move to y = 3.
+
+ // Compute the middle 8 rows before using common code for the final 4 rows.
+ // Because the common code below this block assumes that
+ if (height == 16) {
+ // This shift allows us to use pixel_order2 twice after shifting by 2 later.
+ left = _mm_slli_si128(left, 1);
+ pixels = xx_loadl_32(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
+ // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The
+ // last byte is an unused value, as above. The top-left was shifted to
+ // position nine to keep two empty spaces after the top pixels.
+ const int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+ // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
+ // the end.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 4.
+
+ // First 4x2 in the if body.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+
+ // Clear all but final pixel in the first 8 of left column.
+ __m128i keep_top_left = _mm_srli_si128(left, 13);
+ dest += stride; // Move to y = 5.
+ pixels = xx_loadl_32(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+ // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+ pixels = _mm_or_si128(left, pixels);
+ left = xx_loadl_64(left_ptr + 8);
+
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 6.
+
+ // Second 4x2 in the if body.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+
+ // Position TL value so we can use pixel_order1.
+ keep_top_left = _mm_slli_si128(keep_top_left, 6);
+ dest += stride; // Move to y = 7.
+ pixels = xx_loadl_32(dest);
+ left = _mm_slli_si128(left, 7);
+ left = _mm_or_si128(left, keep_top_left);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 8.
+
+ // Third 4x2 in the if body.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ dest += stride; // Move to y = 9.
+
+ // Prepare final inputs.
+ pixels = xx_loadl_32(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 10.
+
+ // Fourth 4x2 in the if body.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ dest += stride; // Move to y = 11.
+ }
+
+ // In both the 8 and 16 case, we assume that the left vector has the next TL
+ // at position 8.
+ if (height > 4) {
+ // Erase prior left pixels by shifting TL to position 0.
+ left = _mm_srli_si128(left, 8);
+ left = _mm_slli_si128(left, 6);
+ pixels = xx_loadl_32(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 12 or 4.
+
+ // First of final two 4x2 blocks.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ dest += stride; // Move to y = 13 or 5.
+ pixels = xx_loadl_32(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 14 or 6.
+
+ // Last of final two 4x2 blocks.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ }
+}
+
+static INLINE void filter_intra_predictor_sse4_1(void *const dest,
+ ptrdiff_t stride,
+ const void *const top_row,
+ const void *const left_column,
+ int mode, const int width,
+ const int height) {
+ const uint8_t *const top_ptr = (const uint8_t *)top_row;
+ const uint8_t *const left_ptr = (const uint8_t *)left_column;
+ uint8_t *dst = (uint8_t *)dest;
+ if (width == 4) {
+ filter_4xh(dst, stride, top_ptr, left_ptr, mode, height);
+ return;
+ }
+
+ // There is one set of 7 taps for each of the 4x2 output pixels.
+ const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]);
+ const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]);
+ const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]);
+ const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]);
+
+ // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
+ // the end is an unused value, which shall be multiplied by 0 when we apply
+ // the filter.
+ const int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+ // Takes the "left section" and puts it right after p0-p4.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+ // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
+ // byte is unused as above.
+ const int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+ // Shuffles the "top left" from the left section, to the front. Used when
+ // grabbing data from left_column and not top_row.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+ // This first pass takes care of the cases where the top left pixel comes from
+ // top_row.
+ __m128i pixels = xx_loadl_64(top_ptr - 1);
+ __m128i left = _mm_slli_si128(xx_loadl_32(left_column), 8);
+ pixels = _mm_or_si128(pixels, left);
+
+ // Two sets of the same pixels to multiply with two sets of taps.
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ left = _mm_srli_si128(left, 1);
+
+ // Load
+ pixels = xx_loadl_32(dst + stride);
+
+ // Because of the above shift, this OR 'invades' the final of the first 8
+ // bytes of |pixels|. This is acceptable because the 8th filter tap is always
+ // a padded 0.
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ const ptrdiff_t stride2 = stride << 1;
+ const ptrdiff_t stride4 = stride << 2;
+ filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+ &taps_4_5, &taps_6_7);
+ dst += 4;
+ for (int x = 3; x < width - 4; x += 4) {
+ pixels = xx_loadl_32(top_ptr + x);
+ pixels = _mm_insert_epi8(pixels, (int8_t)top_ptr[x + 4], 4);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+ filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ pixels = xx_loadl_32(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + stride2 - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+ filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+ &taps_4_5, &taps_6_7);
+ dst += 4;
+ }
+
+ // Now we handle heights that reference previous blocks rather than top_row.
+ for (int y = 4; y < height; y += 4) {
+ // Leftmost 4x4 block for this height.
+ dst -= width;
+ dst += stride4;
+
+ // Top Left is not available by offset in these leftmost blocks.
+ pixels = xx_loadl_32(dst - stride);
+ left = _mm_slli_si128(xx_loadl_32(left_ptr + y - 1), 8);
+ left = _mm_insert_epi8(left, (int8_t)left_ptr[y + 3], 12);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+
+ // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
+ left = _mm_srli_si128(left, 2);
+ pixels = xx_loadl_32(dst + stride);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+ &taps_4_5, &taps_6_7);
+
+ dst += 4;
+
+ // Remaining 4x4 blocks for this height.
+ for (int x = 4; x < width; x += 4) {
+ pixels = xx_loadl_32(dst - stride - 1);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[-stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+ filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ pixels = xx_loadl_32(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 + stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+ filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+ &taps_4_5, &taps_6_7);
+ dst += 4;
+ }
+ }
+}
+
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ TX_SIZE tx_size, const uint8_t *above,
+ const uint8_t *left, int mode) {
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+ filter_intra_predictor_sse4_1(dst, stride, above, left, mode, bw, bh);
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
new file mode 100644
index 0000000000..d65318ccfa
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_convolve_2d_sr_ssse3(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params, int bd) {
+ if (filter_params_x->taps == 12) {
+ av1_highbd_convolve_2d_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y,
+ subpel_x_qn, subpel_y_qn, conv_params, bd);
+ return;
+ }
+
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m256i s[8], coeffs_y[4], coeffs_x[4];
+
+ const __m256i round_const_x = _mm256_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m256i round_const_y = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 = _mm256_setzero_si256();
+ if (i + 1 < im_h)
+ row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm256_unpackhi_epi16(s0, s1);
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+ res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+ res_b_round =
+ _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
+ round_shift_bits);
+
+ __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+ res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_16bit, 1));
+ } else if (w == 4) {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ } else {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ xx_storel_32(&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
new file mode 100644
index 0000000000..89d7199f48
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <smmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(const uint16_t *src,
+ int src_stride, uint16_t *dst0,
+ int dst_stride0, int w, int h,
+ ConvolveParams *conv_params,
+ int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i zero = _mm_setzero_si128();
+ int i, j;
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const __m128i offset_const_16b = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits <= 4);
+
+ if (!(w % 8)) {
+ for (i = 0; i < h; i += 1) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i src_16bit =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+ const __m128i res = _mm_sll_epi16(src_16bit, left_shift);
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
+
+ const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero);
+ const __m128i res_unsigned_lo =
+ _mm_add_epi32(res_32b_lo, offset_const);
+
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_32b_hi, offset_const);
+
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m128i res_unsigned_16b =
+ _mm_adds_epu16(res, offset_const_16b);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ }
+ }
+ }
+ } else if (!(w % 4)) {
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 4) {
+ const __m128i src_row_0 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
+ const __m128i src_row_1 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
+ const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1);
+
+ const __m128i res = _mm_sll_epi16(src_10, left_shift);
+
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i res_32b = _mm_unpacklo_epi16(res, zero);
+ const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const);
+
+ const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_32b_hi, offset_const);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_1 = _mm_srli_si128(res_clip, 8);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m128i res_unsigned_16b =
+ _mm_adds_epu16(res, offset_const_16b);
+
+ const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_2d_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+
+ const __m128i res_unsigned_lo =
+ _mm_add_epi32(res_lo_round, offset_const);
+
+ if (w < 8) {
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);
+
+ const __m128i comp_avg_res =
+ highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = highbd_convolve_rounding_sse2(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result, round_result);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m128i res_16b =
+ _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ } else {
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_hi_round, offset_const);
+
+ if (do_average) {
+ const __m128i data_lo =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_hi =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4]));
+
+ const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
+ const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);
+
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_lo =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m128i res_16b =
+ _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
new file mode 100644
index 0000000000..88974ba260
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "av1/common/convolve.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+
+void av1_highbd_convolve_2d_sr_ssse3(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ const __m128i round_const_x = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m128i round_const_y =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+
+ if (filter_params_x->taps == 12) {
+ __m128i coeffs_x[6], coeffs_y[6], s[24];
+ prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+ const __m128i row02 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+ s[4] = _mm_alignr_epi8(row02, row01, 0);
+ s[5] = _mm_alignr_epi8(row02, row01, 4);
+
+ __m128i res_even = convolve_12tap(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+ s[4] = _mm_alignr_epi8(row02, row01, 2);
+ s[5] = _mm_alignr_epi8(row02, row01, 6);
+
+ __m128i res_odd = convolve_12tap(s, coeffs_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm_store_si128((__m128i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
+ __m128i s7 = _mm_loadu_si128((__m128i *)(im_block + 7 * im_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(im_block + 8 * im_stride));
+ __m128i s9 = _mm_loadu_si128((__m128i *)(im_block + 9 * im_stride));
+ __m128i s10 = _mm_loadu_si128((__m128i *)(im_block + 10 * im_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[4] = _mm_unpacklo_epi16(s8, s9);
+
+ s[6] = _mm_unpackhi_epi16(s0, s1);
+ s[7] = _mm_unpackhi_epi16(s2, s3);
+ s[8] = _mm_unpackhi_epi16(s4, s5);
+ s[9] = _mm_unpackhi_epi16(s6, s7);
+ s[10] = _mm_unpackhi_epi16(s8, s9);
+
+ s[12] = _mm_unpacklo_epi16(s1, s2);
+ s[13] = _mm_unpacklo_epi16(s3, s4);
+ s[14] = _mm_unpacklo_epi16(s5, s6);
+ s[15] = _mm_unpacklo_epi16(s7, s8);
+ s[16] = _mm_unpacklo_epi16(s9, s10);
+
+ s[18] = _mm_unpackhi_epi16(s1, s2);
+ s[19] = _mm_unpackhi_epi16(s3, s4);
+ s[20] = _mm_unpackhi_epi16(s5, s6);
+ s[21] = _mm_unpackhi_epi16(s7, s8);
+ s[22] = _mm_unpackhi_epi16(s9, s10);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * im_stride));
+ __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * im_stride));
+
+ s[5] = _mm_unpacklo_epi16(s10, s11);
+ s[11] = _mm_unpackhi_epi16(s10, s11);
+
+ s[17] = _mm_unpacklo_epi16(s11, s12);
+ s[23] = _mm_unpackhi_epi16(s11, s12);
+
+ const __m128i res_a0 = convolve_12tap(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a0, round_const_y), round_shift_y);
+ res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y);
+ __m128i res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a1, round_const_y), round_shift_y);
+ res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_y), round_shift_y);
+ res_b_round0 =
+ _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits),
+ round_shift_bits);
+
+ const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_y), round_shift_y);
+ res_b_round1 =
+ _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits),
+ round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ *((int *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((int *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+ s[3] = s[4];
+ s[4] = s[5];
+
+ s[6] = s[7];
+ s[7] = s[8];
+ s[8] = s[9];
+ s[9] = s[10];
+ s[10] = s[11];
+
+ s[12] = s[13];
+ s[13] = s[14];
+ s[14] = s[15];
+ s[15] = s[16];
+ s[16] = s[17];
+
+ s[18] = s[19];
+ s[19] = s[20];
+ s[20] = s[21];
+ s[21] = s[22];
+ s[22] = s[23];
+
+ s10 = s12;
+ }
+ }
+ }
+ } else {
+ __m128i coeffs_x[4], coeffs_y[4], s[16];
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm_store_si128((__m128i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a0, round_const_y), round_shift_y);
+ res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a1, round_const_y), round_shift_y);
+ res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_y), round_shift_y);
+ res_b_round0 =
+ _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits),
+ round_shift_bits);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_y), round_shift_y);
+ res_b_round1 =
+ _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits),
+ round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ *((int *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((int *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
new file mode 100644
index 0000000000..cbfe5614c3
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -0,0 +1,4239 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+// Note:
+// Total 32x4 registers to represent 32x32 block coefficients.
+// For high bit depth, each coefficient is 4-byte.
+// Each __m256i register holds 8 coefficients.
+// So each "row" we needs 4 register. Totally 32 rows
+// Register layout:
+// v0, v1, v2, v3,
+// v4, v5, v6, v7,
+// ... ...
+// v124, v125, v126, v127
+
+static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
+ __m256i clamped, mask;
+
+ mask = _mm256_cmpgt_epi16(u, max);
+ clamped = _mm256_andnot_si256(mask, u);
+ mask = _mm256_and_si256(mask, max);
+ clamped = _mm256_or_si256(mask, clamped);
+ mask = _mm256_cmpgt_epi16(clamped, zero);
+ clamped = _mm256_and_si256(clamped, mask);
+
+ return clamped;
+}
+
+static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) {
+ if (shift != 0) {
+ __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
+ in[0] = _mm256_add_epi32(in[0], rnding);
+ in[1] = _mm256_add_epi32(in[1], rnding);
+ in[2] = _mm256_add_epi32(in[2], rnding);
+ in[3] = _mm256_add_epi32(in[3], rnding);
+
+ in[0] = _mm256_srai_epi32(in[0], shift);
+ in[1] = _mm256_srai_epi32(in[1], shift);
+ in[2] = _mm256_srai_epi32(in[2], shift);
+ in[3] = _mm256_srai_epi32(in[3], shift);
+ }
+}
+
+static INLINE void round_shift_8x8_avx2(__m256i *in, int shift) {
+ round_shift_4x4_avx2(in, shift);
+ round_shift_4x4_avx2(in + 4, shift);
+ round_shift_4x4_avx2(in + 8, shift);
+ round_shift_4x4_avx2(in + 12, shift);
+}
+
+static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi, int size) {
+ __m256i a0, a1;
+ for (int i = 0; i < size; i += 4) {
+ a0 = _mm256_max_epi32(in[i], *clamp_lo);
+ out[i] = _mm256_min_epi32(a0, *clamp_hi);
+
+ a1 = _mm256_max_epi32(in[i + 1], *clamp_lo);
+ out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
+
+ a0 = _mm256_max_epi32(in[i + 2], *clamp_lo);
+ out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
+
+ a1 = _mm256_max_epi32(in[i + 3], *clamp_lo);
+ out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
+ }
+}
+
+static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
+ __m256i res0, __m256i res1,
+ const int bd) {
+ __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
+ __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
+
+ x0 = _mm256_add_epi32(res0, x0);
+ x1 = _mm256_add_epi32(res1, x1);
+ x0 = _mm256_packus_epi32(x0, x1);
+ x0 = _mm256_permute4x64_epi64(x0, 0xd8);
+ x0 = highbd_clamp_epi16_avx2(x0, bd);
+ return x0;
+}
+
+static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
+ __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
+
+ _mm256_storeu_si256((__m256i *)(output + i * stride), u);
+ }
+}
+static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res,
+ const int bd) {
+ __m256i x0 = pred;
+ x0 = _mm256_add_epi32(res, x0);
+ x0 = _mm256_packus_epi32(x0, x0);
+ x0 = _mm256_permute4x64_epi64(x0, 0xd8);
+ x0 = highbd_clamp_epi16_avx2(x0, bd);
+ return x0;
+}
+
+static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output,
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;
+ __m128i temp;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ temp = _mm_loadu_si128((__m128i const *)(output + i * stride));
+ __m256i v = _mm256_cvtepi16_epi32(temp);
+ __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd);
+ __m128i u1 = _mm256_castsi256_si128(u);
+ _mm_storeu_si128((__m128i *)(output + i * stride), u1);
+ }
+}
+static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
+ __m256i *out1, const __m256i *clamp_lo,
+ const __m256i *clamp_hi, int shift) {
+ __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
+ __m256i a0 = _mm256_add_epi32(offset, in0);
+ __m256i a1 = _mm256_sub_epi32(offset, in1);
+
+ a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+ a0 = _mm256_max_epi32(a0, *clamp_lo);
+ a0 = _mm256_min_epi32(a0, *clamp_hi);
+ a1 = _mm256_max_epi32(a1, *clamp_lo);
+ a1 = _mm256_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i x0, x1;
+
+ u0 = _mm256_unpacklo_epi32(in[0], in[1]);
+ u1 = _mm256_unpackhi_epi32(in[0], in[1]);
+
+ u2 = _mm256_unpacklo_epi32(in[2], in[3]);
+ u3 = _mm256_unpackhi_epi32(in[2], in[3]);
+
+ u4 = _mm256_unpacklo_epi32(in[4], in[5]);
+ u5 = _mm256_unpackhi_epi32(in[4], in[5]);
+
+ u6 = _mm256_unpacklo_epi32(in[6], in[7]);
+ u7 = _mm256_unpackhi_epi32(in[6], in[7]);
+
+ x0 = _mm256_unpacklo_epi64(u0, u2);
+ x1 = _mm256_unpacklo_epi64(u4, u6);
+ out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u0, u2);
+ x1 = _mm256_unpackhi_epi64(u4, u6);
+ out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpacklo_epi64(u1, u3);
+ x1 = _mm256_unpacklo_epi64(u5, u7);
+ out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u1, u3);
+ x1 = _mm256_unpackhi_epi64(u5, u7);
+ out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
+}
+
+static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i x0, x1;
+
+ u0 = _mm256_unpacklo_epi32(in[7], in[6]);
+ u1 = _mm256_unpackhi_epi32(in[7], in[6]);
+
+ u2 = _mm256_unpacklo_epi32(in[5], in[4]);
+ u3 = _mm256_unpackhi_epi32(in[5], in[4]);
+
+ u4 = _mm256_unpacklo_epi32(in[3], in[2]);
+ u5 = _mm256_unpackhi_epi32(in[3], in[2]);
+
+ u6 = _mm256_unpacklo_epi32(in[1], in[0]);
+ u7 = _mm256_unpackhi_epi32(in[1], in[0]);
+
+ x0 = _mm256_unpacklo_epi64(u0, u2);
+ x1 = _mm256_unpacklo_epi64(u4, u6);
+ out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u0, u2);
+ x1 = _mm256_unpackhi_epi64(u4, u6);
+ out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpacklo_epi64(u1, u3);
+ x1 = _mm256_unpacklo_epi64(u5, u7);
+ out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u1, u3);
+ x1 = _mm256_unpackhi_epi64(u5, u7);
+ out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
+}
+
+static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+ __m256i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
+ }
+}
+
+static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
+ const __m256i *rounding, int bit) {
+ __m256i x;
+ x = _mm256_mullo_epi32(*w0, *n0);
+ x = _mm256_add_epi32(x, *rounding);
+ x = _mm256_srai_epi32(x, bit);
+ return x;
+}
+
+static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
+ const __m256i *w1, const __m256i *n1,
+ const __m256i *rounding, int bit) {
+ __m256i x, y;
+
+ x = _mm256_mullo_epi32(*w0, *n0);
+ y = _mm256_mullo_epi32(*w1, *n1);
+ x = _mm256_add_epi32(x, y);
+ x = _mm256_add_epi32(x, *rounding);
+ x = _mm256_srai_epi32(x, bit);
+ return x;
+}
+
+static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
+ __m256i *out1, const __m256i *clamp_lo,
+ const __m256i *clamp_hi) {
+ __m256i a0 = _mm256_add_epi32(in0, in1);
+ __m256i a1 = _mm256_sub_epi32(in0, in1);
+
+ a0 = _mm256_max_epi32(a0, *clamp_lo);
+ a0 = _mm256_min_epi32(a0, *clamp_hi);
+ a1 = _mm256_max_epi32(a1, *clamp_lo);
+ a1 = _mm256_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static INLINE void idct32_stage4_avx2(
+ __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
+ const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
+ const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+ bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+ bf1[17] = temp1;
+
+ temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+ bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+ bf1[18] = temp2;
+
+ temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+ bf1[21] = temp1;
+
+ temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+ bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+ bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_avx2(
+ __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
+ const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
+ const __m256i *clamp_hi, const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+ bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+ bf1[9] = temp1;
+
+ temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+ bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+ bf1[10] = temp2;
+
+ addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_avx2(
+ __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
+ const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
+ const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[5] = temp1;
+
+ addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+ bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+ bf1[18] = temp1;
+ temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+ bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+ bf1[19] = temp2;
+ temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+ bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
+ const __m256i *cospi32,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[10] = temp1;
+ temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[11] = temp2;
+
+ addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
+ const __m256i *cospi32,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+ temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[22] = temp1;
+ temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
+ const int do_cols, const int bd,
+ const int out_shift,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi) {
+ addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8_avx2(out, out_shift);
+ round_shift_8x8_avx2(out + 16, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+ }
+}
+
+static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i x;
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ x = _mm256_mullo_epi32(in[0], cospi32);
+ x = _mm256_add_epi32(x, rounding);
+ x = _mm256_srai_epi32(x, bit);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+ clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ x = _mm256_add_epi32(offset, x);
+ x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ }
+ x = _mm256_max_epi32(x, clamp_lo);
+ x = _mm256_min_epi32(x, clamp_hi);
+ out[0] = x;
+ out[1] = x;
+ out[2] = x;
+ out[3] = x;
+ out[4] = x;
+ out[5] = x;
+ out[6] = x;
+ out[7] = x;
+ out[8] = x;
+ out[9] = x;
+ out[10] = x;
+ out[11] = x;
+ out[12] = x;
+ out[13] = x;
+ out[14] = x;
+ out[15] = x;
+ out[16] = x;
+ out[17] = x;
+ out[18] = x;
+ out[19] = x;
+ out[20] = x;
+ out[21] = x;
+ out[22] = x;
+ out[23] = x;
+ out[24] = x;
+ out[25] = x;
+ out[26] = x;
+ out[27] = x;
+ out[28] = x;
+ out[29] = x;
+ out[30] = x;
+ out[31] = x;
+}
+
+static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32];
+
+ {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2
+ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+ bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+ bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+
+ bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+ bf1[17] = bf1[16];
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4
+ bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+
+ bf1[9] = bf1[8];
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+
+static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32];
+
+ {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2
+ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+ bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
+ bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
+ bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
+ bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
+ bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+ bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
+ bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
+ bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
+ bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
+ bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+ bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
+ bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
+ bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
+ bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
+ bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+
+ addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+ bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
+ bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
+
+ addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
+ bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
+
+ addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+
+static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+ int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32], bf0[32];
+
+ {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
+
+ // stage 2
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = bf1[4];
+ bf0[5] = bf1[5];
+ bf0[6] = bf1[6];
+ bf0[7] = bf1[7];
+ bf0[8] = bf1[8];
+ bf0[9] = bf1[9];
+ bf0[10] = bf1[10];
+ bf0[11] = bf1[11];
+ bf0[12] = bf1[12];
+ bf0[13] = bf1[13];
+ bf0[14] = bf1[14];
+ bf0[15] = bf1[15];
+ bf0[16] =
+ half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+ bf0[17] =
+ half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+ bf0[31] =
+ half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
+
+ // stage 3
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] =
+ half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+ bf1[9] =
+ half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+ bf1[15] =
+ half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
+
+ addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] =
+ half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+ bf0[5] =
+ half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+ bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
+
+ addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+ bf0[16] = bf1[16];
+ bf0[17] =
+ half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] =
+ half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] =
+ half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] =
+ half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+ bf0[31] = bf1[31];
+
+ // stage 5
+ bf1[0] =
+ half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+ bf1[1] =
+ half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+ bf1[2] =
+ half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+ bf1[3] =
+ half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+ addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] =
+ half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] =
+ half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+ bf1[15] = bf0[15];
+ addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+ bf0[4] = bf1[4];
+ bf0[5] =
+ half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[7] = bf1[7];
+ addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] =
+ half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] =
+ half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7
+ addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] =
+ half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+ // stage 8
+ addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] =
+ half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9
+ addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8_avx2(out, out_shift);
+ round_shift_8x8_avx2(out + 16, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+ }
+ }
+}
+static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ in[0] = _mm256_mullo_epi32(in[0], cospi32);
+ in[0] = _mm256_add_epi32(in[0], rnding);
+ in[0] = _mm256_srai_epi32(in[0], bit);
+
+ // stage 5
+ // stage 6
+ // stage 7
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+ in[0] = _mm256_add_epi32(in[0], offset);
+ in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
+ }
+ in[0] = _mm256_max_epi32(in[0], clamp_lo);
+ in[0] = _mm256_min_epi32(in[0], clamp_hi);
+ out[0] = in[0];
+ out[1] = in[0];
+ out[2] = in[0];
+ out[3] = in[0];
+ out[4] = in[0];
+ out[5] = in[0];
+ out[6] = in[0];
+ out[7] = in[0];
+ out[8] = in[0];
+ out[9] = in[0];
+ out[10] = in[0];
+ out[11] = in[0];
+ out[12] = in[0];
+ out[13] = in[0];
+ out[14] = in[0];
+ out[15] = in[0];
+ }
+}
+
+static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ u[0] = in[0];
+ u[2] = in[4];
+ u[4] = in[2];
+ u[6] = in[6];
+ u[8] = in[1];
+ u[10] = in[5];
+ u[12] = in[3];
+ u[14] = in[7];
+
+ // stage 2
+ u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+
+ u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
+ u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
+
+ u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
+ u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
+
+ u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
+
+ // stage 3
+ u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
+ u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
+ u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
+
+ addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ x = _mm256_mullo_epi32(u[0], cospi32);
+ u[0] = _mm256_add_epi32(x, rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+ u[1] = u[0];
+
+ u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
+ u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
+
+ addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+ x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = x;
+ y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = y;
+
+ // stage 5
+ addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ x = _mm256_mullo_epi32(u[5], cospi32);
+ y = _mm256_mullo_epi32(u[6], cospi32);
+ u[5] = _mm256_sub_epi32(y, x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_add_epi32(y, x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+ x = _mm256_mullo_epi32(u[10], cospi32);
+ y = _mm256_mullo_epi32(u[13], cospi32);
+ u[10] = _mm256_sub_epi32(y, x);
+ u[10] = _mm256_add_epi32(u[10], rnding);
+ u[10] = _mm256_srai_epi32(u[10], bit);
+
+ u[13] = _mm256_add_epi32(x, y);
+ u[13] = _mm256_add_epi32(u[13], rnding);
+ u[13] = _mm256_srai_epi32(u[13], bit);
+
+ x = _mm256_mullo_epi32(u[11], cospi32);
+ y = _mm256_mullo_epi32(u[12], cospi32);
+ u[11] = _mm256_sub_epi32(y, x);
+ u[11] = _mm256_add_epi32(u[11], rnding);
+ u[11] = _mm256_srai_epi32(u[11], bit);
+
+ u[12] = _mm256_add_epi32(x, y);
+ u[12] = _mm256_add_epi32(u[12], rnding);
+ u[12] = _mm256_srai_epi32(u[12], bit);
+ // stage 7
+ addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8_avx2(out, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+ }
+ }
+}
+
+static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+ int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u[16], v[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ u[0] = in[0];
+ u[1] = in[8];
+ u[2] = in[4];
+ u[3] = in[12];
+ u[4] = in[2];
+ u[5] = in[10];
+ u[6] = in[6];
+ u[7] = in[14];
+ u[8] = in[1];
+ u[9] = in[9];
+ u[10] = in[5];
+ u[11] = in[13];
+ u[12] = in[3];
+ u[13] = in[11];
+ u[14] = in[7];
+ u[15] = in[15];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
+ v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
+ v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
+ v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
+
+ // stage 3
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
+ u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
+ addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ x = _mm256_mullo_epi32(u[0], cospi32);
+ y = _mm256_mullo_epi32(u[1], cospi32);
+ v[0] = _mm256_add_epi32(x, y);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ v[0] = _mm256_srai_epi32(v[0], bit);
+
+ v[1] = _mm256_sub_epi32(x, y);
+ v[1] = _mm256_add_epi32(v[1], rnding);
+ v[1] = _mm256_srai_epi32(v[1], bit);
+
+ v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
+ v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
+ addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ v[15] = u[15];
+
+ // stage 5
+ addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+ u[4] = v[4];
+
+ x = _mm256_mullo_epi32(v[5], cospi32);
+ y = _mm256_mullo_epi32(v[6], cospi32);
+ u[5] = _mm256_sub_epi32(y, x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_add_epi32(y, x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm256_mullo_epi32(u[10], cospi32);
+ y = _mm256_mullo_epi32(u[13], cospi32);
+ v[10] = _mm256_sub_epi32(y, x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_add_epi32(x, y);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ x = _mm256_mullo_epi32(u[11], cospi32);
+ y = _mm256_mullo_epi32(u[12], cospi32);
+ v[11] = _mm256_sub_epi32(y, x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_add_epi32(x, y);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 7
+ addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8_avx2(out, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+ }
+ }
+}
+
+static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i v[16], x, y, temp1, temp2;
+
+ // Calculate the column 0, 1, 2, 3
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ x = _mm256_mullo_epi32(in[0], cospi62);
+ v[0] = _mm256_add_epi32(x, rnding);
+ v[0] = _mm256_srai_epi32(v[0], bit);
+
+ x = _mm256_mullo_epi32(in[0], cospi2);
+ v[1] = _mm256_sub_epi32(zero, x);
+ v[1] = _mm256_add_epi32(v[1], rnding);
+ v[1] = _mm256_srai_epi32(v[1], bit);
+
+ // stage 3
+ v[8] = v[0];
+ v[9] = v[1];
+
+ // stage 4
+ temp1 = _mm256_mullo_epi32(v[8], cospi8);
+ x = _mm256_mullo_epi32(v[9], cospi56);
+ temp1 = _mm256_add_epi32(temp1, x);
+ temp1 = _mm256_add_epi32(temp1, rnding);
+ temp1 = _mm256_srai_epi32(temp1, bit);
+
+ temp2 = _mm256_mullo_epi32(v[8], cospi56);
+ x = _mm256_mullo_epi32(v[9], cospi8);
+ temp2 = _mm256_sub_epi32(temp2, x);
+ temp2 = _mm256_add_epi32(temp2, rnding);
+ temp2 = _mm256_srai_epi32(temp2, bit);
+ v[8] = temp1;
+ v[9] = temp2;
+
+ // stage 5
+ v[4] = v[0];
+ v[5] = v[1];
+ v[12] = v[8];
+ v[13] = v[9];
+
+ // stage 6
+ temp1 = _mm256_mullo_epi32(v[4], cospi16);
+ x = _mm256_mullo_epi32(v[5], cospi48);
+ temp1 = _mm256_add_epi32(temp1, x);
+ temp1 = _mm256_add_epi32(temp1, rnding);
+ temp1 = _mm256_srai_epi32(temp1, bit);
+
+ temp2 = _mm256_mullo_epi32(v[4], cospi48);
+ x = _mm256_mullo_epi32(v[5], cospi16);
+ temp2 = _mm256_sub_epi32(temp2, x);
+ temp2 = _mm256_add_epi32(temp2, rnding);
+ temp2 = _mm256_srai_epi32(temp2, bit);
+ v[4] = temp1;
+ v[5] = temp2;
+
+ temp1 = _mm256_mullo_epi32(v[12], cospi16);
+ x = _mm256_mullo_epi32(v[13], cospi48);
+ temp1 = _mm256_add_epi32(temp1, x);
+ temp1 = _mm256_add_epi32(temp1, rnding);
+ temp1 = _mm256_srai_epi32(temp1, bit);
+
+ temp2 = _mm256_mullo_epi32(v[12], cospi48);
+ x = _mm256_mullo_epi32(v[13], cospi16);
+ temp2 = _mm256_sub_epi32(temp2, x);
+ temp2 = _mm256_add_epi32(temp2, rnding);
+ temp2 = _mm256_srai_epi32(temp2, bit);
+ v[12] = temp1;
+ v[13] = temp2;
+
+ // stage 7
+ v[2] = v[0];
+ v[3] = v[1];
+ v[6] = v[4];
+ v[7] = v[5];
+ v[10] = v[8];
+ v[11] = v[9];
+ v[14] = v[12];
+ v[15] = v[13];
+
+ // stage 8
+ y = _mm256_mullo_epi32(v[2], cospi32);
+ x = _mm256_mullo_epi32(v[3], cospi32);
+ v[2] = _mm256_add_epi32(y, x);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_sub_epi32(y, x);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ y = _mm256_mullo_epi32(v[6], cospi32);
+ x = _mm256_mullo_epi32(v[7], cospi32);
+ v[6] = _mm256_add_epi32(y, x);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_sub_epi32(y, x);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ y = _mm256_mullo_epi32(v[10], cospi32);
+ x = _mm256_mullo_epi32(v[11], cospi32);
+ v[10] = _mm256_add_epi32(y, x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_sub_epi32(y, x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ y = _mm256_mullo_epi32(v[14], cospi32);
+ x = _mm256_mullo_epi32(v[15], cospi32);
+ v[14] = _mm256_add_epi32(y, x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_sub_epi32(y, x);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
+ out[2] = v[12];
+ out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
+ out[4] = v[6];
+ out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
+ out[6] = v[10];
+ out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
+ out[8] = v[3];
+ out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
+ out[10] = v[15];
+ out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
+ out[12] = v[5];
+ out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
+ out[14] = v[9];
+ out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
+static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ __m256i zero = _mm256_setzero_si256();
+ x = _mm256_mullo_epi32(in[0], cospi62);
+ u[0] = _mm256_add_epi32(x, rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+
+ x = _mm256_mullo_epi32(in[0], cospi2);
+ u[1] = _mm256_sub_epi32(zero, x);
+ u[1] = _mm256_add_epi32(u[1], rnding);
+ u[1] = _mm256_srai_epi32(u[1], bit);
+
+ x = _mm256_mullo_epi32(in[2], cospi54);
+ u[2] = _mm256_add_epi32(x, rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ x = _mm256_mullo_epi32(in[2], cospi10);
+ u[3] = _mm256_sub_epi32(zero, x);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+
+ x = _mm256_mullo_epi32(in[4], cospi46);
+ u[4] = _mm256_add_epi32(x, rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ x = _mm256_mullo_epi32(in[4], cospi18);
+ u[5] = _mm256_sub_epi32(zero, x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ x = _mm256_mullo_epi32(in[6], cospi38);
+ u[6] = _mm256_add_epi32(x, rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ x = _mm256_mullo_epi32(in[6], cospi26);
+ u[7] = _mm256_sub_epi32(zero, x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ u[8] = _mm256_mullo_epi32(in[7], cospi34);
+ u[8] = _mm256_add_epi32(u[8], rnding);
+ u[8] = _mm256_srai_epi32(u[8], bit);
+
+ u[9] = _mm256_mullo_epi32(in[7], cospi30);
+ u[9] = _mm256_add_epi32(u[9], rnding);
+ u[9] = _mm256_srai_epi32(u[9], bit);
+
+ u[10] = _mm256_mullo_epi32(in[5], cospi42);
+ u[10] = _mm256_add_epi32(u[10], rnding);
+ u[10] = _mm256_srai_epi32(u[10], bit);
+
+ u[11] = _mm256_mullo_epi32(in[5], cospi22);
+ u[11] = _mm256_add_epi32(u[11], rnding);
+ u[11] = _mm256_srai_epi32(u[11], bit);
+
+ u[12] = _mm256_mullo_epi32(in[3], cospi50);
+ u[12] = _mm256_add_epi32(u[12], rnding);
+ u[12] = _mm256_srai_epi32(u[12], bit);
+
+ u[13] = _mm256_mullo_epi32(in[3], cospi14);
+ u[13] = _mm256_add_epi32(u[13], rnding);
+ u[13] = _mm256_srai_epi32(u[13], bit);
+
+ u[14] = _mm256_mullo_epi32(in[1], cospi58);
+ u[14] = _mm256_add_epi32(u[14], rnding);
+ u[14] = _mm256_srai_epi32(u[14], bit);
+
+ u[15] = _mm256_mullo_epi32(in[1], cospi6);
+ u[15] = _mm256_add_epi32(u[15], rnding);
+ u[15] = _mm256_srai_epi32(u[15], bit);
+
+ // stage 3
+ addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ y = _mm256_mullo_epi32(u[8], cospi56);
+ x = _mm256_mullo_epi32(u[9], cospi56);
+ u[8] = _mm256_mullo_epi32(u[8], cospi8);
+ u[8] = _mm256_add_epi32(u[8], x);
+ u[8] = _mm256_add_epi32(u[8], rnding);
+ u[8] = _mm256_srai_epi32(u[8], bit);
+
+ x = _mm256_mullo_epi32(u[9], cospi8);
+ u[9] = _mm256_sub_epi32(y, x);
+ u[9] = _mm256_add_epi32(u[9], rnding);
+ u[9] = _mm256_srai_epi32(u[9], bit);
+
+ x = _mm256_mullo_epi32(u[11], cospi24);
+ y = _mm256_mullo_epi32(u[10], cospi24);
+ u[10] = _mm256_mullo_epi32(u[10], cospi40);
+ u[10] = _mm256_add_epi32(u[10], x);
+ u[10] = _mm256_add_epi32(u[10], rnding);
+ u[10] = _mm256_srai_epi32(u[10], bit);
+
+ x = _mm256_mullo_epi32(u[11], cospi40);
+ u[11] = _mm256_sub_epi32(y, x);
+ u[11] = _mm256_add_epi32(u[11], rnding);
+ u[11] = _mm256_srai_epi32(u[11], bit);
+
+ x = _mm256_mullo_epi32(u[13], cospi8);
+ y = _mm256_mullo_epi32(u[12], cospi8);
+ u[12] = _mm256_mullo_epi32(u[12], cospim56);
+ u[12] = _mm256_add_epi32(u[12], x);
+ u[12] = _mm256_add_epi32(u[12], rnding);
+ u[12] = _mm256_srai_epi32(u[12], bit);
+
+ x = _mm256_mullo_epi32(u[13], cospim56);
+ u[13] = _mm256_sub_epi32(y, x);
+ u[13] = _mm256_add_epi32(u[13], rnding);
+ u[13] = _mm256_srai_epi32(u[13], bit);
+
+ x = _mm256_mullo_epi32(u[15], cospi40);
+ y = _mm256_mullo_epi32(u[14], cospi40);
+ u[14] = _mm256_mullo_epi32(u[14], cospim24);
+ u[14] = _mm256_add_epi32(u[14], x);
+ u[14] = _mm256_add_epi32(u[14], rnding);
+ u[14] = _mm256_srai_epi32(u[14], bit);
+
+ x = _mm256_mullo_epi32(u[15], cospim24);
+ u[15] = _mm256_sub_epi32(y, x);
+ u[15] = _mm256_add_epi32(u[15], rnding);
+ u[15] = _mm256_srai_epi32(u[15], bit);
+
+ // stage 5
+ addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ x = _mm256_mullo_epi32(u[5], cospi48);
+ y = _mm256_mullo_epi32(u[4], cospi48);
+ u[4] = _mm256_mullo_epi32(u[4], cospi16);
+ u[4] = _mm256_add_epi32(u[4], x);
+ u[4] = _mm256_add_epi32(u[4], rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ x = _mm256_mullo_epi32(u[5], cospi16);
+ u[5] = _mm256_sub_epi32(y, x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ x = _mm256_mullo_epi32(u[7], cospi16);
+ y = _mm256_mullo_epi32(u[6], cospi16);
+ u[6] = _mm256_mullo_epi32(u[6], cospim48);
+ u[6] = _mm256_add_epi32(u[6], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ x = _mm256_mullo_epi32(u[7], cospim48);
+ u[7] = _mm256_sub_epi32(y, x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ x = _mm256_mullo_epi32(u[13], cospi48);
+ y = _mm256_mullo_epi32(u[12], cospi48);
+ u[12] = _mm256_mullo_epi32(u[12], cospi16);
+ u[12] = _mm256_add_epi32(u[12], x);
+ u[12] = _mm256_add_epi32(u[12], rnding);
+ u[12] = _mm256_srai_epi32(u[12], bit);
+
+ x = _mm256_mullo_epi32(u[13], cospi16);
+ u[13] = _mm256_sub_epi32(y, x);
+ u[13] = _mm256_add_epi32(u[13], rnding);
+ u[13] = _mm256_srai_epi32(u[13], bit);
+
+ x = _mm256_mullo_epi32(u[15], cospi16);
+ y = _mm256_mullo_epi32(u[14], cospi16);
+ u[14] = _mm256_mullo_epi32(u[14], cospim48);
+ u[14] = _mm256_add_epi32(u[14], x);
+ u[14] = _mm256_add_epi32(u[14], rnding);
+ u[14] = _mm256_srai_epi32(u[14], bit);
+
+ x = _mm256_mullo_epi32(u[15], cospim48);
+ u[15] = _mm256_sub_epi32(y, x);
+ u[15] = _mm256_add_epi32(u[15], rnding);
+ u[15] = _mm256_srai_epi32(u[15], bit);
+
+ // stage 7
+ addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8
+ y = _mm256_mullo_epi32(u[2], cospi32);
+ x = _mm256_mullo_epi32(u[3], cospi32);
+ u[2] = _mm256_add_epi32(y, x);
+ u[2] = _mm256_add_epi32(u[2], rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ u[3] = _mm256_sub_epi32(y, x);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+ y = _mm256_mullo_epi32(u[6], cospi32);
+ x = _mm256_mullo_epi32(u[7], cospi32);
+ u[6] = _mm256_add_epi32(y, x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = _mm256_sub_epi32(y, x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ y = _mm256_mullo_epi32(u[10], cospi32);
+ x = _mm256_mullo_epi32(u[11], cospi32);
+ u[10] = _mm256_add_epi32(y, x);
+ u[10] = _mm256_add_epi32(u[10], rnding);
+ u[10] = _mm256_srai_epi32(u[10], bit);
+
+ u[11] = _mm256_sub_epi32(y, x);
+ u[11] = _mm256_add_epi32(u[11], rnding);
+ u[11] = _mm256_srai_epi32(u[11], bit);
+
+ y = _mm256_mullo_epi32(u[14], cospi32);
+ x = _mm256_mullo_epi32(u[15], cospi32);
+ u[14] = _mm256_add_epi32(y, x);
+ u[14] = _mm256_add_epi32(u[14], rnding);
+ u[14] = _mm256_srai_epi32(u[14], bit);
+
+ u[15] = _mm256_sub_epi32(y, x);
+ u[15] = _mm256_add_epi32(u[15], rnding);
+ u[15] = _mm256_srai_epi32(u[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
+ out[2] = u[12];
+ out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
+ out[4] = u[6];
+ out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
+ out[6] = u[10];
+ out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
+ out[8] = u[3];
+ out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
+ out[10] = u[15];
+ out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
+ out[12] = u[5];
+ out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
+ out[14] = u[9];
+ out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
+static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u[16], v[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ v[0] = _mm256_mullo_epi32(in[15], cospi2);
+ x = _mm256_mullo_epi32(in[0], cospi62);
+ v[0] = _mm256_add_epi32(v[0], x);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ v[0] = _mm256_srai_epi32(v[0], bit);
+
+ v[1] = _mm256_mullo_epi32(in[15], cospi62);
+ x = _mm256_mullo_epi32(in[0], cospi2);
+ v[1] = _mm256_sub_epi32(v[1], x);
+ v[1] = _mm256_add_epi32(v[1], rnding);
+ v[1] = _mm256_srai_epi32(v[1], bit);
+
+ v[2] = _mm256_mullo_epi32(in[13], cospi10);
+ x = _mm256_mullo_epi32(in[2], cospi54);
+ v[2] = _mm256_add_epi32(v[2], x);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_mullo_epi32(in[13], cospi54);
+ x = _mm256_mullo_epi32(in[2], cospi10);
+ v[3] = _mm256_sub_epi32(v[3], x);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ v[4] = _mm256_mullo_epi32(in[11], cospi18);
+ x = _mm256_mullo_epi32(in[4], cospi46);
+ v[4] = _mm256_add_epi32(v[4], x);
+ v[4] = _mm256_add_epi32(v[4], rnding);
+ v[4] = _mm256_srai_epi32(v[4], bit);
+
+ v[5] = _mm256_mullo_epi32(in[11], cospi46);
+ x = _mm256_mullo_epi32(in[4], cospi18);
+ v[5] = _mm256_sub_epi32(v[5], x);
+ v[5] = _mm256_add_epi32(v[5], rnding);
+ v[5] = _mm256_srai_epi32(v[5], bit);
+
+ v[6] = _mm256_mullo_epi32(in[9], cospi26);
+ x = _mm256_mullo_epi32(in[6], cospi38);
+ v[6] = _mm256_add_epi32(v[6], x);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_mullo_epi32(in[9], cospi38);
+ x = _mm256_mullo_epi32(in[6], cospi26);
+ v[7] = _mm256_sub_epi32(v[7], x);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ v[8] = _mm256_mullo_epi32(in[7], cospi34);
+ x = _mm256_mullo_epi32(in[8], cospi30);
+ v[8] = _mm256_add_epi32(v[8], x);
+ v[8] = _mm256_add_epi32(v[8], rnding);
+ v[8] = _mm256_srai_epi32(v[8], bit);
+
+ v[9] = _mm256_mullo_epi32(in[7], cospi30);
+ x = _mm256_mullo_epi32(in[8], cospi34);
+ v[9] = _mm256_sub_epi32(v[9], x);
+ v[9] = _mm256_add_epi32(v[9], rnding);
+ v[9] = _mm256_srai_epi32(v[9], bit);
+
+ v[10] = _mm256_mullo_epi32(in[5], cospi42);
+ x = _mm256_mullo_epi32(in[10], cospi22);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_mullo_epi32(in[5], cospi22);
+ x = _mm256_mullo_epi32(in[10], cospi42);
+ v[11] = _mm256_sub_epi32(v[11], x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_mullo_epi32(in[3], cospi50);
+ x = _mm256_mullo_epi32(in[12], cospi14);
+ v[12] = _mm256_add_epi32(v[12], x);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ v[13] = _mm256_mullo_epi32(in[3], cospi14);
+ x = _mm256_mullo_epi32(in[12], cospi50);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[14] = _mm256_mullo_epi32(in[1], cospi58);
+ x = _mm256_mullo_epi32(in[14], cospi6);
+ v[14] = _mm256_add_epi32(v[14], x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_mullo_epi32(in[1], cospi6);
+ x = _mm256_mullo_epi32(in[14], cospi58);
+ v[15] = _mm256_sub_epi32(v[15], x);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 3
+ addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm256_mullo_epi32(u[8], cospi8);
+ x = _mm256_mullo_epi32(u[9], cospi56);
+ v[8] = _mm256_add_epi32(v[8], x);
+ v[8] = _mm256_add_epi32(v[8], rnding);
+ v[8] = _mm256_srai_epi32(v[8], bit);
+
+ v[9] = _mm256_mullo_epi32(u[8], cospi56);
+ x = _mm256_mullo_epi32(u[9], cospi8);
+ v[9] = _mm256_sub_epi32(v[9], x);
+ v[9] = _mm256_add_epi32(v[9], rnding);
+ v[9] = _mm256_srai_epi32(v[9], bit);
+
+ v[10] = _mm256_mullo_epi32(u[10], cospi40);
+ x = _mm256_mullo_epi32(u[11], cospi24);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_mullo_epi32(u[10], cospi24);
+ x = _mm256_mullo_epi32(u[11], cospi40);
+ v[11] = _mm256_sub_epi32(v[11], x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_mullo_epi32(u[12], cospim56);
+ x = _mm256_mullo_epi32(u[13], cospi8);
+ v[12] = _mm256_add_epi32(v[12], x);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ v[13] = _mm256_mullo_epi32(u[12], cospi8);
+ x = _mm256_mullo_epi32(u[13], cospim56);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[14] = _mm256_mullo_epi32(u[14], cospim24);
+ x = _mm256_mullo_epi32(u[15], cospi40);
+ v[14] = _mm256_add_epi32(v[14], x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_mullo_epi32(u[14], cospi40);
+ x = _mm256_mullo_epi32(u[15], cospim24);
+ v[15] = _mm256_sub_epi32(v[15], x);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 5
+ addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+
+ v[4] = _mm256_mullo_epi32(u[4], cospi16);
+ x = _mm256_mullo_epi32(u[5], cospi48);
+ v[4] = _mm256_add_epi32(v[4], x);
+ v[4] = _mm256_add_epi32(v[4], rnding);
+ v[4] = _mm256_srai_epi32(v[4], bit);
+
+ v[5] = _mm256_mullo_epi32(u[4], cospi48);
+ x = _mm256_mullo_epi32(u[5], cospi16);
+ v[5] = _mm256_sub_epi32(v[5], x);
+ v[5] = _mm256_add_epi32(v[5], rnding);
+ v[5] = _mm256_srai_epi32(v[5], bit);
+
+ v[6] = _mm256_mullo_epi32(u[6], cospim48);
+ x = _mm256_mullo_epi32(u[7], cospi16);
+ v[6] = _mm256_add_epi32(v[6], x);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_mullo_epi32(u[6], cospi16);
+ x = _mm256_mullo_epi32(u[7], cospim48);
+ v[7] = _mm256_sub_epi32(v[7], x);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ v[12] = _mm256_mullo_epi32(u[12], cospi16);
+ x = _mm256_mullo_epi32(u[13], cospi48);
+ v[12] = _mm256_add_epi32(v[12], x);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ v[13] = _mm256_mullo_epi32(u[12], cospi48);
+ x = _mm256_mullo_epi32(u[13], cospi16);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[14] = _mm256_mullo_epi32(u[14], cospim48);
+ x = _mm256_mullo_epi32(u[15], cospi16);
+ v[14] = _mm256_add_epi32(v[14], x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_mullo_epi32(u[14], cospi16);
+ x = _mm256_mullo_epi32(u[15], cospim48);
+ v[15] = _mm256_sub_epi32(v[15], x);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 7
+ addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8
+ v[0] = u[0];
+ v[1] = u[1];
+
+ y = _mm256_mullo_epi32(u[2], cospi32);
+ x = _mm256_mullo_epi32(u[3], cospi32);
+ v[2] = _mm256_add_epi32(y, x);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_sub_epi32(y, x);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ y = _mm256_mullo_epi32(u[6], cospi32);
+ x = _mm256_mullo_epi32(u[7], cospi32);
+ v[6] = _mm256_add_epi32(y, x);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_sub_epi32(y, x);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ y = _mm256_mullo_epi32(u[10], cospi32);
+ x = _mm256_mullo_epi32(u[11], cospi32);
+ v[10] = _mm256_add_epi32(y, x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_sub_epi32(y, x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ y = _mm256_mullo_epi32(u[14], cospi32);
+ x = _mm256_mullo_epi32(u[15], cospi32);
+ v[14] = _mm256_add_epi32(y, x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_sub_epi32(y, x);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
+ out[2] = v[12];
+ out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
+ out[4] = v[6];
+ out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
+ out[6] = v[10];
+ out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
+ out[8] = v[3];
+ out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
+ out[10] = v[15];
+ out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
+ out[12] = v[5];
+ out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
+ out[14] = v[9];
+ out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ x = _mm256_mullo_epi32(in[0], cospi32);
+ x = _mm256_add_epi32(x, rnding);
+ x = _mm256_srai_epi32(x, bit);
+
+ // stage 4
+ // stage 5
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+ clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ x = _mm256_add_epi32(x, offset);
+ x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ }
+ x = _mm256_max_epi32(x, clamp_lo);
+ x = _mm256_min_epi32(x, clamp_hi);
+ out[0] = x;
+ out[1] = x;
+ out[2] = x;
+ out[3] = x;
+ out[4] = x;
+ out[5] = x;
+ out[6] = x;
+ out[7] = x;
+}
+static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m256i x, y;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = in[0];
+ u1 = in[4];
+ u2 = in[2];
+ u3 = in[6];
+
+ x = _mm256_mullo_epi32(in[1], cospi56);
+ y = _mm256_mullo_epi32(in[7], cospim8);
+ u4 = _mm256_add_epi32(x, y);
+ u4 = _mm256_add_epi32(u4, rnding);
+ u4 = _mm256_srai_epi32(u4, bit);
+
+ x = _mm256_mullo_epi32(in[1], cospi8);
+ y = _mm256_mullo_epi32(in[7], cospi56);
+ u7 = _mm256_add_epi32(x, y);
+ u7 = _mm256_add_epi32(u7, rnding);
+ u7 = _mm256_srai_epi32(u7, bit);
+
+ x = _mm256_mullo_epi32(in[5], cospi24);
+ y = _mm256_mullo_epi32(in[3], cospim40);
+ u5 = _mm256_add_epi32(x, y);
+ u5 = _mm256_add_epi32(u5, rnding);
+ u5 = _mm256_srai_epi32(u5, bit);
+
+ x = _mm256_mullo_epi32(in[5], cospi40);
+ y = _mm256_mullo_epi32(in[3], cospi24);
+ u6 = _mm256_add_epi32(x, y);
+ u6 = _mm256_add_epi32(u6, rnding);
+ u6 = _mm256_srai_epi32(u6, bit);
+
+ // stage 3
+ x = _mm256_mullo_epi32(u0, cospi32);
+ y = _mm256_mullo_epi32(u1, cospi32);
+ v0 = _mm256_add_epi32(x, y);
+ v0 = _mm256_add_epi32(v0, rnding);
+ v0 = _mm256_srai_epi32(v0, bit);
+
+ v1 = _mm256_sub_epi32(x, y);
+ v1 = _mm256_add_epi32(v1, rnding);
+ v1 = _mm256_srai_epi32(v1, bit);
+
+ x = _mm256_mullo_epi32(u2, cospi48);
+ y = _mm256_mullo_epi32(u3, cospim16);
+ v2 = _mm256_add_epi32(x, y);
+ v2 = _mm256_add_epi32(v2, rnding);
+ v2 = _mm256_srai_epi32(v2, bit);
+
+ x = _mm256_mullo_epi32(u2, cospi16);
+ y = _mm256_mullo_epi32(u3, cospi48);
+ v3 = _mm256_add_epi32(x, y);
+ v3 = _mm256_add_epi32(v3, rnding);
+ v3 = _mm256_srai_epi32(v3, bit);
+
+ addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm256_mullo_epi32(v5, cospi32);
+ y = _mm256_mullo_epi32(v6, cospi32);
+ u6 = _mm256_add_epi32(y, x);
+ u6 = _mm256_add_epi32(u6, rnding);
+ u6 = _mm256_srai_epi32(u6, bit);
+
+ u5 = _mm256_sub_epi32(y, x);
+ u5 = _mm256_add_epi32(u5, rnding);
+ u5 = _mm256_srai_epi32(u5, bit);
+
+ addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
+ addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
+ addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
+ // stage 5
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ round_shift_4x4_avx2(out, out_shift);
+ round_shift_4x4_avx2(out + 4, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
+ }
+}
+static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i kZero = _mm256_setzero_si256();
+ __m256i u[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+
+ x = _mm256_mullo_epi32(in[0], cospi60);
+ u[0] = _mm256_add_epi32(x, rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+
+ x = _mm256_mullo_epi32(in[0], cospi4);
+ u[1] = _mm256_sub_epi32(kZero, x);
+ u[1] = _mm256_add_epi32(u[1], rnding);
+ u[1] = _mm256_srai_epi32(u[1], bit);
+
+ // stage 3
+ // stage 4
+ __m256i temp1, temp2;
+ temp1 = _mm256_mullo_epi32(u[0], cospi16);
+ x = _mm256_mullo_epi32(u[1], cospi48);
+ temp1 = _mm256_add_epi32(temp1, x);
+ temp1 = _mm256_add_epi32(temp1, rnding);
+ temp1 = _mm256_srai_epi32(temp1, bit);
+ u[4] = temp1;
+
+ temp2 = _mm256_mullo_epi32(u[0], cospi48);
+ x = _mm256_mullo_epi32(u[1], cospi16);
+ u[5] = _mm256_sub_epi32(temp2, x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ // stage 5
+ // stage 6
+ temp1 = _mm256_mullo_epi32(u[0], cospi32);
+ x = _mm256_mullo_epi32(u[1], cospi32);
+ u[2] = _mm256_add_epi32(temp1, x);
+ u[2] = _mm256_add_epi32(u[2], rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ u[3] = _mm256_sub_epi32(temp1, x);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+
+ temp1 = _mm256_mullo_epi32(u[4], cospi32);
+ x = _mm256_mullo_epi32(u[5], cospi32);
+ u[6] = _mm256_add_epi32(temp1, x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = _mm256_sub_epi32(temp1, x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm256_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm256_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm256_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm256_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+
+static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i kZero = _mm256_setzero_si256();
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u[8], v[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+
+ u[0] = _mm256_mullo_epi32(in[7], cospi4);
+ x = _mm256_mullo_epi32(in[0], cospi60);
+ u[0] = _mm256_add_epi32(u[0], x);
+ u[0] = _mm256_add_epi32(u[0], rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+
+ u[1] = _mm256_mullo_epi32(in[7], cospi60);
+ x = _mm256_mullo_epi32(in[0], cospi4);
+ u[1] = _mm256_sub_epi32(u[1], x);
+ u[1] = _mm256_add_epi32(u[1], rnding);
+ u[1] = _mm256_srai_epi32(u[1], bit);
+
+ u[2] = _mm256_mullo_epi32(in[5], cospi20);
+ x = _mm256_mullo_epi32(in[2], cospi44);
+ u[2] = _mm256_add_epi32(u[2], x);
+ u[2] = _mm256_add_epi32(u[2], rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ u[3] = _mm256_mullo_epi32(in[5], cospi44);
+ x = _mm256_mullo_epi32(in[2], cospi20);
+ u[3] = _mm256_sub_epi32(u[3], x);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+
+ u[4] = _mm256_mullo_epi32(in[3], cospi36);
+ x = _mm256_mullo_epi32(in[4], cospi28);
+ u[4] = _mm256_add_epi32(u[4], x);
+ u[4] = _mm256_add_epi32(u[4], rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ u[5] = _mm256_mullo_epi32(in[3], cospi28);
+ x = _mm256_mullo_epi32(in[4], cospi36);
+ u[5] = _mm256_sub_epi32(u[5], x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_mullo_epi32(in[1], cospi52);
+ x = _mm256_mullo_epi32(in[6], cospi12);
+ u[6] = _mm256_add_epi32(u[6], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = _mm256_mullo_epi32(in[1], cospi12);
+ x = _mm256_mullo_epi32(in[6], cospi52);
+ u[7] = _mm256_sub_epi32(u[7], x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ // stage 3
+ addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm256_mullo_epi32(v[4], cospi16);
+ x = _mm256_mullo_epi32(v[5], cospi48);
+ u[4] = _mm256_add_epi32(u[4], x);
+ u[4] = _mm256_add_epi32(u[4], rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ u[5] = _mm256_mullo_epi32(v[4], cospi48);
+ x = _mm256_mullo_epi32(v[5], cospi16);
+ u[5] = _mm256_sub_epi32(u[5], x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_mullo_epi32(v[6], cospim48);
+ x = _mm256_mullo_epi32(v[7], cospi16);
+ u[6] = _mm256_add_epi32(u[6], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = _mm256_mullo_epi32(v[6], cospi16);
+ x = _mm256_mullo_epi32(v[7], cospim48);
+ u[7] = _mm256_sub_epi32(u[7], x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm256_mullo_epi32(v[2], cospi32);
+ x = _mm256_mullo_epi32(v[3], cospi32);
+ u[2] = _mm256_add_epi32(v[0], x);
+ u[2] = _mm256_add_epi32(u[2], rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ u[3] = _mm256_sub_epi32(v[0], x);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+
+ v[0] = _mm256_mullo_epi32(v[6], cospi32);
+ x = _mm256_mullo_epi32(v[7], cospi32);
+ u[6] = _mm256_add_epi32(v[0], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = _mm256_sub_epi32(v[0], x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm256_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm256_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm256_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm256_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+static INLINE void idct64_stage8_avx2(
+ __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
+ const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
+ const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
+ const __m256i *rnding, int bit) {
+ int i;
+ __m256i temp1, temp2, temp3, temp4;
+ temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
+ u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
+ u[10] = temp1;
+ temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
+ u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
+ u[11] = temp2;
+
+ for (i = 16; i < 20; ++i) {
+ addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
+ addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
+ }
+
+ temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
+ temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
+ temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
+ temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
+ u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
+ u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
+ u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
+ u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
+ u[36] = temp1;
+ u[37] = temp2;
+ u[38] = temp3;
+ u[39] = temp4;
+
+ temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
+ temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
+ temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
+ temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
+ u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
+ u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
+ u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
+ u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
+}
+
+static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
+ const __m256i *cospi32,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi,
+ const __m256i *rnding, int bit) {
+ int i;
+ __m256i temp1, temp2, temp3, temp4;
+ for (i = 0; i < 8; ++i) {
+ addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
+ }
+
+ temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
+ temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
+ temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
+ temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
+ u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
+ u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
+ u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
+ u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
+ u[20] = temp1;
+ u[21] = temp2;
+ u[22] = temp3;
+ u[23] = temp4;
+ for (i = 32; i < 40; i++) {
+ addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {
+ addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
+ }
+}
+
+static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
+ const __m256i *cospi32,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi,
+ const __m256i *rnding, int bit) {
+ __m256i temp1, temp2, temp3, temp4;
+ for (int i = 0; i < 16; i++) {
+ addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
+ }
+
+ temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
+ temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
+ temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
+ temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
+ u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
+ u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
+ u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
+ u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
+
+ temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
+ temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
+ temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
+ temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
+ u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
+ u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
+ u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
+ u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
+ u[44] = temp1;
+ u[45] = temp2;
+ u[46] = temp3;
+ u[47] = temp4;
+}
+
+static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols,
+ int bd, int out_shift,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi) {
+ for (int i = 0; i < 32; i++) {
+ addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi);
+ }
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ round_shift_8x8_avx2(out, out_shift);
+ round_shift_8x8_avx2(out + 16, out_shift);
+ round_shift_8x8_avx2(out + 32, out_shift);
+ round_shift_8x8_avx2(out + 48, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
+ }
+}
+
+static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+
+ {
+ __m256i x;
+
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+ x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);
+
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ if (out_shift != 0) {
+ __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+ x = _mm256_add_epi32(x, offset);
+ x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ }
+ }
+ x = _mm256_max_epi32(x, clamp_lo);
+ x = _mm256_min_epi32(x, clamp_hi);
+ out[0] = x;
+ out[1] = x;
+ out[2] = x;
+ out[3] = x;
+ out[4] = x;
+ out[5] = x;
+ out[6] = x;
+ out[7] = x;
+ out[8] = x;
+ out[9] = x;
+ out[10] = x;
+ out[11] = x;
+ out[12] = x;
+ out[13] = x;
+ out[14] = x;
+ out[15] = x;
+ out[16] = x;
+ out[17] = x;
+ out[18] = x;
+ out[19] = x;
+ out[20] = x;
+ out[21] = x;
+ out[22] = x;
+ out[23] = x;
+ out[24] = x;
+ out[25] = x;
+ out[26] = x;
+ out[27] = x;
+ out[28] = x;
+ out[29] = x;
+ out[30] = x;
+ out[31] = x;
+ out[32] = x;
+ out[33] = x;
+ out[34] = x;
+ out[35] = x;
+ out[36] = x;
+ out[37] = x;
+ out[38] = x;
+ out[39] = x;
+ out[40] = x;
+ out[41] = x;
+ out[42] = x;
+ out[43] = x;
+ out[44] = x;
+ out[45] = x;
+ out[46] = x;
+ out[47] = x;
+ out[48] = x;
+ out[49] = x;
+ out[50] = x;
+ out[51] = x;
+ out[52] = x;
+ out[53] = x;
+ out[54] = x;
+ out[55] = x;
+ out[56] = x;
+ out[57] = x;
+ out[58] = x;
+ out[59] = x;
+ out[60] = x;
+ out[61] = x;
+ out[62] = x;
+ out[63] = x;
+ }
+}
+static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
+ const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
+ const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
+ const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
+ const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
+ const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+
+ {
+ __m256i u[64];
+
+ // stage 1
+ u[0] = in[0];
+ u[8] = in[4];
+ u[16] = in[2];
+ u[24] = in[6];
+ u[32] = in[1];
+ u[40] = in[5];
+ u[48] = in[3];
+ u[56] = in[7];
+
+ // stage 2
+ u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
+ u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
+ u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
+
+ // stage 3
+ u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
+ u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[62] = u[63];
+
+ // stage 4
+ __m256i temp1, temp2;
+ u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+ u[17] = u[16];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[30] = u[31];
+
+ temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = temp1;
+
+ temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = temp2;
+
+ temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = temp1;
+
+ temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[46] = temp2;
+
+ // stage 5
+ u[9] = u[8];
+ u[14] = u[15];
+
+ temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = temp1;
+
+ temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[22] = temp2;
+
+ u[35] = u[32];
+ u[34] = u[33];
+ u[36] = u[39];
+ u[37] = u[38];
+ u[43] = u[40];
+ u[42] = u[41];
+ u[44] = u[47];
+ u[45] = u[46];
+ u[51] = u[48];
+ u[50] = u[49];
+ u[52] = u[55];
+ u[53] = u[54];
+ u[59] = u[56];
+ u[58] = u[57];
+ u[60] = u[63];
+ u[61] = u[62];
+
+ // stage 6
+ temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ u[0] = temp1;
+
+ temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = temp2;
+ u[19] = u[16];
+ u[18] = u[17];
+ u[20] = u[23];
+ u[21] = u[22];
+ u[27] = u[24];
+ u[26] = u[25];
+ u[28] = u[31];
+ u[29] = u[30];
+
+ temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = temp1;
+ temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[35] = temp2;
+ temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[36] = temp1;
+ temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[37] = temp2;
+ temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = temp1;
+ temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[43] = temp2;
+ temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[44] = temp1;
+ temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[45] = temp2;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ u[11] = u[8];
+ u[10] = u[9];
+ u[12] = u[15];
+ u[13] = u[14];
+
+ temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = temp1;
+ temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[19] = temp2;
+ temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[20] = temp1;
+ temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[21] = temp2;
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ u[7] = u[0];
+ u[6] = u[1];
+ u[5] = u[2];
+ u[4] = u[3];
+
+ idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
+
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
+ const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
+
+ {
+ __m256i u[64];
+ __m256i tmp1, tmp2, tmp3, tmp4;
+ // stage 1
+ u[0] = in[0];
+ u[32] = in[1];
+ u[36] = in[9];
+ u[40] = in[5];
+ u[44] = in[13];
+ u[48] = in[3];
+ u[52] = in[11];
+ u[56] = in[7];
+ u[60] = in[15];
+ u[16] = in[2];
+ u[20] = in[10];
+ u[24] = in[6];
+ u[28] = in[14];
+ u[4] = in[8];
+ u[8] = in[4];
+ u[12] = in[12];
+
+ // stage 2
+ u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
+ u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
+ u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
+ u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
+ u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
+ u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
+ u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
+ u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
+ u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
+ u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
+ u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
+
+ // stage 3
+ u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
+ u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
+ u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
+ u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
+ u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
+ u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[34] = u[35];
+ u[37] = u[36];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[42] = u[43];
+ u[45] = u[44];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[50] = u[51];
+ u[53] = u[52];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[58] = u[59];
+ u[61] = u[60];
+ u[62] = u[63];
+
+ // stage 4
+ u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+ u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
+
+ u[17] = u[16];
+ u[18] = u[19];
+ u[21] = u[20];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[26] = u[27];
+ u[29] = u[28];
+ u[30] = u[31];
+
+ tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = tmp1;
+ u[34] = tmp2;
+ u[37] = tmp3;
+ u[38] = tmp4;
+
+ tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = tmp1;
+ u[42] = tmp2;
+ u[45] = tmp3;
+ u[46] = tmp4;
+
+ // stage 5
+ u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
+
+ u[9] = u[8];
+ u[10] = u[11];
+ u[13] = u[12];
+ u[14] = u[15];
+
+ tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
+ u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
+ u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = tmp1;
+ u[18] = tmp2;
+ u[21] = tmp3;
+ u[22] = tmp4;
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ u[0] = tmp1;
+ u[5] = u[4];
+ u[6] = u[7];
+
+ tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = tmp1;
+ tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = tmp2;
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = tmp1;
+ u[35] = tmp2;
+ u[36] = tmp3;
+ u[37] = tmp4;
+
+ tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = tmp1;
+ u[43] = tmp2;
+ u[44] = tmp3;
+ u[45] = tmp4;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[5] = tmp1;
+ addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = tmp1;
+ u[19] = tmp2;
+ u[20] = tmp3;
+ u[21] = tmp4;
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+ int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
+
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
+ const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
+ const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
+
+ {
+ __m256i u[64], v[64];
+
+ // stage 1
+ u[32] = in[1];
+ u[34] = in[17];
+ u[36] = in[9];
+ u[38] = in[25];
+ u[40] = in[5];
+ u[42] = in[21];
+ u[44] = in[13];
+ u[46] = in[29];
+ u[48] = in[3];
+ u[50] = in[19];
+ u[52] = in[11];
+ u[54] = in[27];
+ u[56] = in[7];
+ u[58] = in[23];
+ u[60] = in[15];
+ u[62] = in[31];
+
+ v[16] = in[2];
+ v[18] = in[18];
+ v[20] = in[10];
+ v[22] = in[26];
+ v[24] = in[6];
+ v[26] = in[22];
+ v[28] = in[14];
+ v[30] = in[30];
+
+ u[8] = in[4];
+ u[10] = in[20];
+ u[12] = in[12];
+ u[14] = in[28];
+
+ v[4] = in[8];
+ v[6] = in[24];
+
+ u[0] = in[0];
+ u[2] = in[16];
+
+ // stage 2
+ v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
+ v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
+ v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
+ v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
+ v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
+ v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
+ v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
+ v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
+ v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
+ v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
+ v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
+ v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
+ v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
+ v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
+ v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
+ v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
+ v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
+ v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
+ v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
+ v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
+ v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
+ v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
+ v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
+ v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
+ v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
+ v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
+ v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
+ v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
+ v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
+ v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
+ v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
+ v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
+
+ // stage 3
+ u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
+ u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
+ u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
+ u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
+ u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
+ u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
+ u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
+ u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
+ u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
+ u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
+ u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
+ u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
+ u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
+ u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
+ u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
+ u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);
+
+ for (i = 32; i < 64; i += 4) {
+ addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 4
+ v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+ v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
+ v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
+ v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+
+ for (i = 16; i < 32; i += 4) {
+ addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+
+ // stage 5
+ u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
+ u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);
+
+ for (i = 8; i < 16; i += 4) {
+ addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 16; i < 32; i += 4) {
+ u[i + 0] = v[i + 0];
+ u[i + 3] = v[i + 3];
+ }
+
+ u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
+ u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
+ u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
+ u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
+ u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
+ u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
+ u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
+ u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
+ v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
+
+ addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+ for (i = 8; i < 16; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 8) {
+ v[i + 0] = u[i + 0];
+ v[i + 1] = u[i + 1];
+ v[i + 6] = u[i + 6];
+ v[i + 7] = u[i + 7];
+ }
+
+ v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+
+ // stage 7
+ addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ u[4] = v[4];
+ u[7] = v[7];
+ u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
+ u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+
+ addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ for (i = 16; i < 32; i += 8) {
+ u[i + 0] = v[i + 0];
+ u[i + 1] = v[i + 1];
+ u[i + 6] = v[i + 6];
+ u[i + 7] = v[i + 7];
+ }
+
+ u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
+ u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
+ u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
+ u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
+ u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
+ u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
+ u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
+ u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[14] = u[14];
+ v[15] = u[15];
+
+ v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
+ v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+
+ for (i = 16; i < 20; ++i) {
+ addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 36; ++i) {
+ v[i] = u[i];
+ v[i + 12] = u[i + 12];
+ v[i + 16] = u[i + 16];
+ v[i + 28] = u[i + 28];
+ }
+
+ v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
+ v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
+ v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
+ v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
+ v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
+ v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
+ v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
+ v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
+ v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
+ v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
+ v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
+ v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
+ v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
+ v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
+ v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
+ v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+
+ // stage 9
+ for (i = 0; i < 8; ++i) {
+ addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 16; i < 20; ++i) {
+ u[i] = v[i];
+ u[i + 12] = v[i + 12];
+ }
+
+ u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
+ u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+
+ for (i = 32; i < 40; i++) {
+ addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {
+ addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
+ }
+
+ // stage 10
+ for (i = 0; i < 16; i++) {
+ addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 32; i < 40; i++) v[i] = u[i];
+
+ v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
+ v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
+
+ for (i = 56; i < 64; i++) v[i] = u[i];
+
+ // stage 11
+ for (i = 0; i < 32; i++) {
+ addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
+ &clamp_hi);
+ }
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ round_shift_8x8_avx2(out, out_shift);
+ round_shift_8x8_avx2(out + 16, out_shift);
+ round_shift_8x8_avx2(out + 32, out_shift);
+ round_shift_8x8_avx2(out + 48, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
+ }
+ }
+}
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
+ int do_cols, int bd, int out_shift);
+
+static const transform_1d_avx2
+ highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ {
+ { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL },
+ { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ {
+ { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
+ { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+
+ { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m256i buf1[64 * 8];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_avx2 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_avx2 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: column transform
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ __m256i buf0[64];
+ load_buffer_32bit_input(input + i * 8, input_stride, buf0,
+ buf_size_nonzero_w);
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w, 0,
+ NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ __m256i *_buf1 = buf1 + i * 8;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_8x8_flip_avx2(
+ &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
+ }
+ }
+ }
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+ bd, 0);
+
+ round_shift_array_32_avx2(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ } else if (txfm_size_col == 8) {
+ highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row,
+ bd);
+ }
+}
+
+void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
+ stride, tx_type, tx_size, eob, bd);
+ break;
+ case IDTX:
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
+ tx_size, eob, bd);
+ break;
+ default: assert(0); break;
+ }
+}
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) {
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X4:
+ av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X4:
+ av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X16:
+ av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ default:
+ av1_highbd_inv_txfm2d_add_universe_avx2(
+ input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+ txfm_param->eob, txfm_param->bd);
+ break;
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
new file mode 100644
index 0000000000..4ff6a90f95
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -0,0 +1,5830 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i clamped, mask;
+
+ mask = _mm_cmpgt_epi16(u, max);
+ clamped = _mm_andnot_si128(mask, u);
+ mask = _mm_and_si128(mask, max);
+ clamped = _mm_or_si128(mask, clamped);
+ mask = _mm_cmpgt_epi16(clamped, zero);
+ clamped = _mm_and_si128(clamped, mask);
+
+ return clamped;
+}
+
+static INLINE void round_shift_4x4(__m128i *in, int shift) {
+ if (shift != 0) {
+ __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[1] = _mm_add_epi32(in[1], rnding);
+ in[2] = _mm_add_epi32(in[2], rnding);
+ in[3] = _mm_add_epi32(in[3], rnding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+ }
+}
+
+static void round_shift_8x8(__m128i *in, int shift) {
+ round_shift_4x4(&in[0], shift);
+ round_shift_4x4(&in[4], shift);
+ round_shift_4x4(&in[8], shift);
+ round_shift_4x4(&in[12], shift);
+}
+
+static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi, int size) {
+ __m128i a0, a1;
+ for (int i = 0; i < size; i += 4) {
+ a0 = _mm_max_epi32(in[i], *clamp_lo);
+ out[i] = _mm_min_epi32(a0, *clamp_hi);
+
+ a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
+ out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
+
+ a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
+ out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
+
+ a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
+ out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
+ }
+}
+
+static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
+ __m128i res0, __m128i res1,
+ const int bd) {
+ __m128i x0 = _mm_cvtepi16_epi32(pred);
+ __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
+ __m128i min_clip_val = _mm_setzero_si128();
+ __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1);
+ x0 = _mm_add_epi32(res0, x0);
+ x1 = _mm_add_epi32(res1, x1);
+ x0 = _mm_max_epi32(x0, min_clip_val);
+ x0 = _mm_min_epi32(x0, max_clip_val);
+ x1 = _mm_max_epi32(x1, min_clip_val);
+ x1 = _mm_min_epi32(x1, max_clip_val);
+ x0 = _mm_packus_epi32(x0, x1);
+ return x0;
+}
+
+static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred,
+ __m128i res0, const int bd) {
+ __m128i x0 = _mm_cvtepi16_epi32(pred);
+
+ x0 = _mm_add_epi32(res0, x0);
+ x0 = _mm_packus_epi32(x0, x0);
+ x0 = highbd_clamp_epi16(x0, bd);
+ return x0;
+}
+
+static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output,
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
+ __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd);
+
+ _mm_storel_epi64((__m128i *)(output + i * stride), u);
+ }
+}
+
+static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
+ __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);
+
+ _mm_storeu_si128((__m128i *)(output + i * stride), u);
+ }
+}
+
+static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
+ }
+}
+
+static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+ in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+ in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+ in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+}
+
+void av1_highbd_iwht4x4_16_add_sse4_1(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
+ __m128i op[4];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ load_buffer_4x4(input, op);
+
+ // Shift before-hand.
+ op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT);
+ op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT);
+ op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT);
+ op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT);
+
+ for (int i = 0; i < 2; ++i) {
+ __m128i a1 = op[0];
+ __m128i c1 = op[1];
+ __m128i d1 = op[2];
+ __m128i b1 = op[3];
+ a1 = _mm_add_epi32(a1, c1); // a1 += c1
+ d1 = _mm_sub_epi32(d1, b1); // d1 -= b1
+ __m128i e1 = _mm_sub_epi32(a1, d1); // e1 = (a1 - d1) >> 1
+ e1 = _mm_srai_epi32(e1, 1);
+ b1 = _mm_sub_epi32(e1, b1); // b1 = e1 - b1
+ c1 = _mm_sub_epi32(e1, c1); // c1 = e1 - c1
+ a1 = _mm_sub_epi32(a1, b1); // a1 -= b1
+ d1 = _mm_add_epi32(d1, c1); // d1 += c1
+
+ op[0] = a1;
+ op[1] = b1;
+ op[2] = c1;
+ op[3] = d1;
+ if (i == 0) {
+ transpose_32bit_4x4(op, op);
+ }
+ }
+
+ // Convert to int16_t. The C code checks that we are in range.
+ op[0] = _mm_packs_epi32(op[0], op[1]);
+ op[1] = _mm_packs_epi32(op[2], op[3]);
+
+ // Load uint16_t.
+ __m128i dst[2];
+ __m128i tmp[4];
+ tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
+ tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
+ dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]);
+ tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
+ tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
+ dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]);
+
+ // Add to the previous results.
+ dst[0] = _mm_add_epi16(dst[0], op[0]);
+ dst[1] = _mm_add_epi16(dst[1], op[1]);
+
+ // Clamp.
+ dst[0] = highbd_clamp_epi16(dst[0], bd);
+ dst[1] = highbd_clamp_epi16(dst[1], bd);
+
+ // Store.
+ _mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]);
+ dst[0] = _mm_srli_si128(dst[0], 8);
+ _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]);
+ _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]);
+ dst[1] = _mm_srli_si128(dst[1], 8);
+ _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]);
+}
+
+static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
+ __m128i *out1, const __m128i *clamp_lo,
+ const __m128i *clamp_hi) {
+ __m128i a0 = _mm_add_epi32(in0, in1);
+ __m128i a1 = _mm_sub_epi32(in0, in1);
+
+ a0 = _mm_max_epi32(a0, *clamp_lo);
+ a0 = _mm_min_epi32(a0, *clamp_hi);
+ a1 = _mm_max_epi32(a1, *clamp_lo);
+ a1 = _mm_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi, int shift) {
+ __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
+ __m128i in0_w_offset = _mm_add_epi32(*in0, offset);
+ __m128i in1_w_offset = _mm_add_epi32(*in1, offset);
+
+ in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift));
+ in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift));
+
+ in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo);
+ in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi);
+ in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo);
+ in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi);
+
+ *in0 = in0_w_offset;
+ *in1 = in1_w_offset;
+}
+
+static INLINE void idct32_stage4_sse4_1(
+ __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
+ const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
+ const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+ bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+ bf1[17] = temp1;
+
+ temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+ bf1[29] =
+ half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+ bf1[18] = temp2;
+
+ temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+ bf1[21] = temp1;
+
+ temp2 =
+ half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+ bf1[25] =
+ half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+ bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_sse4_1(
+ __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
+ const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
+ const __m128i *clamp_hi, const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+ bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+ bf1[9] = temp1;
+
+ temp2 =
+ half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+ bf1[10] = temp2;
+
+ addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_sse4_1(
+ __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
+ const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+ const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[5] = temp1;
+
+ addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+ bf1[29] =
+ half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+ bf1[18] = temp1;
+ temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+ bf1[28] =
+ half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+ bf1[19] = temp2;
+ temp1 =
+ half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+ bf1[27] =
+ half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 =
+ half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[10] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[11] = temp2;
+
+ addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[27] =
+ half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+ temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[25] =
+ half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[22] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[24] =
+ half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
+ const int do_cols, const int bd,
+ const int out_shift,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi) {
+ addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ for (int i = 0; i < 32; i += 8) {
+ round_shift_4x4(out + i, out_shift);
+ round_shift_4x4(out + i + 4, out_shift);
+ }
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+ }
+}
+
+static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
+ __m128i *out0, __m128i *out1,
+ const __m128i *clamp_lo, const __m128i *clamp_hi,
+ int shift) {
+ __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
+ __m128i a0 = _mm_add_epi32(offset, in0);
+ __m128i a1 = _mm_sub_epi32(offset, in1);
+
+ a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+ a0 = _mm_max_epi32(a0, *clamp_lo);
+ a0 = _mm_min_epi32(a0, *clamp_hi);
+ a1 = _mm_max_epi32(a1, *clamp_lo);
+ a1 = _mm_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3, x, y;
+
+ // Stage 0
+ // Stage 1
+ // Stage 2
+ u0 = in[0];
+ u1 = in[1];
+ u2 = in[2];
+ u3 = in[3];
+
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u2, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u1, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u1, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ // Stage 3
+ addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ log_range = AOMMAX(16, bd + 6);
+ clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift);
+ shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift);
+ }
+}
+
+static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *sinpi = sinpi_arr(bit);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1));
+ rnding = _mm_unpacklo_epi32(rnding, zero);
+ const __m128i mul = _mm_set1_epi32(1 << 4);
+ const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+ const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+ const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+ const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+ __m128i t;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i x0, x1, x2, x3;
+ __m128i u0, u1, u2, u3;
+ __m128i u0_low, u1_low, u2_low, u3_low;
+ __m128i u0_high, u1_high, u2_high, u3_high;
+
+ x0 = in[0];
+ x1 = in[1];
+ x2 = in[2];
+ x3 = in[3];
+
+ s0 = _mm_mullo_epi32(x0, sinpi1);
+ s1 = _mm_mullo_epi32(x0, sinpi2);
+ s2 = _mm_mullo_epi32(x1, sinpi3);
+ s3 = _mm_mullo_epi32(x2, sinpi4);
+ s4 = _mm_mullo_epi32(x2, sinpi1);
+ s5 = _mm_mullo_epi32(x3, sinpi2);
+ s6 = _mm_mullo_epi32(x3, sinpi4);
+ t = _mm_sub_epi32(x0, x2);
+ s7 = _mm_add_epi32(t, x3);
+
+ t = _mm_add_epi32(s0, s3);
+ s0 = _mm_add_epi32(t, s5);
+ t = _mm_sub_epi32(s1, s4);
+ s1 = _mm_sub_epi32(t, s6);
+ s3 = s2;
+ s2 = _mm_mullo_epi32(s7, sinpi3);
+
+ u0 = _mm_add_epi32(s0, s3);
+ u1 = _mm_add_epi32(s1, s3);
+ u2 = s2;
+ t = _mm_add_epi32(s0, s1);
+ u3 = _mm_sub_epi32(t, s3);
+
+ // u0
+ u0_low = _mm_mul_epi32(u0, mul);
+ u0_low = _mm_add_epi64(u0_low, rnding);
+
+ u0 = _mm_srli_si128(u0, 4);
+ u0_high = _mm_mul_epi32(u0, mul);
+ u0_high = _mm_add_epi64(u0_high, rnding);
+
+ u0_low = _mm_srli_si128(u0_low, 2);
+ u0_high = _mm_srli_si128(u0_high, 2);
+
+ u0 = _mm_unpacklo_epi32(u0_low, u0_high);
+ u0_high = _mm_unpackhi_epi32(u0_low, u0_high);
+ u0 = _mm_unpacklo_epi64(u0, u0_high);
+
+ // u1
+ u1_low = _mm_mul_epi32(u1, mul);
+ u1_low = _mm_add_epi64(u1_low, rnding);
+
+ u1 = _mm_srli_si128(u1, 4);
+ u1_high = _mm_mul_epi32(u1, mul);
+ u1_high = _mm_add_epi64(u1_high, rnding);
+
+ u1_low = _mm_srli_si128(u1_low, 2);
+ u1_high = _mm_srli_si128(u1_high, 2);
+
+ u1 = _mm_unpacklo_epi32(u1_low, u1_high);
+ u1_high = _mm_unpackhi_epi32(u1_low, u1_high);
+ u1 = _mm_unpacklo_epi64(u1, u1_high);
+
+ // u2
+ u2_low = _mm_mul_epi32(u2, mul);
+ u2_low = _mm_add_epi64(u2_low, rnding);
+
+ u2 = _mm_srli_si128(u2, 4);
+ u2_high = _mm_mul_epi32(u2, mul);
+ u2_high = _mm_add_epi64(u2_high, rnding);
+
+ u2_low = _mm_srli_si128(u2_low, 2);
+ u2_high = _mm_srli_si128(u2_high, 2);
+
+ u2 = _mm_unpacklo_epi32(u2_low, u2_high);
+ u2_high = _mm_unpackhi_epi32(u2_low, u2_high);
+ u2 = _mm_unpacklo_epi64(u2, u2_high);
+
+ // u3
+ u3_low = _mm_mul_epi32(u3, mul);
+ u3_low = _mm_add_epi64(u3_low, rnding);
+
+ u3 = _mm_srli_si128(u3, 4);
+ u3_high = _mm_mul_epi32(u3, mul);
+ u3_high = _mm_add_epi64(u3_high, rnding);
+
+ u3_low = _mm_srli_si128(u3_low, 2);
+ u3_high = _mm_srli_si128(u3_high, 2);
+
+ u3 = _mm_unpacklo_epi32(u3_low, u3_high);
+ u3_high = _mm_unpackhi_epi32(u3_low, u3_high);
+ u3 = _mm_unpacklo_epi64(u3, u3_high);
+
+ out[0] = u0;
+ out[1] = u1;
+ out[2] = u2;
+ out[3] = u3;
+
+ if (!do_cols) {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ round_shift_4x4(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
+ }
+}
+
+static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ round_shift_4x4(in, shift);
+
+ v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
+
+ v0 = _mm_unpacklo_epi16(v0, zero);
+ v1 = _mm_unpacklo_epi16(v1, zero);
+ v2 = _mm_unpacklo_epi16(v2, zero);
+ v3 = _mm_unpacklo_epi16(v3, zero);
+
+ if (fliplr) {
+ in[0] = _mm_shuffle_epi32(in[0], 0x1B);
+ in[1] = _mm_shuffle_epi32(in[1], 0x1B);
+ in[2] = _mm_shuffle_epi32(in[2], 0x1B);
+ in[3] = _mm_shuffle_epi32(in[3], 0x1B);
+ }
+
+ if (flipud) {
+ u0 = _mm_add_epi32(in[3], v0);
+ u1 = _mm_add_epi32(in[2], v1);
+ u2 = _mm_add_epi32(in[1], v2);
+ u3 = _mm_add_epi32(in[0], v3);
+ } else {
+ u0 = _mm_add_epi32(in[0], v0);
+ u1 = _mm_add_epi32(in[1], v1);
+ u2 = _mm_add_epi32(in[2], v2);
+ u3 = _mm_add_epi32(in[3], v3);
+ }
+
+ v0 = _mm_packus_epi32(u0, u1);
+ v2 = _mm_packus_epi32(u2, u3);
+
+ u0 = highbd_clamp_epi16(v0, bd);
+ u2 = highbd_clamp_epi16(v2, bd);
+
+ v0 = _mm_unpacklo_epi64(u0, u0);
+ v1 = _mm_unpackhi_epi64(u0, u0);
+ v2 = _mm_unpacklo_epi64(u2, u2);
+ v3 = _mm_unpackhi_epi64(u2, u2);
+
+ _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
+ _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
+ _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
+ _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
+}
+
+static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ __m128i zero = _mm_setzero_si128();
+ __m128i fact = _mm_set1_epi32(NewSqrt2);
+ __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m128i a0_low, a1_low;
+ __m128i a0_high, a1_high;
+
+ offset = _mm_unpacklo_epi32(offset, zero);
+
+ for (int i = 0; i < 4; i++) {
+ a0_low = _mm_mul_epi32(in[i], fact);
+ a0_low = _mm_add_epi32(a0_low, offset);
+ a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
+
+ a0_high = _mm_srli_si128(in[i], 4);
+ a0_high = _mm_mul_epi32(a0_high, fact);
+ a0_high = _mm_add_epi32(a0_high, offset);
+ a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
+
+ a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
+ a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
+ out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
+ }
+
+ if (!do_cols) {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ round_shift_4x4(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
+ }
+}
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[4];
+ const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_4x4(input, in);
+ idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case ADST_DCT:
+ load_buffer_4x4(input, in);
+ idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case DCT_ADST:
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case ADST_ADST:
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_4x4(input, in);
+ idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+ break;
+ case IDTX:
+ load_buffer_4x4(input, in);
+ iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case V_DCT:
+ load_buffer_4x4(input, in);
+ iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case H_DCT:
+ load_buffer_4x4(input, in);
+ idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case V_ADST:
+ load_buffer_4x4(input, in);
+ iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case H_ADST:
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x4(input, in);
+ iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+ transpose_32bit_4x4(in, in);
+ iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+ break;
+ default: assert(0);
+ }
+}
+
+// 8x8
+static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+ in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+ in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+ in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+ in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
+ in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
+ in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
+ in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
+ in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
+ in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
+ in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
+ in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
+ in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
+ in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
+ in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
+ in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
+}
+
+static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+ // Note:
+ // Even column: 0, 2, ..., 14
+ // Odd column: 1, 3, ..., 15
+ // one even column plus one odd column constructs one row (8 coeffs)
+ // total we have 8 rows (8x8).
+ for (col = 0; col < 2; ++col) {
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = in[0 * 2 + col];
+ u1 = in[4 * 2 + col];
+ u2 = in[2 * 2 + col];
+ u3 = in[6 * 2 + col];
+
+ x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
+ y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
+ u4 = _mm_add_epi32(x, y);
+ u4 = _mm_add_epi32(u4, rnding);
+ u4 = _mm_srai_epi32(u4, bit);
+
+ x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
+ y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
+ u7 = _mm_add_epi32(x, y);
+ u7 = _mm_add_epi32(u7, rnding);
+ u7 = _mm_srai_epi32(u7, bit);
+
+ x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
+ y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
+ u5 = _mm_add_epi32(x, y);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
+ y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
+ u6 = _mm_add_epi32(x, y);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ // stage 3
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u1, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm_mullo_epi32(v5, cospi32);
+ y = _mm_mullo_epi32(v6, cospi32);
+ u6 = _mm_add_epi32(y, x);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ u5 = _mm_sub_epi32(y, x);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // stage 5
+ addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
+ &clamp_hi);
+ }
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+ }
+}
+
+static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[8], v[8], x;
+
+ // Even 8 points: 0, 2, ..., 14
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[14], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[14], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[10], cospi20);
+ x = _mm_mullo_epi32(in[4], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[10], cospi44);
+ x = _mm_mullo_epi32(in[4], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[6], cospi36);
+ x = _mm_mullo_epi32(in[8], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[6], cospi28);
+ x = _mm_mullo_epi32(in[8], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[2], cospi52);
+ x = _mm_mullo_epi32(in[12], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[2], cospi12);
+ x = _mm_mullo_epi32(in[12], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[2] = _mm_sub_epi32(kZero, u[4]);
+ out[4] = u[6];
+ out[6] = _mm_sub_epi32(kZero, u[2]);
+ out[8] = u[3];
+ out[10] = _mm_sub_epi32(kZero, u[7]);
+ out[12] = u[5];
+ out[14] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+
+ // Odd 8 points: 1, 3, ..., 15
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[15], cospi4);
+ x = _mm_mullo_epi32(in[1], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[15], cospi60);
+ x = _mm_mullo_epi32(in[1], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[11], cospi20);
+ x = _mm_mullo_epi32(in[5], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[11], cospi44);
+ x = _mm_mullo_epi32(in[5], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[7], cospi36);
+ x = _mm_mullo_epi32(in[9], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[7], cospi28);
+ x = _mm_mullo_epi32(in[9], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[3], cospi52);
+ x = _mm_mullo_epi32(in[13], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[3], cospi12);
+ x = _mm_mullo_epi32(in[13], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[1] = u[0];
+ out[3] = _mm_sub_epi32(kZero, u[4]);
+ out[5] = u[6];
+ out[7] = _mm_sub_epi32(kZero, u[2]);
+ out[9] = u[3];
+ out[11] = _mm_sub_epi32(kZero, u[7]);
+ out[13] = u[5];
+ out[15] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
+static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ out[0] = _mm_add_epi32(in[0], in[0]);
+ out[1] = _mm_add_epi32(in[1], in[1]);
+ out[2] = _mm_add_epi32(in[2], in[2]);
+ out[3] = _mm_add_epi32(in[3], in[3]);
+ out[4] = _mm_add_epi32(in[4], in[4]);
+ out[5] = _mm_add_epi32(in[5], in[5]);
+ out[6] = _mm_add_epi32(in[6], in[6]);
+ out[7] = _mm_add_epi32(in[7], in[7]);
+
+ if (!do_cols) {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ round_shift_4x4(out, out_shift);
+ round_shift_4x4(out + 4, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8);
+ }
+}
+
+static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
+ int fliplr, int bd) {
+ __m128i x0, x1;
+ const __m128i zero = _mm_setzero_si128();
+
+ x0 = _mm_unpacklo_epi16(pred, zero);
+ x1 = _mm_unpackhi_epi16(pred, zero);
+
+ if (fliplr) {
+ res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
+ res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
+ x0 = _mm_add_epi32(res_hi, x0);
+ x1 = _mm_add_epi32(res_lo, x1);
+
+ } else {
+ x0 = _mm_add_epi32(res_lo, x0);
+ x1 = _mm_add_epi32(res_hi, x1);
+ }
+
+ x0 = _mm_packus_epi32(x0, x1);
+ return highbd_clamp_epi16(x0, bd);
+}
+
+static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+ round_shift_8x8(in, shift);
+
+ v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
+ v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
+ v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
+ v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
+ v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
+ v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
+ v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
+ v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));
+
+ if (flipud) {
+ u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
+ u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
+ u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
+ u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
+ u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
+ u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
+ u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
+ u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
+ } else {
+ u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
+ u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
+ u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
+ u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
+ u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
+ u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
+ u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
+ u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
+ }
+
+ _mm_store_si128((__m128i *)(output + 0 * stride), u0);
+ _mm_store_si128((__m128i *)(output + 1 * stride), u1);
+ _mm_store_si128((__m128i *)(output + 2 * stride), u2);
+ _mm_store_si128((__m128i *)(output + 3 * stride), u3);
+ _mm_store_si128((__m128i *)(output + 4 * stride), u4);
+ _mm_store_si128((__m128i *)(output + 5 * stride), u5);
+ _mm_store_si128((__m128i *)(output + 6 * stride), u6);
+ _mm_store_si128((__m128i *)(output + 7 * stride), u7);
+}
+
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[16], out[16];
+ const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x8(input, in);
+ idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+ transpose_8x8(out, in);
+ idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
+ write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case DCT_ADST:
+ load_buffer_8x8(input, in);
+ iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+ transpose_8x8(out, in);
+ idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
+ write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case ADST_DCT:
+ load_buffer_8x8(input, in);
+ idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+ transpose_8x8(out, in);
+ iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
+ write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case ADST_ADST:
+ load_buffer_8x8(input, in);
+ iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+ transpose_8x8(out, in);
+ iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
+ write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_8x8(input, in);
+ idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+ transpose_8x8(out, in);
+ iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
+ write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x8(input, in);
+ iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+ transpose_8x8(out, in);
+ idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
+ write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x8(input, in);
+ iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+ transpose_8x8(out, in);
+ iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
+ write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x8(input, in);
+ iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+ transpose_8x8(out, in);
+ iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
+ write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x8(input, in);
+ iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+ transpose_8x8(out, in);
+ iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
+ write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
+ break;
+ default: assert(0);
+ }
+}
+
+static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ x = _mm_mullo_epi32(in[0], cospi32);
+ x = _mm_add_epi32(x, rnding);
+ x = _mm_srai_epi32(x, bit);
+
+ // stage 4
+ // stage 5
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ x = _mm_add_epi32(x, offset);
+ x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ }
+
+ x = _mm_max_epi32(x, clamp_lo);
+ x = _mm_min_epi32(x, clamp_hi);
+ out[0] = x;
+ out[1] = x;
+ out[2] = x;
+ out[3] = x;
+ out[4] = x;
+ out[5] = x;
+ out[6] = x;
+ out[7] = x;
+}
+
+static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = in[0];
+ u1 = in[4];
+ u2 = in[2];
+ u3 = in[6];
+
+ x = _mm_mullo_epi32(in[1], cospi56);
+ y = _mm_mullo_epi32(in[7], cospim8);
+ u4 = _mm_add_epi32(x, y);
+ u4 = _mm_add_epi32(u4, rnding);
+ u4 = _mm_srai_epi32(u4, bit);
+
+ x = _mm_mullo_epi32(in[1], cospi8);
+ y = _mm_mullo_epi32(in[7], cospi56);
+ u7 = _mm_add_epi32(x, y);
+ u7 = _mm_add_epi32(u7, rnding);
+ u7 = _mm_srai_epi32(u7, bit);
+
+ x = _mm_mullo_epi32(in[5], cospi24);
+ y = _mm_mullo_epi32(in[3], cospim40);
+ u5 = _mm_add_epi32(x, y);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ x = _mm_mullo_epi32(in[5], cospi40);
+ y = _mm_mullo_epi32(in[3], cospi24);
+ u6 = _mm_add_epi32(x, y);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ // stage 3
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u1, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm_mullo_epi32(v5, cospi32);
+ y = _mm_mullo_epi32(v6, cospi32);
+ u6 = _mm_add_epi32(y, x);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ u5 = _mm_sub_epi32(y, x);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // stage 5
+ addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ round_shift_4x4(out, out_shift);
+ round_shift_4x4(out + 4, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8);
+ }
+}
+
+static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i u[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(kZero, x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // stage 3
+ // stage 4
+ __m128i temp1, temp2;
+ temp1 = _mm_mullo_epi32(u[0], cospi16);
+ x = _mm_mullo_epi32(u[1], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+ u[4] = temp1;
+
+ temp2 = _mm_mullo_epi32(u[0], cospi48);
+ x = _mm_mullo_epi32(u[1], cospi16);
+ u[5] = _mm_sub_epi32(temp2, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // stage 5
+ // stage 6
+ temp1 = _mm_mullo_epi32(u[0], cospi32);
+ x = _mm_mullo_epi32(u[1], cospi32);
+ u[2] = _mm_add_epi32(temp1, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(temp1, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ temp1 = _mm_mullo_epi32(u[4], cospi32);
+ x = _mm_mullo_epi32(u[5], cospi32);
+ u[6] = _mm_add_epi32(temp1, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(temp1, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+
+static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[8], v[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+
+ u[0] = _mm_mullo_epi32(in[7], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[7], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[5], cospi20);
+ x = _mm_mullo_epi32(in[2], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[5], cospi44);
+ x = _mm_mullo_epi32(in[2], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[3], cospi36);
+ x = _mm_mullo_epi32(in[4], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[3], cospi28);
+ x = _mm_mullo_epi32(in[4], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[1], cospi52);
+ x = _mm_mullo_epi32(in[6], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[1], cospi12);
+ x = _mm_mullo_epi32(in[6], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+
+static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ in[0] = _mm_mullo_epi32(in[0], cospi32);
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[0] = _mm_srai_epi32(in[0], bit);
+
+ // stage 5
+ // stage 6
+ // stage 7
+ if (!do_cols) {
+ log_range = AOMMAX(16, bd + 6);
+ clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ if (out_shift != 0) {
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ in[0] = _mm_add_epi32(in[0], offset);
+ in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
+ }
+ }
+
+ in[0] = _mm_max_epi32(in[0], clamp_lo);
+ in[0] = _mm_min_epi32(in[0], clamp_hi);
+ out[0] = in[0];
+ out[1] = in[0];
+ out[2] = in[0];
+ out[3] = in[0];
+ out[4] = in[0];
+ out[5] = in[0];
+ out[6] = in[0];
+ out[7] = in[0];
+ out[8] = in[0];
+ out[9] = in[0];
+ out[10] = in[0];
+ out[11] = in[0];
+ out[12] = in[0];
+ out[13] = in[0];
+ out[14] = in[0];
+ out[15] = in[0];
+}
+
+static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], x, y;
+ // stage 0
+ // stage 1
+ u[0] = in[0];
+ u[2] = in[4];
+ u[4] = in[2];
+ u[6] = in[6];
+ u[8] = in[1];
+ u[10] = in[5];
+ u[12] = in[3];
+ u[14] = in[7];
+
+ // stage 2
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+
+ u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+ u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+
+ u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+ u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+ // stage 3
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+ u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
+ u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
+
+ addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ x = _mm_mullo_epi32(u[0], cospi32);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+ u[1] = u[0];
+
+ u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+ u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+
+ addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+ x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = x;
+ y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = y;
+
+ // stage 5
+ addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ x = _mm_mullo_epi32(u[5], cospi32);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ u[10] = _mm_sub_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[13] = _mm_add_epi32(x, y);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_add_epi32(x, y);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+ // stage 7
+ addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+ }
+}
+
+static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i v[16], x, y, temp1, temp2;
+ // stage 0
+ // stage 1
+ // stage 2
+ x = _mm_mullo_epi32(in[0], cospi62);
+ v[0] = _mm_add_epi32(x, rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi2);
+ v[1] = _mm_sub_epi32(zero, x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ // stage 3
+ v[8] = v[0];
+ v[9] = v[1];
+
+ // stage 4
+ temp1 = _mm_mullo_epi32(v[8], cospi8);
+ x = _mm_mullo_epi32(v[9], cospi56);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[8], cospi56);
+ x = _mm_mullo_epi32(v[9], cospi8);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[8] = temp1;
+ v[9] = temp2;
+
+ // stage 5
+ v[4] = v[0];
+ v[5] = v[1];
+ v[12] = v[8];
+ v[13] = v[9];
+
+ // stage 6
+ temp1 = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[4] = temp1;
+ v[5] = temp2;
+
+ temp1 = _mm_mullo_epi32(v[12], cospi16);
+ x = _mm_mullo_epi32(v[13], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[12], cospi48);
+ x = _mm_mullo_epi32(v[13], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[12] = temp1;
+ v[13] = temp2;
+
+ // stage 7
+ v[2] = v[0];
+ v[3] = v[1];
+ v[6] = v[4];
+ v[7] = v[5];
+ v[10] = v[8];
+ v[11] = v[9];
+ v[14] = v[12];
+ v[15] = v[13];
+
+ // stage 8
+ y = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ y = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ y = _mm_mullo_epi32(v[10], cospi32);
+ x = _mm_mullo_epi32(v[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ y = _mm_mullo_epi32(v[14], cospi32);
+ x = _mm_mullo_epi32(v[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(zero, v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(zero, v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(zero, v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(zero, v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(zero, v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(zero, v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(zero, v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(zero, v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
+static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i zero = _mm_setzero_si128();
+ __m128i u[16], x, y;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ x = _mm_mullo_epi32(in[0], cospi62);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi2);
+ u[1] = _mm_sub_epi32(zero, x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ x = _mm_mullo_epi32(in[2], cospi54);
+ u[2] = _mm_add_epi32(x, rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ x = _mm_mullo_epi32(in[2], cospi10);
+ u[3] = _mm_sub_epi32(zero, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ x = _mm_mullo_epi32(in[4], cospi46);
+ u[4] = _mm_add_epi32(x, rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ x = _mm_mullo_epi32(in[4], cospi18);
+ u[5] = _mm_sub_epi32(zero, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(in[6], cospi38);
+ u[6] = _mm_add_epi32(x, rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ x = _mm_mullo_epi32(in[6], cospi26);
+ u[7] = _mm_sub_epi32(zero, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[8] = _mm_mullo_epi32(in[7], cospi34);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ u[9] = _mm_mullo_epi32(in[7], cospi30);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ u[10] = _mm_mullo_epi32(in[5], cospi42);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_mullo_epi32(in[5], cospi22);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_mullo_epi32(in[3], cospi50);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ u[13] = _mm_mullo_epi32(in[3], cospi14);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ u[14] = _mm_mullo_epi32(in[1], cospi58);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_mullo_epi32(in[1], cospi6);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ y = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ u[8] = _mm_mullo_epi32(u[8], cospi8);
+ u[8] = _mm_add_epi32(u[8], x);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ x = _mm_mullo_epi32(u[9], cospi8);
+ u[9] = _mm_sub_epi32(y, x);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi24);
+ y = _mm_mullo_epi32(u[10], cospi24);
+ u[10] = _mm_mullo_epi32(u[10], cospi40);
+ u[10] = _mm_add_epi32(u[10], x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi40);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi8);
+ y = _mm_mullo_epi32(u[12], cospi8);
+ u[12] = _mm_mullo_epi32(u[12], cospim56);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospim56);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi40);
+ y = _mm_mullo_epi32(u[14], cospi40);
+ u[14] = _mm_mullo_epi32(u[14], cospim24);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim24);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ x = _mm_mullo_epi32(u[5], cospi48);
+ y = _mm_mullo_epi32(u[4], cospi48);
+ u[4] = _mm_mullo_epi32(u[4], cospi16);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ x = _mm_mullo_epi32(u[5], cospi16);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(u[7], cospi16);
+ y = _mm_mullo_epi32(u[6], cospi16);
+ u[6] = _mm_mullo_epi32(u[6], cospim48);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ x = _mm_mullo_epi32(u[7], cospim48);
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi48);
+ y = _mm_mullo_epi32(u[12], cospi48);
+ u[12] = _mm_mullo_epi32(u[12], cospi16);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi16);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi16);
+ y = _mm_mullo_epi32(u[14], cospi16);
+ u[14] = _mm_mullo_epi32(u[14], cospim48);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim48);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 7
+ addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ u[2] = _mm_add_epi32(y, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(y, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ u[10] = _mm_add_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ u[14] = _mm_add_epi32(y, x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(zero, u[8]);
+ out[2] = u[12];
+ out[3] = _mm_sub_epi32(zero, u[4]);
+ out[4] = u[6];
+ out[5] = _mm_sub_epi32(zero, u[14]);
+ out[6] = u[10];
+ out[7] = _mm_sub_epi32(zero, u[2]);
+ out[8] = u[3];
+ out[9] = _mm_sub_epi32(zero, u[11]);
+ out[10] = u[15];
+ out[11] = _mm_sub_epi32(zero, u[7]);
+ out[12] = u[5];
+ out[13] = _mm_sub_epi32(zero, u[13]);
+ out[14] = u[9];
+ out[15] = _mm_sub_epi32(zero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], v[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ u[0] = in[0];
+ u[1] = in[8];
+ u[2] = in[4];
+ u[3] = in[12];
+ u[4] = in[2];
+ u[5] = in[10];
+ u[6] = in[6];
+ u[7] = in[14];
+ u[8] = in[1];
+ u[9] = in[9];
+ u[10] = in[5];
+ u[11] = in[13];
+ u[12] = in[3];
+ u[13] = in[11];
+ u[14] = in[7];
+ u[15] = in[15];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
+
+ // stage 3
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
+ u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
+ addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ x = _mm_mullo_epi32(u[0], cospi32);
+ y = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(x, y);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(x, y);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
+ v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
+ addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ v[15] = u[15];
+
+ // stage 5
+ addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+ u[4] = v[4];
+
+ x = _mm_mullo_epi32(v[5], cospi32);
+ y = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_sub_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_add_epi32(x, y);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_add_epi32(x, y);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 7
+ addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+ }
+ }
+}
+
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u[16], v[16], x, y;
+ // Calculate the column 0, 1, 2, 3
+ // stage 0
+ // stage 1
+ // stage 2
+ v[0] = _mm_mullo_epi32(in[15], cospi2);
+ x = _mm_mullo_epi32(in[0], cospi62);
+ v[0] = _mm_add_epi32(v[0], x);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_mullo_epi32(in[15], cospi62);
+ x = _mm_mullo_epi32(in[0], cospi2);
+ v[1] = _mm_sub_epi32(v[1], x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(in[13], cospi10);
+ x = _mm_mullo_epi32(in[2], cospi54);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(in[13], cospi54);
+ x = _mm_mullo_epi32(in[2], cospi10);
+ v[3] = _mm_sub_epi32(v[3], x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_mullo_epi32(in[11], cospi18);
+ x = _mm_mullo_epi32(in[4], cospi46);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(in[11], cospi46);
+ x = _mm_mullo_epi32(in[4], cospi18);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(in[9], cospi26);
+ x = _mm_mullo_epi32(in[6], cospi38);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(in[9], cospi38);
+ x = _mm_mullo_epi32(in[6], cospi26);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = _mm_mullo_epi32(in[7], cospi34);
+ x = _mm_mullo_epi32(in[8], cospi30);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(in[7], cospi30);
+ x = _mm_mullo_epi32(in[8], cospi34);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(in[5], cospi42);
+ x = _mm_mullo_epi32(in[10], cospi22);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(in[5], cospi22);
+ x = _mm_mullo_epi32(in[10], cospi42);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(in[3], cospi50);
+ x = _mm_mullo_epi32(in[12], cospi14);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(in[3], cospi14);
+ x = _mm_mullo_epi32(in[12], cospi50);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(in[1], cospi58);
+ x = _mm_mullo_epi32(in[14], cospi6);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(in[1], cospi6);
+ x = _mm_mullo_epi32(in[14], cospi58);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3
+ addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi8);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi8);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi40);
+ x = _mm_mullo_epi32(u[11], cospi24);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(u[10], cospi24);
+ x = _mm_mullo_epi32(u[11], cospi40);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[12], cospim56);
+ x = _mm_mullo_epi32(u[13], cospi8);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi8);
+ x = _mm_mullo_epi32(u[13], cospim56);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim24);
+ x = _mm_mullo_epi32(u[15], cospi40);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi40);
+ x = _mm_mullo_epi32(u[15], cospim24);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 5
+ addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+
+ v[4] = _mm_mullo_epi32(u[4], cospi16);
+ x = _mm_mullo_epi32(u[5], cospi48);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(u[4], cospi48);
+ x = _mm_mullo_epi32(u[5], cospi16);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(u[6], cospim48);
+ x = _mm_mullo_epi32(u[7], cospi16);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(u[6], cospi16);
+ x = _mm_mullo_epi32(u[7], cospim48);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ v[12] = _mm_mullo_epi32(u[12], cospi16);
+ x = _mm_mullo_epi32(u[13], cospi48);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi48);
+ x = _mm_mullo_epi32(u[13], cospi16);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim48);
+ x = _mm_mullo_epi32(u[15], cospi16);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi16);
+ x = _mm_mullo_epi32(u[15], cospim48);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 7
+ addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8
+ v[0] = u[0];
+ v[1] = u[1];
+
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(zero, v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(zero, v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(zero, v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(zero, v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(zero, v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(zero, v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(zero, v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(zero, v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
+ __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m128i a0_low, a0_high, a1_low, a1_high;
+ __m128i zero = _mm_setzero_si128();
+ offset = _mm_unpacklo_epi32(offset, zero);
+
+ for (int i = 0; i < 16; i++) {
+ a0_low = _mm_mul_epi32(in[i], fact);
+ a0_low = _mm_add_epi32(a0_low, offset);
+ a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
+
+ a0_high = _mm_srli_si128(in[i], 4);
+ a0_high = _mm_mul_epi32(a0_high, fact);
+ a0_high = _mm_add_epi32(a0_high, offset);
+ a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
+
+ a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
+ a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
+ out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
+ }
+
+ if (!do_cols) {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16);
+ }
+}
+static INLINE void idct64_stage8_sse4_1(
+ __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
+ const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+ const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ int i;
+ __m128i temp1, temp2, temp3, temp4;
+ temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
+ u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
+ u[10] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
+ u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
+ u[11] = temp2;
+
+ for (i = 16; i < 20; ++i) {
+ addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
+ addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
+ clamp_hi);
+ }
+
+ temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
+ u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
+ u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
+ u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
+ u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
+ u[36] = temp1;
+ u[37] = temp2;
+ u[38] = temp3;
+ u[39] = temp4;
+
+ temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
+ u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
+ u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
+ u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
+ u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
+}
+
+static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ int i;
+ __m128i temp1, temp2, temp3, temp4;
+ for (i = 0; i < 8; ++i) {
+ addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
+ }
+
+ temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
+ u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
+ u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
+ u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
+ u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
+ u[20] = temp1;
+ u[21] = temp2;
+ u[22] = temp3;
+ u[23] = temp4;
+ for (i = 32; i < 40; i++) {
+ addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {
+ addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
+ }
+}
+
+static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ __m128i temp1, temp2, temp3, temp4;
+ for (int i = 0; i < 16; i++) {
+ addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
+ }
+
+ temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
+ u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
+ u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
+ u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
+ u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
+
+ temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
+ u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
+ u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
+ u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
+ u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
+ u[44] = temp1;
+ u[45] = temp2;
+ u[46] = temp3;
+ u[47] = temp4;
+}
+
+static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
+ int bd, int out_shift,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi) {
+ for (int i = 0; i < 32; i++) {
+ addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi);
+ }
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ for (int i = 0; i < 64; i += 4) {
+ round_shift_4x4(out + i, out_shift);
+ highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
+ 4);
+ }
+ }
+}
+
+static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+
+ {
+ __m128i x;
+
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+ x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
+
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ if (out_shift != 0) {
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ x = _mm_add_epi32(x, offset);
+ x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ }
+ }
+ x = _mm_max_epi32(x, clamp_lo);
+ x = _mm_min_epi32(x, clamp_hi);
+ out[0] = x;
+ out[1] = x;
+ out[2] = x;
+ out[3] = x;
+ out[4] = x;
+ out[5] = x;
+ out[6] = x;
+ out[7] = x;
+ out[8] = x;
+ out[9] = x;
+ out[10] = x;
+ out[11] = x;
+ out[12] = x;
+ out[13] = x;
+ out[14] = x;
+ out[15] = x;
+ out[16] = x;
+ out[17] = x;
+ out[18] = x;
+ out[19] = x;
+ out[20] = x;
+ out[21] = x;
+ out[22] = x;
+ out[23] = x;
+ out[24] = x;
+ out[25] = x;
+ out[26] = x;
+ out[27] = x;
+ out[28] = x;
+ out[29] = x;
+ out[30] = x;
+ out[31] = x;
+ out[32] = x;
+ out[33] = x;
+ out[34] = x;
+ out[35] = x;
+ out[36] = x;
+ out[37] = x;
+ out[38] = x;
+ out[39] = x;
+ out[40] = x;
+ out[41] = x;
+ out[42] = x;
+ out[43] = x;
+ out[44] = x;
+ out[45] = x;
+ out[46] = x;
+ out[47] = x;
+ out[48] = x;
+ out[49] = x;
+ out[50] = x;
+ out[51] = x;
+ out[52] = x;
+ out[53] = x;
+ out[54] = x;
+ out[55] = x;
+ out[56] = x;
+ out[57] = x;
+ out[58] = x;
+ out[59] = x;
+ out[60] = x;
+ out[61] = x;
+ out[62] = x;
+ out[63] = x;
+ }
+}
+
+static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+
+ {
+ __m128i u[64];
+
+ // stage 1
+ u[0] = in[0];
+ u[8] = in[4];
+ u[16] = in[2];
+ u[24] = in[6];
+ u[32] = in[1];
+ u[40] = in[5];
+ u[48] = in[3];
+ u[56] = in[7];
+
+ // stage 2
+ u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+
+ // stage 3
+ u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[62] = u[63];
+
+ // stage 4
+ __m128i temp1, temp2;
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ u[17] = u[16];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[30] = u[31];
+
+ temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = temp2;
+
+ temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[46] = temp2;
+
+ // stage 5
+ u[9] = u[8];
+ u[14] = u[15];
+
+ temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[22] = temp2;
+
+ u[35] = u[32];
+ u[34] = u[33];
+ u[36] = u[39];
+ u[37] = u[38];
+ u[43] = u[40];
+ u[42] = u[41];
+ u[44] = u[47];
+ u[45] = u[46];
+ u[51] = u[48];
+ u[50] = u[49];
+ u[52] = u[55];
+ u[53] = u[54];
+ u[59] = u[56];
+ u[58] = u[57];
+ u[60] = u[63];
+ u[61] = u[62];
+
+ // stage 6
+ temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[0] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = temp2;
+ u[19] = u[16];
+ u[18] = u[17];
+ u[20] = u[23];
+ u[21] = u[22];
+ u[27] = u[24];
+ u[26] = u[25];
+ u[28] = u[31];
+ u[29] = u[30];
+
+ temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = temp1;
+ temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[35] = temp2;
+ temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[36] = temp1;
+ temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[37] = temp2;
+ temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = temp1;
+ temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[43] = temp2;
+ temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[44] = temp1;
+ temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[45] = temp2;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ u[11] = u[8];
+ u[10] = u[9];
+ u[12] = u[15];
+ u[13] = u[14];
+
+ temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = temp1;
+ temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[19] = temp2;
+ temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[20] = temp1;
+ temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[21] = temp2;
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ u[7] = u[0];
+ u[6] = u[1];
+ u[5] = u[2];
+ u[4] = u[3];
+
+ idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+
+static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+ {
+ __m128i u[64];
+ __m128i tmp1, tmp2, tmp3, tmp4;
+ // stage 1
+ u[0] = in[0];
+ u[32] = in[1];
+ u[36] = in[9];
+ u[40] = in[5];
+ u[44] = in[13];
+ u[48] = in[3];
+ u[52] = in[11];
+ u[56] = in[7];
+ u[60] = in[15];
+ u[16] = in[2];
+ u[20] = in[10];
+ u[24] = in[6];
+ u[28] = in[14];
+ u[4] = in[8];
+ u[8] = in[4];
+ u[12] = in[12];
+
+ // stage 2
+ u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+ u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+ u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+ u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+ u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+ u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+ u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+ u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+ u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+
+ // stage 3
+ u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+ u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
+ u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
+ u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
+ u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[34] = u[35];
+ u[37] = u[36];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[42] = u[43];
+ u[45] = u[44];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[50] = u[51];
+ u[53] = u[52];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[58] = u[59];
+ u[61] = u[60];
+ u[62] = u[63];
+
+ // stage 4
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+ u[17] = u[16];
+ u[18] = u[19];
+ u[21] = u[20];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[26] = u[27];
+ u[29] = u[28];
+ u[30] = u[31];
+
+ tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = tmp1;
+ u[34] = tmp2;
+ u[37] = tmp3;
+ u[38] = tmp4;
+
+ tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = tmp1;
+ u[42] = tmp2;
+ u[45] = tmp3;
+ u[46] = tmp4;
+
+ // stage 5
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+
+ u[9] = u[8];
+ u[10] = u[11];
+ u[13] = u[12];
+ u[14] = u[15];
+
+ tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = tmp1;
+ u[18] = tmp2;
+ u[21] = tmp3;
+ u[22] = tmp4;
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[0] = tmp1;
+ u[5] = u[4];
+ u[6] = u[7];
+
+ tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = tmp1;
+ tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = tmp2;
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = tmp1;
+ u[35] = tmp2;
+ u[36] = tmp3;
+ u[37] = tmp4;
+
+ tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = tmp1;
+ u[43] = tmp2;
+ u[44] = tmp3;
+ u[45] = tmp4;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[5] = tmp1;
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = tmp1;
+ u[19] = tmp2;
+ u[20] = tmp3;
+ u[21] = tmp4;
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+
+static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+ const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+ {
+ __m128i u[64], v[64];
+
+ // stage 1
+ u[32] = in[1];
+ u[34] = in[17];
+ u[36] = in[9];
+ u[38] = in[25];
+ u[40] = in[5];
+ u[42] = in[21];
+ u[44] = in[13];
+ u[46] = in[29];
+ u[48] = in[3];
+ u[50] = in[19];
+ u[52] = in[11];
+ u[54] = in[27];
+ u[56] = in[7];
+ u[58] = in[23];
+ u[60] = in[15];
+ u[62] = in[31];
+
+ v[16] = in[2];
+ v[18] = in[18];
+ v[20] = in[10];
+ v[22] = in[26];
+ v[24] = in[6];
+ v[26] = in[22];
+ v[28] = in[14];
+ v[30] = in[30];
+
+ u[8] = in[4];
+ u[10] = in[20];
+ u[12] = in[12];
+ u[14] = in[28];
+
+ v[4] = in[8];
+ v[6] = in[24];
+
+ u[0] = in[0];
+ u[2] = in[16];
+
+ // stage 2
+ v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
+ v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
+ v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+ v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+ v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
+ v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
+ v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
+ v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
+ v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+ v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+ v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
+ v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
+ v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+ v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
+ v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
+ v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+ v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+ v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
+ v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
+ v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
+ v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
+ v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+ v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+ v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
+ v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
+ v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+
+ // stage 3
+ u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
+ u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
+ u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
+ u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
+ u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
+ u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
+ u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
+ u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
+ u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
+ u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
+ u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
+ u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
+ u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
+ u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);
+
+ for (i = 32; i < 64; i += 4) {
+ addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 4
+ v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+ v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+ v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+
+ for (i = 16; i < 32; i += 4) {
+ addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+
+ // stage 5
+ u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
+ u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
+
+ for (i = 8; i < 16; i += 4) {
+ addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 16; i < 32; i += 4) {
+ u[i + 0] = v[i + 0];
+ u[i + 3] = v[i + 3];
+ }
+
+ u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
+ u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
+ u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+ v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+
+ addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+ for (i = 8; i < 16; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 8) {
+ v[i + 0] = u[i + 0];
+ v[i + 1] = u[i + 1];
+ v[i + 6] = u[i + 6];
+ v[i + 7] = u[i + 7];
+ }
+
+ v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+
+ // stage 7
+ addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ u[4] = v[4];
+ u[7] = v[7];
+ u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+
+ addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ for (i = 16; i < 32; i += 8) {
+ u[i + 0] = v[i + 0];
+ u[i + 1] = v[i + 1];
+ u[i + 6] = v[i + 6];
+ u[i + 7] = v[i + 7];
+ }
+
+ u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
+ u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
+ u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[14] = u[14];
+ v[15] = u[15];
+
+ v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+
+ for (i = 16; i < 20; ++i) {
+ addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 36; ++i) {
+ v[i] = u[i];
+ v[i + 12] = u[i + 12];
+ v[i + 16] = u[i + 16];
+ v[i + 28] = u[i + 28];
+ }
+
+ v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
+ v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
+ v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
+ v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
+ v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
+ v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
+ v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
+ v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+
+ // stage 9
+ for (i = 0; i < 8; ++i) {
+ addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 16; i < 20; ++i) {
+ u[i] = v[i];
+ u[i + 12] = v[i + 12];
+ }
+
+ u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+
+ for (i = 32; i < 40; i++) {
+ addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {
+ addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
+ }
+
+ // stage 10
+ for (i = 0; i < 16; i++) {
+ addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 32; i < 40; i++) v[i] = u[i];
+
+ v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
+
+ for (i = 56; i < 64; i++) v[i] = u[i];
+
+ // stage 11
+ for (i = 0; i < 32; i++) {
+ addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
+ &clamp_hi);
+ }
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ for (i = 0; i < 64; i += 4) {
+ round_shift_4x4(out + i, out_shift);
+ highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out,
+ &clamp_hi_out, 4);
+ }
+ }
+ }
+}
+
+static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1;
+
+ // stage 0
+ // stage 1
+ bf1 = in[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ if (do_cols) {
+ bf1 = _mm_max_epi32(bf1, clamp_lo);
+ bf1 = _mm_min_epi32(bf1, clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ if (out_shift != 0) {
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ bf1 = _mm_add_epi32(bf1, offset);
+ bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
+ }
+ }
+
+ bf1 = _mm_max_epi32(bf1, clamp_lo);
+ bf1 = _mm_min_epi32(bf1, clamp_hi);
+ out[0] = bf1;
+ out[1] = bf1;
+ out[2] = bf1;
+ out[3] = bf1;
+ out[4] = bf1;
+ out[5] = bf1;
+ out[6] = bf1;
+ out[7] = bf1;
+ out[8] = bf1;
+ out[9] = bf1;
+ out[10] = bf1;
+ out[11] = bf1;
+ out[12] = bf1;
+ out[13] = bf1;
+ out[14] = bf1;
+ out[15] = bf1;
+ out[16] = bf1;
+ out[17] = bf1;
+ out[18] = bf1;
+ out[19] = bf1;
+ out[20] = bf1;
+ out[21] = bf1;
+ out[22] = bf1;
+ out[23] = bf1;
+ out[24] = bf1;
+ out[25] = bf1;
+ out[26] = bf1;
+ out[27] = bf1;
+ out[28] = bf1;
+ out[29] = bf1;
+ out[30] = bf1;
+ out[31] = bf1;
+}
+
+static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32];
+
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2
+ bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+ bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+ bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+
+ bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+ bf1[17] = bf1[16];
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4 :
+ bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+
+ bf1[9] = bf1[8];
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+}
+
+static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32];
+
+ // stage 0
+ // stage 1
+
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2
+ bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+ bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
+ bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
+ bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
+ bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
+ bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+ bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
+ bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
+ bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
+ bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
+ bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+ bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
+ bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
+ bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
+ bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
+ bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+
+ addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+ // stage 4
+ bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+ bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
+ bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);
+
+ addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
+ bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);
+
+ addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+ // stage 9
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+}
+
+static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32], bf0[32];
+
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
+
+ // stage 2
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = bf1[4];
+ bf0[5] = bf1[5];
+ bf0[6] = bf1[6];
+ bf0[7] = bf1[7];
+ bf0[8] = bf1[8];
+ bf0[9] = bf1[9];
+ bf0[10] = bf1[10];
+ bf0[11] = bf1[11];
+ bf0[12] = bf1[12];
+ bf0[13] = bf1[13];
+ bf0[14] = bf1[14];
+ bf0[15] = bf1[15];
+ bf0[16] =
+ half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+ bf0[17] =
+ half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+ bf0[31] =
+ half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
+
+ // stage 3
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] =
+ half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+ bf1[9] =
+ half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+ bf1[15] =
+ half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
+
+ addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] =
+ half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+ bf0[5] =
+ half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+ bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
+
+ addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+ bf0[16] = bf1[16];
+ bf0[17] =
+ half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] =
+ half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] =
+ half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] =
+ half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+ bf0[31] = bf1[31];
+
+ // stage 5
+ bf1[0] =
+ half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+ bf1[1] =
+ half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+ bf1[2] =
+ half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+ bf1[3] =
+ half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+ addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] =
+ half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] =
+ half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+ bf1[15] = bf0[15];
+ addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+ bf0[4] = bf1[4];
+ bf0[5] =
+ half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[7] = bf1[7];
+ addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] =
+ half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] =
+ half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7
+ addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] =
+ half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+ // stage 8
+ addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] =
+ half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9
+ addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ round_shift_8x8(out + 16, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+ }
+}
+
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ case IDTX:
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ default:
+ av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ }
+}
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ int eob = txfm_param->eob;
+ int bd = txfm_param->bd;
+ int lossless = txfm_param->lossless;
+ const int32_t *src = cast_to_int32(input);
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+ return;
+ }
+ av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+}
+static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ for (int i = 0; i < 32; i += 16) {
+ out[i] = _mm_slli_epi32(in[i], 2);
+ out[i + 1] = _mm_slli_epi32(in[i + 1], 2);
+ out[i + 2] = _mm_slli_epi32(in[i + 2], 2);
+ out[i + 3] = _mm_slli_epi32(in[i + 3], 2);
+ out[i + 4] = _mm_slli_epi32(in[i + 4], 2);
+ out[i + 5] = _mm_slli_epi32(in[i + 5], 2);
+ out[i + 6] = _mm_slli_epi32(in[i + 6], 2);
+ out[i + 7] = _mm_slli_epi32(in[i + 7], 2);
+ out[i + 8] = _mm_slli_epi32(in[i + 8], 2);
+ out[i + 9] = _mm_slli_epi32(in[i + 9], 2);
+ out[i + 10] = _mm_slli_epi32(in[i + 10], 2);
+ out[i + 11] = _mm_slli_epi32(in[i + 11], 2);
+ out[i + 12] = _mm_slli_epi32(in[i + 12], 2);
+ out[i + 13] = _mm_slli_epi32(in[i + 13], 2);
+ out[i + 14] = _mm_slli_epi32(in[i + 14], 2);
+ out[i + 15] = _mm_slli_epi32(in[i + 15], 2);
+ }
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ round_shift_8x8(out + 16, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+ }
+}
+static const transform_1d_sse4_1
+ highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { idct4x4_sse4_1, NULL, NULL, NULL },
+ { iadst4x4_sse4_1, NULL, NULL, NULL },
+ { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
+ },
+ { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
+ { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
+ { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
+ {
+ { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
+ NULL },
+ { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
+ NULL },
+ { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
+ },
+ { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
+ idct32x32_sse4_1 },
+ { NULL, NULL, NULL, NULL },
+ { iidentity32_sse4_1, NULL, NULL, NULL } },
+ { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
+ idct64x64_sse4_1 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w = AOMMIN(32, txfm_size_col);
+ const int buf_size_w_div4 = buf_size_w >> 2;
+ const int buf_size_h_div8 = (eoby + 8) >> 3;
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int input_stride = row_max;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
+ __m128i buf0[16];
+ load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0,
+ NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ _buf1[j * txfm_size_row + 0] = buf0_cur[0];
+ _buf1[j * txfm_size_row + 1] = buf0_cur[1];
+ _buf1[j * txfm_size_row + 2] = buf0_cur[2];
+ _buf1[j * txfm_size_row + 3] = buf0_cur[3];
+ }
+ }
+ for (int i = 0; i < buf_size_w_div4; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+ bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
+ stride, ud_flip, txfm_size_row, bd);
+ }
+}
+static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2;
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int input_stride = row_max;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < (row_max >> 2); ++i) {
+ __m128i buf0[16];
+ load_buffer_32bit_input(input + i * 4, input_stride, buf0,
+ buf_size_nonzero_w);
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0,
+ NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ for (int i = 0; i < buf_size_w_div4; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+ bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
+static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ (void)eob;
+ __m128i buf1[64 * 4];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int input_stride = row_max;
+ const int buf_size_w = AOMMIN(32, txfm_size_col);
+ const int buf_size_w_div4 = buf_size_w >> 2;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+ for (int i = 0; i < (row_max >> 2); ++i) {
+ __m128i buf0[32];
+ load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0,
+ NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ _buf1[j * txfm_size_row + 0] = buf0_cur[0];
+ _buf1[j * txfm_size_row + 1] = buf0_cur[1];
+ _buf1[j * txfm_size_row + 2] = buf0_cur[2];
+ _buf1[j * txfm_size_row + 3] = buf0_cur[3];
+ }
+ }
+ for (int i = 0; i < buf_size_w_div4; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+ bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, 0, txfm_size_row,
+ bd);
+ }
+ }
+}
+static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64 * 16];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div4 = txfm_size_col >> 2;
+ const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: column transform
+ for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
+ __m128i buf0[64];
+ load_buffer_32bit_input(input + i * 4, input_stride, buf0,
+ buf_size_nonzero_w);
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0,
+ NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div4; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+ bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
+
+static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ (void)eob;
+ __m128i buf1[8];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
+ const int input_stride = AOMMIN(32, txfm_size_row);
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: column transform
+ __m128i buf0[8];
+ load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col);
+ load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col);
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
+ NewInvSqrt2);
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+ row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]);
+
+ if (lr_flip) {
+ TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
+ buf1[3]);
+
+ TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
+ buf1[7]);
+ } else {
+ TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
+ buf1[3]);
+
+ TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
+ buf1[7]);
+ }
+
+ // 2nd stage: column transform
+ col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
+
+ // write to buffer
+ highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
+ bd);
+}
+
+static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ (void)eob;
+ __m128i buf1[8];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: column transform
+ __m128i buf0[8];
+ const int32_t *input_row = input;
+ load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
+
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_col, 0,
+ NewInvSqrt2);
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ __m128i *buf1_ptr;
+ if (lr_flip) {
+ flip_buf_sse2(buf0, buf1, txfm_size_col);
+ buf1_ptr = buf1;
+ } else {
+ buf1_ptr = buf0;
+ }
+
+ // 2nd stage: column transform
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf1_cur = buf1_ptr + i * txfm_size_row;
+ transpose_32bit_4x4(buf1_cur, buf1_cur);
+ col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
+ }
+ av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+ // write to buffer
+ highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip,
+ txfm_size_row, bd);
+}
+
+static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ (void)eob;
+ __m128i buf1[16];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_h_div8 = txfm_size_row >> 2;
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
+ const int input_stride = AOMMIN(32, txfm_size_row);
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: column transform
+ __m128i buf0[16];
+ for (int i = 0; i < (txfm_size_row >> 2); i++) {
+ const int32_t *input_row = input + i * 4;
+ __m128i *buf0_cur = buf0 + i * 4;
+ load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col);
+ row_txfm(buf0_cur, buf0_cur, INV_COS_BIT, 0, bd, -shift[0]);
+ }
+
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_h_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
+ buf1[4 * j + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_h_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
+ buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
+ buf1[4 * j + 2], buf1[4 * j + 3]);
+ }
+ }
+
+ // 2nd stage: column transform
+ col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
+
+ // write to buffer
+ highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
+ bd);
+}
+
+static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ (void)eob;
+ __m128i buf1[16];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 2;
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: column transform
+ __m128i buf0[16];
+ const int32_t *input_row = input;
+ load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
+
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ __m128i *buf1_ptr;
+ if (lr_flip) {
+ flip_buf_sse2(buf0, buf1, txfm_size_col);
+ buf1_ptr = buf1;
+ } else {
+ buf1_ptr = buf0;
+ }
+
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ __m128i *buf1_cur = buf1_ptr + i * txfm_size_row;
+ transpose_32bit_4x4(buf1_cur, buf1_cur);
+ col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
+ }
+ av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+
+ // write to buffer
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+}
+
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ highbd_inv_txfm2d_add_no_identity_sse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ highbd_inv_txfm2d_add_h_identity_ssse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ highbd_inv_txfm2d_add_v_identity_ssse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ case IDTX:
+ highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
+ stride, tx_type, tx_size, eob, bd);
+ break;
+ default: assert(0); break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ int eob = txfm_param->eob;
+ highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
+}
+
+void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ int eob = txfm_param->eob;
+ highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
+}
+
+void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ int eob = txfm_param->eob;
+ highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ int eob = txfm_param->eob;
+ highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
+}
+
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) {
+ case TX_8X8:
+ av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X4:
+ av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X4:
+ av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X16:
+ av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ default:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(
+ input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob,
+ txfm_param->bd);
+ break;
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
new file mode 100644
index 0000000000..6dcac10e45
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -0,0 +1,849 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_dist_wtd_convolve_2d_copy_avx2(const uint16_t *src,
+ int src_stride, uint16_t *dst0,
+ int dst_stride0, int w, int h,
+ ConvolveParams *conv_params,
+ int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+ int i, j;
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const __m256i offset_const_16b = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits <= 4);
+
+ if (!(w % 16)) {
+ for (i = 0; i < h; i += 1) {
+ for (j = 0; j < w; j += 16) {
+ const __m256i src_16bit =
+ _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));
+
+ const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
+
+ if (do_average) {
+ const __m256i data_0 =
+ _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero);
+
+ const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b_lo, offset_const);
+
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
+ const __m256i res_unsigned_hi =
+ _mm256_add_epi32(res_32b_hi, offset_const);
+
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo = highbd_convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result_hi = highbd_convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+
+ _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ }
+ }
+ }
+ } else if (!(w % 4)) {
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i src_row_0 =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+ const __m128i src_row_1 =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
+ // since not all compilers yet support _mm256_set_m128i()
+ const __m256i src_10 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_0), src_row_1, 1);
+
+ const __m256i res = _mm256_sll_epi16(src_10, left_shift);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i res_32b = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b, offset_const);
+
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b_lo, offset_const);
+
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
+ const __m256i res_unsigned_hi =
+ _mm256_add_epi32(res_32b_hi, offset_const);
+
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_2d_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m256i s[8], coeffs_y[4], coeffs_x[4];
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const __m256i round_const_x = _mm256_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m256i round_const_y = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 = _mm256_setzero_si256();
+ if (i + 1 < im_h)
+ row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm256_unpackhi_epi16(s0, s1);
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_a_round, offset_const);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_x_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_1;
+
+ int i, j;
+ __m256i s[4], coeffs_x[4];
+
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const __m256i round_const_x =
+ _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits >= 0);
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ res_even = _mm256_sll_epi32(res_even, round_shift_bits);
+ res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);
+
+ __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);
+
+ __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);
+ __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo = highbd_convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result_hi = highbd_convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_y_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS - conv_params->round_0;
+
+ assert(bits >= 0);
+ int i, j;
+ __m256i s[8], coeffs_y[4];
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i round_const_y =
+ _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m256i src6;
+ __m256i s01 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+ __m256i s12 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+ __m256i s23 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+ __m256i s34 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+ __m256i s45 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ __m256i s56 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi16(s01, s12);
+ s[1] = _mm256_unpacklo_epi16(s23, s34);
+ s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+ s[4] = _mm256_unpackhi_epi16(s01, s12);
+ s[5] = _mm256_unpackhi_epi16(s23, s34);
+ s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ const __m256i s67 = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+
+ const __m256i s78 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi16(s67, s78);
+ s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+
+ __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
+ res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
+ res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
new file mode 100644
index 0000000000..5a7fc536a2
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+
+void av1_highbd_dist_wtd_convolve_y_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS - conv_params->round_0;
+
+ assert(bits >= 0);
+ int i, j;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i round_const_y =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i s[16], coeffs_y[4];
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits);
+ res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y),
+ round_shift_y);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits);
+ res_a_round1 = _mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y),
+ round_shift_y);
+
+ __m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const);
+ __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i comp_avg_res_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_1 =
+ highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b_0 =
+ _mm_packus_epi32(round_result_0, round_result_0);
+ const __m128i res_clip_0 =
+ _mm_min_epi16(res_16b_0, clip_pixel_to_bd);
+ const __m128i res_16b_1 =
+ _mm_packus_epi32(round_result_1, round_result_1);
+ const __m128i res_clip_1 =
+ _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]),
+ res_clip_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_clip_1);
+
+ } else {
+ __m128i res_16b_0 =
+ _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0);
+
+ __m128i res_16b_1 =
+ _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16b_1);
+ }
+ } else {
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits);
+ res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round0, round_const_y), round_shift_y);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits);
+ res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round1, round_const_y), round_shift_y);
+
+ __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const);
+ __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const);
+
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+ const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero);
+ const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero);
+
+ const __m128i comp_avg_res_lo_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0,
+ &wt0, &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_lo_1 =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1,
+ &wt0, &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0,
+ &wt0, &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi_1 =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1,
+ &wt0, &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_lo_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_lo_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b_0 =
+ _mm_packus_epi32(round_result_lo_0, round_result_hi_0);
+ const __m128i res_clip_0 =
+ _mm_min_epi16(res_16b_0, clip_pixel_to_bd);
+
+ const __m128i res_16b_1 =
+ _mm_packus_epi32(round_result_lo_1, round_result_hi_1);
+ const __m128i res_clip_1 =
+ _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
+ res_clip_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_clip_1);
+ } else {
+ __m128i res_16bit0 =
+ _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0);
+ __m128i res_16bit1 =
+ _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_16bit1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;
+ }
+ }
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_x_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_1;
+
+ int i, j;
+ __m128i s[4], coeffs_x[4];
+
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i round_const_x =
+ _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits >= 0);
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+ res_even = _mm_sll_epi32(res_even, round_shift_bits);
+ res_odd = _mm_sll_epi32(res_odd, round_shift_bits);
+
+ __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const);
+ if (w - j < 8) {
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+
+ const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
+ const __m128i round_result = highbd_convolve_rounding_sse2(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b = _mm_packus_epi32(round_result, round_result);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b);
+ }
+ } else {
+ __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd);
+ __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const);
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
+
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
new file mode 100644
index 0000000000..5734810f52
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+ do { \
+ __m128i u0, u1, u2, u3; \
+ u0 = _mm_unpacklo_epi32(x0, x1); \
+ u1 = _mm_unpackhi_epi32(x0, x1); \
+ u2 = _mm_unpacklo_epi32(x2, x3); \
+ u3 = _mm_unpackhi_epi32(x2, x3); \
+ y0 = _mm_unpacklo_epi64(u0, u2); \
+ y1 = _mm_unpackhi_epi64(u0, u2); \
+ y2 = _mm_unpacklo_epi64(u1, u3); \
+ y3 = _mm_unpackhi_epi64(u1, u3); \
+ } while (0)
+
+static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
+ TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
+ TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
+ TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
+ TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
+ out[15]);
+}
+
+static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
+ // Upper left 8x8
+ TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
+ TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
+ out[28]);
+ TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
+ out[13]);
+ TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
+ out[29]);
+
+ // Upper right 8x8
+ TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
+ out[44]);
+ TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
+ out[60]);
+ TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
+ out[45]);
+ TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
+ out[61]);
+
+ // Lower left 8x8
+ TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
+ out[14]);
+ TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
+ out[30]);
+ TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
+ out[15]);
+ TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
+ out[31]);
+ // Lower right 8x8
+ TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
+ out[46]);
+ TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
+ out[62]);
+ TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
+ out[47]);
+ TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
+ out[63]);
+}
+
+static INLINE void transpose_8nx8n(const __m128i *input, __m128i *output,
+ const int width, const int height) {
+ const int numcol = height >> 2;
+ const int numrow = width >> 2;
+ for (int j = 0; j < numrow; j++) {
+ for (int i = 0; i < numcol; i++) {
+ TRANSPOSE_4X4(input[i * width + j + (numrow * 0)],
+ input[i * width + j + (numrow * 1)],
+ input[i * width + j + (numrow * 2)],
+ input[i * width + j + (numrow * 3)],
+ output[j * height + i + (numcol * 0)],
+ output[j * height + i + (numcol * 1)],
+ output[j * height + i + (numcol * 2)],
+ output[j * height + i + (numcol * 3)]);
+ }
+ }
+}
+
+// Note:
+// rounding = 1 << (bit - 1)
+static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
+ const __m128i *w1, const __m128i *n1,
+ const __m128i *rounding, int bit) {
+ __m128i x, y;
+
+ x = _mm_mullo_epi32(*w0, *n0);
+ y = _mm_mullo_epi32(*w1, *n1);
+ x = _mm_add_epi32(x, y);
+ x = _mm_add_epi32(x, *rounding);
+ x = _mm_srai_epi32(x, bit);
+ return x;
+}
+
+static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,
+ const __m128i *rounding, int bit) {
+ __m128i x;
+
+ x = _mm_mullo_epi32(*w0, *n0);
+ x = _mm_add_epi32(x, *rounding);
+ x = _mm_srai_epi32(x, bit);
+ return x;
+}
+
+typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift);
+
+typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+ const int num_cols);
+
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd);
+
+#endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c b/third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c
new file mode 100644
index 0000000000..75108b49da
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref,
+ int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int bd,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ __m256i tmp[15];
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ (void)max_bits_horiz;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
+ const __m256i reduce_bits_vert_const =
+ _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
+ const __m256i res_sub_const =
+ _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+ __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+
+ __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1));
+ __m256i v_zeros = _mm256_setzero_si256();
+ int ohoriz = 1 << offset_bits_horiz;
+ int mhoriz = 1 << max_bits_horiz;
+ (void)mhoriz;
+ int sx;
+
+ for (int i = 0; i < p_height; i += 8) {
+ for (int j = 0; j < p_width; j += 8) {
+ // Calculate the center of this 8x8 block,
+ // project to luma coordinates (if in a subsampled chroma plane),
+ // apply the affine transformation,
+ // then convert back to the original coordinates (if necessary)
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ const int16_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ const int16_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ if (ix4 <= -7) {
+ for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16(
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))));
+ }
+ } else if (ix4 >= width + 6) {
+ for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm256_cvtepi16_epi32(
+ _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz))));
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ int32_t tmp1[8];
+ for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ const int iy = clamp(iy4 + k, 0, height - 1);
+
+ sx = sx4 + beta * (k + 4);
+ for (int l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;
+ const int offs = sx >> WARPEDDIFF_PREC_BITS;
+ const int16_t *coeffs = av1_warped_filter[offs];
+
+ int32_t sum = 1 << offset_bits_horiz;
+ for (int m = 0; m < 8; ++m) {
+ const int sample_x = clamp(ix + m, 0, width - 1);
+ sum += ref[iy * stride + sample_x] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
+ tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum;
+ sx += alpha;
+ }
+ tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1);
+ }
+ } else {
+ if (beta == 0 && alpha == 0) {
+ sx = sx4;
+ __m128i v_01 = _mm_loadu_si128(
+ (__m128i *)
+ av1_warped_filter[sx >>
+ WARPEDDIFF_PREC_BITS]); // A7A6A5A4A3A2A1A0
+ __m256i v_c01 = _mm256_broadcastd_epi32(v_01); // A1A0A1A0A1A0A1A0
+ __m256i v_c23 = _mm256_broadcastd_epi32(
+ _mm_shuffle_epi32(v_01, 1)); // A3A2A3A2A3A2A3A2
+ __m256i v_c45 = _mm256_broadcastd_epi32(
+ _mm_shuffle_epi32(v_01, 2)); // A5A4A5A4A5A4A5A4
+ __m256i v_c67 = _mm256_broadcastd_epi32(
+ _mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6
+ for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ iy = iy * stride;
+
+ __m256i v_refl = _mm256_inserti128_si256(
+ _mm256_setzero_si256(),
+ _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
+ v_refl = _mm256_inserti128_si256(
+ v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
+ 1); // R15 .. R0
+
+ __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+ __m256i v_refu =
+ _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1
+ v_refl = _mm256_inserti128_si256(
+ v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+ v_refu = _mm256_inserti128_si256(
+ v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+ __m256i v_sum = _mm256_set1_epi32(ohoriz);
+ __m256i parsum = _mm256_madd_epi16(
+ v_c01, _mm256_alignr_epi8(v_refu, v_refl,
+ 0)); // R8R7R6..R1R7R6R5..R1R0
+ __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
+
+ parsum = _mm256_madd_epi16(
+ v_c23,
+ _mm256_alignr_epi8(v_refu, v_refl, 4)); // R10R9..R3R9R8..R3R2
+ __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
+ parsum = _mm256_madd_epi16(
+ v_c45, _mm256_alignr_epi8(v_refu, v_refl,
+ 8)); // R12R11..R5R11R10..R5R4
+ __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
+ parsum = _mm256_madd_epi16(
+ v_c67, _mm256_alignr_epi8(v_refu, v_refl,
+ 12)); // R14R13..R7R13R12..R7R6
+ __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
+
+ tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
+ reduce_bits_horiz);
+ }
+ } else if (alpha == 0) {
+ for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ iy = iy * stride;
+
+ sx = sx4 + beta * (k + 4);
+
+ __m128i v_01 = _mm_loadu_si128(
+ (__m128i *)av1_warped_filter
+ [sx >> WARPEDDIFF_PREC_BITS]); // A7A6A5A4A3A2A1A0
+ __m256i v_c01 = _mm256_broadcastd_epi32(v_01); // A1A0A1A0A1A0A1A0
+ __m256i v_c23 = _mm256_broadcastd_epi32(
+ _mm_shuffle_epi32(v_01, 1)); // A3A2A3A2A3A2A3A2
+ __m256i v_c45 = _mm256_broadcastd_epi32(
+ _mm_shuffle_epi32(v_01, 2)); // A5A4A5A4A5A4A5A4
+ __m256i v_c67 = _mm256_broadcastd_epi32(
+ _mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6
+
+ __m256i v_refl = _mm256_inserti128_si256(
+ _mm256_setzero_si256(),
+ _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
+ v_refl = _mm256_inserti128_si256(
+ v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
+ 1); // R15 .. R0
+
+ __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+ __m256i v_refu =
+ _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1
+
+ v_refl = _mm256_inserti128_si256(
+ v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+ v_refu = _mm256_inserti128_si256(
+ v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+ __m256i v_sum = _mm256_set1_epi32(ohoriz);
+ __m256i parsum =
+ _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
+ __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
+
+ parsum =
+ _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
+ __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
+ parsum =
+ _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
+ __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
+ parsum = _mm256_madd_epi16(v_c67,
+ _mm256_alignr_epi8(v_refu, v_refl, 12));
+ __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
+
+ tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
+ reduce_bits_horiz);
+ }
+ } else if (beta == 0) {
+ sx = sx4;
+ __m256i v_coeff01 = _mm256_inserti128_si256(
+ v_zeros,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
+ 0);
+ v_coeff01 = _mm256_inserti128_si256(
+ v_coeff01,
+ _mm_loadu_si128(
+ (__m128i *)
+ av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]),
+ 1); // B7B6..B1B0A7A6..A1A0
+ __m256i v_coeff23 = _mm256_inserti128_si256(
+ v_zeros,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 0);
+ v_coeff23 = _mm256_inserti128_si256(
+ v_coeff23,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 1); // D7D6..D1D0C7C6..C1C0
+ __m256i v_coeff45 = _mm256_inserti128_si256(
+ v_zeros,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 0);
+ v_coeff45 = _mm256_inserti128_si256(
+ v_coeff45,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 1); // F7F6..F1F0E7E6..E1E0
+ __m256i v_coeff67 = _mm256_inserti128_si256(
+ v_zeros,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 0);
+ v_coeff67 = _mm256_inserti128_si256(
+ v_coeff67,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 1); // H7H6..H1H0G7G6..G1G0
+
+ __m256i v_c0123 = _mm256_unpacklo_epi32(
+ v_coeff01,
+ v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
+ __m256i v_c0123u = _mm256_unpackhi_epi32(
+ v_coeff01,
+ v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
+ __m256i v_c4567 = _mm256_unpacklo_epi32(
+ v_coeff45,
+ v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
+ __m256i v_c4567u = _mm256_unpackhi_epi32(
+ v_coeff45,
+ v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
+
+ __m256i v_c01 = _mm256_unpacklo_epi64(
+ v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
+ __m256i v_c23 =
+ _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2
+ __m256i v_c45 =
+ _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4
+ __m256i v_c67 =
+ _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... A7A6
+
+ for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ iy = iy * stride;
+
+ __m256i v_refl = _mm256_inserti128_si256(
+ _mm256_setzero_si256(),
+ _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
+ v_refl = _mm256_inserti128_si256(
+ v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
+ 1); // R15 .. R0
+
+ __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+ __m256i v_refu =
+ _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1
+
+ v_refl = _mm256_inserti128_si256(
+ v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+ v_refu = _mm256_inserti128_si256(
+ v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+ __m256i v_sum = _mm256_set1_epi32(ohoriz);
+ __m256i parsum = _mm256_madd_epi16(
+ v_c01, _mm256_alignr_epi8(v_refu, v_refl,
+ 0)); // R8R7R6..R1R7R6R5..R1R0
+ __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
+
+ parsum = _mm256_madd_epi16(
+ v_c23,
+ _mm256_alignr_epi8(v_refu, v_refl, 4)); // R10R9..R3R9R8..R3R2
+ __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
+ parsum = _mm256_madd_epi16(
+ v_c45, _mm256_alignr_epi8(v_refu, v_refl,
+ 8)); // R12R11..R5R11R10..R5R4
+ __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
+ parsum = _mm256_madd_epi16(
+ v_c67, _mm256_alignr_epi8(v_refu, v_refl,
+ 12)); // R14R13..R7R13R12..R7R6
+ __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
+
+ tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
+ reduce_bits_horiz);
+ }
+
+ } else {
+ for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ iy = iy * stride;
+
+ sx = sx4 + beta * (k + 4);
+
+ __m256i v_coeff01 = _mm256_inserti128_si256(
+ v_zeros,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
+ 0);
+ v_coeff01 = _mm256_inserti128_si256(
+ v_coeff01,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 1); // B7B6..B1B0A7A6..A1A0
+ __m256i v_coeff23 = _mm256_inserti128_si256(
+ v_zeros,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 0);
+ v_coeff23 = _mm256_inserti128_si256(
+ v_coeff23,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 1); // D7D6..D1D0C7C6..C1C0
+ __m256i v_coeff45 = _mm256_inserti128_si256(
+ v_zeros,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 0);
+ v_coeff45 = _mm256_inserti128_si256(
+ v_coeff45,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 1); // F7F6..F1F0E7E6..E1E0
+ __m256i v_coeff67 = _mm256_inserti128_si256(
+ v_zeros,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 0);
+ v_coeff67 = _mm256_inserti128_si256(
+ v_coeff67,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
+ WARPEDDIFF_PREC_BITS]),
+ 1); // H7H6..H1H0G7G6..G1G0
+
+ __m256i v_c0123 = _mm256_unpacklo_epi32(
+ v_coeff01,
+ v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
+ __m256i v_c0123u = _mm256_unpackhi_epi32(
+ v_coeff01,
+ v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
+ __m256i v_c4567 = _mm256_unpacklo_epi32(
+ v_coeff45,
+ v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
+ __m256i v_c4567u = _mm256_unpackhi_epi32(
+ v_coeff45,
+ v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
+
+ __m256i v_c01 = _mm256_unpacklo_epi64(
+ v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
+ __m256i v_c23 =
+ _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2
+ __m256i v_c45 =
+ _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4
+ __m256i v_c67 =
+ _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... A7A6
+
+ __m256i v_refl = _mm256_inserti128_si256(
+ _mm256_setzero_si256(),
+ _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
+ v_refl = _mm256_inserti128_si256(
+ v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
+ 1); // R15 .. R0
+
+ __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+ __m256i v_refu =
+ _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1
+
+ v_refl = _mm256_inserti128_si256(
+ v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+ v_refu = _mm256_inserti128_si256(
+ v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+ __m256i v_sum = _mm256_set1_epi32(ohoriz);
+ __m256i parsum =
+ _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
+ __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
+
+ parsum =
+ _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
+ __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
+ parsum =
+ _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
+ __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
+ parsum = _mm256_madd_epi16(v_c67,
+ _mm256_alignr_epi8(v_refu, v_refl, 12));
+ __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
+
+ tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
+ reduce_bits_horiz);
+ }
+ }
+ }
+
+ // Vertical filter
+ for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+ const __m256i *src = tmp + (k + 4);
+
+ __m256i v_coeff01 = _mm256_inserti128_si256(
+ v_zeros,
+ _mm_loadu_si128(
+ (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]),
+ 0);
+ v_coeff01 = _mm256_inserti128_si256(
+ v_coeff01,
+ _mm_loadu_si128(
+ (__m128i *)
+ av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]),
+ 1);
+ __m256i v_coeff23 = _mm256_inserti128_si256(
+ v_zeros,
+ _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 2 * gamma) >>
+ WARPEDDIFF_PREC_BITS]),
+ 0);
+ v_coeff23 = _mm256_inserti128_si256(
+ v_coeff23,
+ _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 3 * gamma) >>
+ WARPEDDIFF_PREC_BITS]),
+ 1);
+ __m256i v_coeff45 = _mm256_inserti128_si256(
+ v_zeros,
+ _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 4 * gamma) >>
+ WARPEDDIFF_PREC_BITS]),
+ 0);
+ v_coeff45 = _mm256_inserti128_si256(
+ v_coeff45,
+ _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 5 * gamma) >>
+ WARPEDDIFF_PREC_BITS]),
+ 1);
+ __m256i v_coeff67 = _mm256_inserti128_si256(
+ v_zeros,
+ _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 6 * gamma) >>
+ WARPEDDIFF_PREC_BITS]),
+ 0);
+ v_coeff67 = _mm256_inserti128_si256(
+ v_coeff67,
+ _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 7 * gamma) >>
+ WARPEDDIFF_PREC_BITS]),
+ 1);
+
+ __m256i v_c0123 = _mm256_unpacklo_epi32(
+ v_coeff01,
+ v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
+ __m256i v_c0123u = _mm256_unpackhi_epi32(
+ v_coeff01,
+ v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
+ __m256i v_c4567 = _mm256_unpacklo_epi32(
+ v_coeff45,
+ v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
+ __m256i v_c4567u = _mm256_unpackhi_epi32(
+ v_coeff45,
+ v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
+
+ __m256i v_c01 = _mm256_unpacklo_epi64(
+ v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
+ __m256i v_c23 =
+ _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2
+ __m256i v_c45 =
+ _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4
+ __m256i v_c67 =
+ _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... A7A6
+
+ __m256i v_src01l =
+ _mm256_unpacklo_epi32(src[0], src[1]); // T13T03T11T01T12T02T10T00
+ __m256i v_src01u =
+ _mm256_unpackhi_epi32(src[0], src[1]); // T17T07T15T05T16T06T14T04
+ __m256i v_sum =
+ _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u),
+ v_c01); // S7S5S3S1S6S4S2S0
+
+ __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]);
+ __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]);
+ v_sum = _mm256_add_epi32(
+ v_sum,
+ _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u), v_c23));
+
+ __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]);
+ __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]);
+ v_sum = _mm256_add_epi32(
+ v_sum,
+ _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u), v_c45));
+
+ __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]);
+ __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]);
+ v_sum = _mm256_add_epi32(
+ v_sum,
+ _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u), v_c67));
+
+ // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0
+
+ __m256i v_suml =
+ _mm256_permute4x64_epi64(v_sum, 0xD8); // S7S5S6S4S3S1S2S0
+ __m256i v_sumh =
+ _mm256_permute4x64_epi64(v_sum, 0x32); // S2S0S7S5S2S0S3S1
+ v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh); // S7S6S5S4S3S2S1S0
+
+ if (conv_params->is_compound) {
+ __m128i *const p =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j];
+
+ v_sum = _mm256_add_epi32(v_sum, res_add_const);
+ v_sum =
+ _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+ if (conv_params->do_average) {
+ __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p));
+
+ if (conv_params->use_dist_wtd_comp_avg) {
+ v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0),
+ _mm256_mullo_epi32(v_sum, wt1));
+ v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS);
+ } else {
+ v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1);
+ }
+
+ __m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const);
+ v_sum1 = _mm256_sra_epi32(
+ _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift);
+
+ __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1);
+ v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8);
+ v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel);
+ _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0));
+ } else {
+ v_sum = _mm256_packus_epi32(v_sum, v_sum);
+ __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8);
+ _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
+ }
+ } else {
+ // Round and pack into 8 bits
+ const __m256i round_const =
+ _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+
+ __m256i v_sum1 = _mm256_srai_epi32(
+ _mm256_add_epi32(v_sum, round_const), reduce_bits_vert);
+
+ v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1);
+ __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8);
+ // Clamp res_16bit to the range [0, 2^bd - 1]
+ const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1);
+ const __m256i zero = _mm256_setzero_si256();
+ v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero);
+
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
new file mode 100644
index 0000000000..96fb4cf632
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -0,0 +1,636 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4, 6, 8, 10,
+ 12, 14, 1, 3, 5, 7,
+ 9, 11, 13, 15 };
+
+static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
+ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9,
+ 10, 11, 8, 9, 10, 11,
+ 8, 9, 10, 11 };
+static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13,
+ 14, 15, 12, 13, 14, 15,
+ 12, 13, 14, 15 };
+
+static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
+ __m128i *coeff) {
+ // Filter even-index pixels
+ const __m128i tmp_0 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
+ coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
+ coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
+ coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
+ coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ // Filter odd-index pixels
+ const __m128i tmp_1 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
+ int sx, __m128i *coeff) {
+ // Filter coeff
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+ coeff[0] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
+ coeff[2] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
+ coeff[4] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
+ coeff[6] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));
+
+ coeff[1] = coeff[0];
+ coeff[3] = coeff[2];
+ coeff[5] = coeff[4];
+ coeff[7] = coeff[6];
+}
+
+static INLINE void highbd_filter_src_pixels(
+ const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
+ const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
+ const __m128i src_1 = *src;
+ const __m128i src2_1 = *src2;
+
+ const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
+ ((1 << reduce_bits_horiz) >> 1));
+
+ const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);
+
+ __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
+ _mm_cvtsi32_si128(reduce_bits_horiz));
+
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);
+
+ __m128i res_odd =
+ _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
+ _mm_cvtsi32_si128(reduce_bits_horiz));
+
+ // Combine results into one register.
+ // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
+ // as this order helps with the vertical filter.
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
+}
+
+static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
+ __m128i *tmp, int sx, int alpha, int k,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ (void)alpha;
+ int k;
+
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)alpha;
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter_beta0(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ int k;
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+}
+
+static INLINE void highbd_prepare_warp_horizontal_filter(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ if (alpha == 0 && beta == 0)
+ highbd_warp_horizontal_filter_alpha0_beta0(
+ ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+
+ else if (alpha == 0 && beta != 0)
+ highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+
+ else if (alpha != 0 && beta == 0)
+ highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ else
+ highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+}
+
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
+ int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int bd,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ __m128i tmp[15];
+ int i, j, k;
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+ assert(!(bd == 12 && reduce_bits_horiz < 5));
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
+ const __m128i reduce_bits_vert_const =
+ _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const __m128i res_sub_const =
+ _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+ __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (ix4 >= width + 6) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] =
+ _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ const __m128i src_01 = _mm_shuffle_epi8(
+ src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
+ const __m128i src2_01 = _mm_shuffle_epi8(
+ src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
+
+ __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
+ __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);
+
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
+ src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
+ }
+
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
+ src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
+ }
+
+ const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
+ const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);
+
+ highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ } else {
+ highbd_prepare_warp_horizontal_filter(
+ ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ const __m128i *src = tmp + (k + 4);
+ const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ // Filter even-index pixels
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ if (conv_params->is_compound) {
+ __m128i *const p =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j];
+ res_lo = _mm_add_epi32(res_lo, res_add_const);
+ res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+
+ if (conv_params->do_average) {
+ __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));
+
+ if (conv_params->use_dist_wtd_comp_avg) {
+ res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+ _mm_mullo_epi32(res_lo, wt1));
+ res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
+ } else {
+ res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
+ }
+
+ __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
+ res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
+ round_bits_shift);
+
+ __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
+ res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
+ _mm_storel_epi64(dst16, res16_lo);
+ } else {
+ res_lo = _mm_packus_epi32(res_lo, res_lo);
+ _mm_storel_epi64(p, res_lo);
+ }
+ if (p_width > 4) {
+ __m128i *const p4 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+ res_hi = _mm_add_epi32(res_hi, res_add_const);
+ res_hi =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+ if (conv_params->do_average) {
+ __m128i *const dst16_4 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));
+
+ if (conv_params->use_dist_wtd_comp_avg) {
+ res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
+ _mm_mullo_epi32(res_hi, wt1));
+ res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
+ } else {
+ res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
+ }
+
+ __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
+ res32_hi = _mm_sra_epi32(
+ _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
+ __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
+ res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
+ _mm_storel_epi64(dst16_4, res16_hi);
+ } else {
+ res_hi = _mm_packus_epi32(res_hi, res_hi);
+ _mm_storel_epi64(p4, res_hi);
+ }
+ }
+ } else {
+ // Round and pack into 8 bits
+ const __m128i round_const =
+ _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
+
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ // Clamp res_16bit to the range [0, 2^bd - 1]
+ const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i zero = _mm_setzero_si128();
+ res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ _mm_storel_epi64(p, res_16bit);
+ } else {
+ _mm_storeu_si128(p, res_16bit);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c
new file mode 100644
index 0000000000..562c623fa9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
+// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
+// on the left.
+// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be
+// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ].
+void av1_highbd_wiener_convolve_add_src_avx2(
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const WienerConvolveParams *conv_params, int bd) {
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w & 7));
+ assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+ DECLARE_ALIGNED(32, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 1;
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+ const __m128i zero_128 = _mm_setzero_si128();
+ const __m256i zero_256 = _mm256_setzero_si256();
+
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+
+ const __m256i clamp_low = zero_256;
+
+ /* Horizontal filter */
+ {
+ const __m256i clamp_high_ep =
+ _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const = _mm256_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (int i = 0; i < intermediate_height; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint16_t *src_ij = src_ptr + i * src_stride + j;
+
+ // Load 16-bit src data
+ const __m256i src_0 = yy_loadu_256(src_ij + 0);
+ const __m256i src_1 = yy_loadu_256(src_ij + 1);
+ const __m256i src_2 = yy_loadu_256(src_ij + 2);
+ const __m256i src_3 = yy_loadu_256(src_ij + 3);
+ const __m256i src_4 = yy_loadu_256(src_ij + 4);
+ const __m256i src_5 = yy_loadu_256(src_ij + 5);
+ const __m256i src_6 = yy_loadu_256(src_ij + 6);
+ const __m256i src_7 = yy_loadu_256(src_ij + 7);
+
+ // Multiply src data by filter coeffs and sum pairs
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ // Calculate scalar product for even- and odd-indices separately,
+ // increasing to 32-bit precision
+ const __m256i res_even_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+ const __m256i res_even = _mm256_srai_epi32(
+ _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+
+ const __m256i res_odd_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+ const __m256i res_odd = _mm256_srai_epi32(
+ _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+ // Reduce to 16-bit precision and pack even- and odd-index results
+ // back into one register. The _mm256_packs_epi32 intrinsic returns
+ // a register with the pixels ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+ const __m256i res_clamped =
+ _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);
+
+ // Store in a temporary array
+ yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);
+
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
+
+ // Load 16-bit data from the output of the horizontal filter in
+ // which the pixels are ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);
+ const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
+ const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
+ const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
+ const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
+ const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
+ const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
+ const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
+
+ // Filter the even-indices, increasing to 32-bit precision
+ const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
+ const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+ const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+ const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+ const __m256i res_even = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+ // Filter the odd-indices, increasing to 32-bit precision
+ const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+ const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+ const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+ const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ const __m256i res_odd = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+ // Pixels are currently in the following order:
+ // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+ // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
+ //
+ // Rearrange the pixels into the following order:
+ // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
+ // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+ const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+ const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+ const __m256i res_lo_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m256i res_hi_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ // Reduce to 16-bit precision and pack into the correct order:
+ // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+ const __m256i res_16bit =
+ _mm256_packs_epi32(res_lo_round, res_hi_round);
+ const __m256i res_16bit_clamped = _mm256_min_epi16(
+ _mm256_max_epi16(res_16bit, clamp_low), clamp_high);
+
+ // Store in the dst array
+ yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c
new file mode 100644
index 0000000000..cab37fa910
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+
+void av1_highbd_wiener_convolve_add_src_ssse3(
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const WienerConvolveParams *conv_params, int bd) {
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w & 7));
+ assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 1;
+ int i, j;
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+ const __m128i zero = _mm_setzero_si128();
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
+
+ /* Horizontal filter */
+ {
+ const __m128i coeffs_x =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (i = 0; i < intermediate_height; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ conv_params->round_0);
+
+ // Filter odd-index pixels
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ conv_params->round_0);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ const __m128i maxval =
+ _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1);
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
+ _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const __m128i coeffs_y =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);
+
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storeu_si128(p, res_16bit);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/intra_edge_sse4.c b/third_party/aom/av1/common/x86/intra_edge_sse4.c
new file mode 100644
index 0000000000..3eee46faeb
--- /dev/null
+++ b/third_party/aom/av1/common/x86/intra_edge_sse4.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
+ if (!strength) return;
+
+ DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
+ { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4
+ { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5
+ { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2
+ };
+
+ DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
+ { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
+ { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ };
+
+ // Extend the first and last samples to simplify the loop for the 5-tap case
+ p[-1] = p[0];
+ __m128i last = _mm_set1_epi8((char)p[sz - 1]);
+ _mm_storeu_si128((__m128i *)&p[sz], last);
+
+ // Adjust input pointer for filter support area
+ uint8_t *in = (strength == 3) ? p - 1 : p;
+
+ // Avoid modifying first sample
+ uint8_t *out = p + 1;
+ int len = sz - 1;
+
+ const int use_3tap_filter = (strength < 3);
+
+ if (use_3tap_filter) {
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
+ __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)in);
+ while (len > 0) {
+ int n_out = (len < 8) ? len : 8;
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
+ d0 = _mm_maddubs_epi16(d0, coef0);
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srai_epi16(d0, 4);
+ d0 = _mm_packus_epi16(d0, d0);
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi8(n_out);
+ __m128i mask = _mm_cmpgt_epi8(n0, iden);
+ out0 = _mm_blendv_epi8(out0, d0, mask);
+ _mm_storel_epi64((__m128i *)out, out0);
+ __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
+ in0 = _mm_alignr_epi8(in1, in0, 8);
+ in += 8;
+ out += 8;
+ len -= n_out;
+ }
+ } else { // 5-tap filter
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i two = _mm_set1_epi8(2);
+ __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
+ __m128i shuf_b = _mm_add_epi8(shuf_a, two);
+ __m128i shuf_c = _mm_add_epi8(shuf_b, two);
+ __m128i shuf_d = _mm_add_epi8(shuf_c, two);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)in);
+ while (len > 0) {
+ int n_out = (len < 8) ? len : 8;
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
+ __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
+ __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
+ d0 = _mm_maddubs_epi16(d0, coef0);
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d2 = _mm_maddubs_epi16(d2, coef0);
+ d3 = _mm_maddubs_epi16(d3, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);
+ d2 = _mm_hadd_epi16(d2, d3);
+ d0 = _mm_hadd_epi16(d0, d2);
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srai_epi16(d0, 4);
+ d0 = _mm_packus_epi16(d0, d0);
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi8(n_out);
+ __m128i mask = _mm_cmpgt_epi8(n0, iden);
+ out0 = _mm_blendv_epi8(out0, d0, mask);
+ _mm_storel_epi64((__m128i *)out, out0);
+ __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
+ in0 = _mm_alignr_epi8(in1, in0, 8);
+ in += 8;
+ out += 8;
+ len -= n_out;
+ }
+ }
+}
+
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
+ // interpolate half-sample positions
+ assert(sz <= 24);
+
+ DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
+ { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
+ };
+
+ DECLARE_ALIGNED(
+ 16, static const int8_t,
+ v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
+ { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
+
+ // Extend first/last samples (upper-left p[-1], last p[sz-1])
+ // to support 4-tap filter
+ p[-2] = p[-1];
+ p[sz] = p[sz - 1];
+
+ uint8_t *in = &p[-2];
+ uint8_t *out = &p[-2];
+
+ int n = sz + 1; // Input length including upper-left sample
+
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
+
+ __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
+ __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
+
+ while (n > 0) {
+ __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
+ __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
+ __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
+ d0 = _mm_maddubs_epi16(d0, coef0);
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d2 = _mm_maddubs_epi16(d2, coef0);
+ d3 = _mm_maddubs_epi16(d3, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);
+ d2 = _mm_hadd_epi16(d2, d3);
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d2 = _mm_add_epi16(d2, eight);
+ d0 = _mm_srai_epi16(d0, 4);
+ d2 = _mm_srai_epi16(d2, 4);
+ d0 = _mm_packus_epi16(d0, d2);
+ __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
+ __m128i out0 = _mm_unpacklo_epi8(in1, d0);
+ __m128i out1 = _mm_unpackhi_epi8(in1, d0);
+ _mm_storeu_si128((__m128i *)&out[0], out0);
+ _mm_storeu_si128((__m128i *)&out[16], out1);
+ in0 = in16;
+ in16 = _mm_setzero_si128();
+ out += 32;
+ n -= 16;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
+ if (!strength) return;
+
+ DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
+ { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4
+ { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5
+ { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2
+ };
+
+ DECLARE_ALIGNED(16, static const int16_t,
+ v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
+
+ // Extend the first and last samples to simplify the loop for the 5-tap case
+ p[-1] = p[0];
+ __m128i last = _mm_set1_epi16(p[sz - 1]);
+ _mm_storeu_si128((__m128i *)&p[sz], last);
+
+ // Adjust input pointer for filter support area
+ uint16_t *in = (strength == 3) ? p - 1 : p;
+
+ // Avoid modifying first sample
+ uint16_t *out = p + 1;
+ int len = sz - 1;
+
+ const int use_3tap_filter = (strength < 3);
+
+ if (use_3tap_filter) {
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ while (len > 0) {
+ int n_out = (len < 8) ? len : 8;
+ __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
+ __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
+ __m128i in02 = _mm_add_epi16(in0, in2);
+ __m128i d0 = _mm_unpacklo_epi16(in02, in1);
+ __m128i d1 = _mm_unpackhi_epi16(in02, in1);
+ d0 = _mm_mullo_epi16(d0, coef0);
+ d1 = _mm_mullo_epi16(d1, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srli_epi16(d0, 4);
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi16(n_out);
+ __m128i mask = _mm_cmpgt_epi16(n0, iden);
+ out0 = _mm_blendv_epi8(out0, d0, mask);
+ _mm_storeu_si128((__m128i *)out, out0);
+ in += 8;
+ in0 = in8;
+ in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ out += 8;
+ len -= n_out;
+ }
+ } else { // 5-tap filter
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ while (len > 0) {
+ int n_out = (len < 8) ? len : 8;
+ __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
+ __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
+ __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
+ __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
+ __m128i in04 = _mm_add_epi16(in0, in4);
+ __m128i in123 = _mm_add_epi16(in1, in2);
+ in123 = _mm_add_epi16(in123, in3);
+ __m128i d0 = _mm_unpacklo_epi16(in04, in123);
+ __m128i d1 = _mm_unpackhi_epi16(in04, in123);
+ d0 = _mm_mullo_epi16(d0, coef0);
+ d1 = _mm_mullo_epi16(d1, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srli_epi16(d0, 4);
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi16(n_out);
+ __m128i mask = _mm_cmpgt_epi16(n0, iden);
+ out0 = _mm_blendv_epi8(out0, d0, mask);
+ _mm_storeu_si128((__m128i *)out, out0);
+ in += 8;
+ in0 = in8;
+ in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ out += 8;
+ len -= n_out;
+ }
+ }
+}
+
+void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
+ // interpolate half-sample positions
+ assert(sz <= 24);
+
+ DECLARE_ALIGNED(16, static const int16_t,
+ kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };
+
+ // Extend first/last samples (upper-left p[-1], last p[sz-1])
+ // to support 4-tap filter
+ p[-2] = p[-1];
+ p[sz] = p[sz - 1];
+
+ uint16_t *in = &p[-2];
+ uint16_t *out = in;
+ int n = sz + 1;
+
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
+ __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);
+
+ while (n > 0) {
+ __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
+ __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
+ __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
+ __m128i sum0 = _mm_add_epi16(in0, in3);
+ __m128i sum1 = _mm_add_epi16(in1, in2);
+ __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
+ __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
+ __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
+ d0 = _mm_madd_epi16(d0, coef0);
+ d1 = _mm_madd_epi16(d1, coef0);
+ __m128i eight = _mm_set1_epi32(8);
+ d0 = _mm_add_epi32(d0, eight);
+ d1 = _mm_add_epi32(d1, eight);
+ d0 = _mm_srai_epi32(d0, 4);
+ d1 = _mm_srai_epi32(d1, 4);
+ d0 = _mm_packus_epi32(d0, d1);
+ __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
+ d0 = _mm_min_epi16(d0, max0);
+ __m128i out0 = _mm_unpacklo_epi16(in1, d0);
+ __m128i out1 = _mm_unpackhi_epi16(in1, d0);
+ _mm_storeu_si128((__m128i *)&out[0], out0);
+ _mm_storeu_si128((__m128i *)&out[8], out1);
+ in0 = in8;
+ in8 = in16;
+ in16 = in24;
+ in24 = _mm_setzero_si128();
+ out += 16;
+ n -= 8;
+ }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
new file mode 100644
index 0000000000..9f82ed2300
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -0,0 +1,1124 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+#include "av1/common/convolve.h"
+
+static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi16((int16_t)w0);
+ const __m256i wt1 = _mm256_set1_epi16((int16_t)w1);
+ const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ return wt;
+}
+
+static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
+ return _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)),
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
+}
+
+void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+ int i, j, is_horiz_4tap = 0;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const __m256i wt = unpack_weights_avx2(conv_params);
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+
+ assert(bits >= 0);
+ assert(conv_params->round_0 > 0);
+
+ const __m256i round_const =
+ _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ __m256i filt[4], coeffs[4];
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
+
+ // Condition for checking valid horz_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_horiz_4tap = 1;
+
+ // horz_filt as 4 tap
+ if (is_horiz_4tap) {
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ for (i = 0; i < h; i += 2) {
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+ for (j = 0; j < w; j += 8) {
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+ res = _mm256_slli_epi16(res, bits);
+
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ } else {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+ for (i = 0; i < h; i += 2) {
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+ for (j = 0; j < w; j += 8) {
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+
+ __m256i res = convolve_lowbd_x(data, coeffs, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+
+ res = _mm256_slli_epi16(res, bits);
+
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
+
+void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+ int i, j, is_vert_4tap = 0;
+ // +1 to compensate for dividing the filter coeffs by 2
+ const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m256i wt = unpack_weights_avx2(conv_params);
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int offset_1 = (1 << (bd + FILTER_BITS - 2));
+ const __m256i offset_const_1 = _mm256_set1_epi16(offset_1);
+ const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0));
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i coeffs[4], s[8];
+
+ assert((FILTER_BITS - conv_params->round_0) >= 0);
+
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
+
+ // Condition for checking valid vert_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_vert_4tap = 1;
+
+ if (is_vert_4tap) {
+ const int fo_vert = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src4;
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ {
+ __m256i src_ab[4];
+ __m256i src_a[5];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 4; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] =
+ _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ }
+ src4 = src_a[4];
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+
+ s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+ }
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[(i + 5) * src_stride + j];
+ const __m256i src5 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20);
+
+ src4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20);
+
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
+
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);
+
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
+
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
+
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
+
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
+
+ if (w - j < 16) {
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
+ &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+ res_1);
+ } else {
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
+
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
+
+ if (do_average) {
+ const __m256i data_ref_0_lo =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
+
+ const __m256i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi =
+ convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 =
+ _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
+ res_hi_0);
+
+ const __m128i res_hi_1 =
+ _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+ res_hi_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ {
+ __m256i src_ab[7];
+ __m256i src_a[7];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 6; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] =
+ _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ }
+ src6 = src_a[6];
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+ s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+ s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+ s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
+ }
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[(i + 7) * src_stride + j];
+ const __m256i src7 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ __m256i res_lo = convolve_lowbd(s, coeffs);
+
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);
+
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
+
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
+
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
+
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
+
+ if (w - j < 16) {
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
+ &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+ res_1);
+ } else {
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
+
+ if (do_average) {
+ const __m256i data_ref_0_lo =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
+
+ const __m256i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi =
+ convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 =
+ _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
+ res_hi_0);
+
+ const __m128i res_hi_1 =
+ _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+ res_hi_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+
+ int im_stride = 8;
+ int i, is_horiz_4tap = 0, is_vert_4tap = 0;
+ const __m256i wt = unpack_weights_avx2(conv_params);
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+
+ assert(conv_params->round_0 > 0);
+
+ const __m256i round_const_h = _mm256_set1_epi16(
+ ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ const __m256i round_const_v = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ __m256i filt[4], coeffs_x[4], coeffs_y[4];
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ // Condition for checking valid horz_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0)))
+ is_horiz_4tap = 1;
+
+ // Condition for checking valid vert_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
+ is_vert_4tap = 1;
+
+ if (is_horiz_4tap) {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ const uint8_t *src_h = src_ptr + j;
+ for (i = 0; i < im_h; i += 2) {
+ __m256i data =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
+ if (i + 1 < im_h)
+ data = _mm256_inserti128_si256(
+ data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
+ src_h += (src_stride << 1);
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
+ round_shift_h);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+ }
+ } else if (is_vert_4tap) {
+ int im_h = h + 3;
+ const int fo_vert = 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ const uint8_t *src_h = src_ptr + j;
+ DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+
+ /* Vertical filter */
+ __m256i s[6];
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+ s[3] = _mm256_unpackhi_epi16(s0, s1);
+ s[4] = _mm256_unpackhi_epi16(s2, s3);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+ s[5] = _mm256_unpackhi_epi16(s4, s5);
+
+ const __m256i res_a = convolve_4tap(s, coeffs_y + 1);
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+
+ if (w - j > 4) {
+ const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+ &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+ &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ const uint8_t *src_h = src_ptr + j;
+ DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+
+ DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+ }
+ }
+}
+
+#define DO_NO_AVG_2D_COPY_4X16(r0, c0, r1, c1, r2, c2, r3, c3) \
+ do { \
+ src_0 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \
+ src_1 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \
+ src_2 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \
+ src_3 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \
+ \
+ src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \
+ src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \
+ src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \
+ src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \
+ \
+ src_0 = _mm256_add_epi16(src_0, offset_const); \
+ src_1 = _mm256_add_epi16(src_1, offset_const); \
+ src_2 = _mm256_add_epi16(src_2, offset_const); \
+ src_3 = _mm256_add_epi16(src_3, offset_const); \
+ \
+ _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
+ _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
+ _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
+ _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
+ } while (0)
+
+#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
+static AOM_INLINE void av1_dist_wtd_convolve_2d_no_avg_copy_avx2(
+ const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride,
+ int w, int h, const __m256i offset_const) {
+ int i = h;
+ if (w >= 16) {
+ __m256i src_0, src_1, src_2, src_3;
+ if (w == 128) {
+ do {
+ DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
+ DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112);
+ src += 1 * src_stride;
+ dst += 1 * dst_stride;
+ i -= 1;
+ } while (i);
+ } else if (w == 64) {
+ do {
+ DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
+ src += 1 * src_stride;
+ dst += 1 * dst_stride;
+ i -= 1;
+ } while (i);
+ } else if (w == 32) {
+ do {
+ DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16);
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ i -= 2;
+ } while (i);
+ } else if (w == 16) {
+ do {
+ DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ i -= 4;
+ } while (i);
+ }
+ } else {
+ const __m256i zero = _mm256_setzero_si256();
+ do {
+ const __m128i src_row_0 =
+ _mm_loadl_epi64((__m128i *)(&src[0 * src_stride]));
+ const __m128i src_row_1 =
+ _mm_loadl_epi64((__m128i *)(&src[1 * src_stride]));
+ const __m128i src_row_2 =
+ _mm_loadl_epi64((__m128i *)(&src[2 * src_stride]));
+ const __m128i src_row_3 =
+ _mm_loadl_epi64((__m128i *)(&src[3 * src_stride]));
+
+ __m256i src_10 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_0), src_row_1, 1);
+ __m256i src_32 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_2), src_row_3, 1);
+
+ src_10 = _mm256_unpacklo_epi8(src_10, zero);
+ src_32 = _mm256_unpacklo_epi8(src_32, zero);
+
+ src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT);
+ src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT);
+
+ src_10 = _mm256_add_epi16(src_10, offset_const);
+ src_32 = _mm256_add_epi16(src_32, offset_const);
+
+ // Accumulate values into the destination buffer
+ _mm_store_si128((__m128i *)(&dst[0 * dst_stride]),
+ _mm256_castsi256_si128(src_10));
+ _mm_store_si128((__m128i *)(&dst[1 * dst_stride]),
+ _mm256_extracti128_si256(src_10, 1));
+ _mm_store_si128((__m128i *)(&dst[2 * dst_stride]),
+ _mm256_castsi256_si128(src_32));
+ _mm_store_si128((__m128i *)(&dst[3 * dst_stride]),
+ _mm256_extracti128_si256(src_32, 1));
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ i -= 4;
+ } while (i);
+ }
+}
+
+#define DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, r1, c1, r2, c2, r3, c3) \
+ do { \
+ src_0 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \
+ src_1 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \
+ src_2 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \
+ src_3 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \
+ \
+ src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \
+ src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \
+ src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \
+ src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \
+ src_0 = _mm256_add_epi16(src_0, offset_const); \
+ src_1 = _mm256_add_epi16(src_1, offset_const); \
+ src_2 = _mm256_add_epi16(src_2, offset_const); \
+ src_3 = _mm256_add_epi16(src_3, offset_const); \
+ \
+ ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0])); \
+ ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1])); \
+ ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2])); \
+ ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3])); \
+ \
+ res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED); \
+ res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED); \
+ res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED); \
+ res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED); \
+ \
+ res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const, \
+ rounding_shift); \
+ res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const, \
+ rounding_shift); \
+ res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const, \
+ rounding_shift); \
+ res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const, \
+ rounding_shift); \
+ \
+ res_10 = _mm256_packus_epi16(res_0, res_1); \
+ res_32 = _mm256_packus_epi16(res_2, res_3); \
+ res_10 = _mm256_permute4x64_epi64(res_10, 0xD8); \
+ res_32 = _mm256_permute4x64_epi64(res_32, 0xD8); \
+ \
+ _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]), \
+ _mm256_castsi256_si128(res_10)); \
+ _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]), \
+ _mm256_extracti128_si256(res_10, 1)); \
+ _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]), \
+ _mm256_castsi256_si128(res_32)); \
+ _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]), \
+ _mm256_extracti128_si256(res_32, 1)); \
+ } while (0)
+
+#define DO_AVG_2D_COPY(USE_DIST_WEIGHTED) \
+ int i = h; \
+ if (w >= 16) { \
+ __m256i src_0, src_1, src_2, src_3; \
+ __m256i ref_0, ref_1, ref_2, ref_3; \
+ __m256i res_0, res_1, res_2, res_3; \
+ __m256i res_10, res_32; \
+ if (w == 128) { \
+ do { \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112); \
+ i -= 1; \
+ src += 1 * src_stride; \
+ dst += 1 * dst_stride; \
+ dst0 += 1 * dst_stride0; \
+ } while (i); \
+ } else if (w == 64) { \
+ do { \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \
+ \
+ i -= 1; \
+ src += 1 * src_stride; \
+ dst += 1 * dst_stride; \
+ dst0 += 1 * dst_stride0; \
+ } while (i); \
+ } else if (w == 32) { \
+ do { \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16); \
+ \
+ i -= 2; \
+ src += 2 * src_stride; \
+ dst += 2 * dst_stride; \
+ dst0 += 2 * dst_stride0; \
+ } while (i); \
+ } else { \
+ assert(w == 16); \
+ do { \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0); \
+ \
+ i -= 4; \
+ src += 4 * src_stride; \
+ dst += 4 * dst_stride; \
+ dst0 += 4 * dst_stride0; \
+ } while (i); \
+ } \
+ } else if (w == 8) { \
+ do { \
+ const __m128i src_0 = \
+ _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); \
+ const __m128i src_1 = \
+ _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); \
+ const __m128i src_2 = \
+ _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); \
+ const __m128i src_3 = \
+ _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); \
+ __m256i src_10 = \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1); \
+ __m256i src_32 = \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1); \
+ \
+ src_10 = _mm256_unpacklo_epi8(src_10, zero); \
+ src_32 = _mm256_unpacklo_epi8(src_32, zero); \
+ \
+ src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); \
+ src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); \
+ \
+ src_10 = _mm256_add_epi16(src_10, offset_const); \
+ src_32 = _mm256_add_epi16(src_32, offset_const); \
+ \
+ const __m256i ref_10 = \
+ load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]); \
+ const __m256i ref_32 = \
+ load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]); \
+ __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED); \
+ __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED); \
+ \
+ res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const, \
+ rounding_shift); \
+ res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const, \
+ rounding_shift); \
+ \
+ __m256i res = _mm256_packus_epi16(res_10, res_32); \
+ const __m128i res_20 = _mm256_castsi256_si128(res); \
+ const __m128i res_31 = _mm256_extracti128_si256(res, 1); \
+ \
+ _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20); \
+ _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31); \
+ _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20); \
+ _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31); \
+ i -= 4; \
+ src += 4 * src_stride; \
+ dst += 4 * dst_stride; \
+ dst0 += 4 * dst_stride0; \
+ } while (i); \
+ } else { \
+ assert(w == 4); \
+ do { \
+ __m256i src_3210_8bit = \
+ _mm256_setr_epi32(loadu_int32(src + 0 * src_stride), \
+ loadu_int32(src + 1 * src_stride), 0, 0, \
+ loadu_int32(src + 2 * src_stride), \
+ loadu_int32(src + 3 * src_stride), 0, 0); \
+ \
+ __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero); \
+ src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT); \
+ src_3210 = _mm256_add_epi16(src_3210, offset_const); \
+ \
+ __m256i ref_3210 = \
+ _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride), \
+ *(int64_t *)(dst + 1 * dst_stride), \
+ *(int64_t *)(dst + 2 * dst_stride), \
+ *(int64_t *)(dst + 3 * dst_stride)); \
+ __m256i res_3210 = \
+ comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED); \
+ \
+ res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \
+ rounding_shift); \
+ \
+ res_3210 = _mm256_packus_epi16(res_3210, res_3210); \
+ const __m128i res_10 = _mm256_castsi256_si128(res_3210); \
+ const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1); \
+ \
+ *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10); \
+ *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32); \
+ *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1); \
+ *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1); \
+ i -= 4; \
+ src += 4 * src_stride; \
+ dst += 4 * dst_stride; \
+ dst0 += 4 * dst_stride0; \
+ } while (i); \
+ }
+
+void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w,
+ int h, ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ assert(conv_params->round_0 == 3);
+ assert(conv_params->round_1 == 7);
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const __m256i wt = unpack_weights_avx2(conv_params);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+
+ if (do_average) {
+ if (use_dist_wtd_comp_avg) {
+ DO_AVG_2D_COPY(1)
+ } else {
+ DO_AVG_2D_COPY(0)
+ }
+ } else {
+ av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride,
+ w, h, offset_const);
+ }
+}
+#undef LEFT_SHIFT
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
new file mode 100644
index 0000000000..8c5d9918fb
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
@@ -0,0 +1,606 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+ __m128i coeffs[4];
+
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
+
+ if (w == 4) {
+ do {
+ const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+ __m128i s[4];
+
+ s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+ s[1] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+ s[2] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+ s[3] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+ const __m128i res_lo = convolve_lo_x(s, coeffs);
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[0]), res_unsigned);
+ }
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ } while (--h);
+ } else {
+ assert(!(w % 8));
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ __m128i s[4];
+
+ // Filter even-index pixels
+ s[0] = data;
+ s[1] = _mm_srli_si128(data, 2);
+ s[2] = _mm_srli_si128(data, 4);
+ s[3] = _mm_srli_si128(data, 6);
+ const __m128i res_even = convolve_lo_x(s, coeffs);
+
+ // Filter odd-index pixels
+ s[0] = _mm_srli_si128(data, 1);
+ s[1] = _mm_srli_si128(data, 3);
+ s[2] = _mm_srli_si128(data, 5);
+ s[3] = _mm_srli_si128(data, 7);
+ const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+ const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
+ const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ j += 8;
+ } while (j < w);
+ } while (++i < h);
+ }
+}
+
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset);
+ const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+ const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+ __m128i coeffs[4];
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
+
+ if (w == 4) {
+ __m128i s[8], src6, res, res_shift;
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
+
+ do {
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
+
+ res = convolve_lo_y(s + 0, coeffs);
+ res_shift = _mm_sll_epi32(res, left_shift);
+ res_shift =
+ _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
+
+ __m128i res_16b = _mm_packs_epi32(res_shift, res_shift);
+ __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+
+ } else {
+ _mm_store_si128((__m128i *)dst, res_unsigned);
+ }
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+
+ res = convolve_lo_y(s + 1, coeffs);
+ res_shift = _mm_sll_epi32(res, left_shift);
+ res_shift =
+ _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
+
+ res_16b = _mm_packs_epi32(res_shift, res_shift);
+ res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+
+ } else {
+ _mm_store_si128((__m128i *)dst, res_unsigned);
+ }
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ h -= 2;
+ } while (h);
+ } else {
+ assert(!(w % 8));
+ int j = 0;
+ do {
+ __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift;
+ const uint8_t *data = &src_ptr[j];
+
+ src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+ int i = 0;
+ do {
+ data = &src_ptr[i * src_stride + j];
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+ src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+ res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
+ res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
+ res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
+ res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
+ round_shift);
+ res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
+ round_shift);
+
+ __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ i++;
+
+ res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
+ res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
+ res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
+ res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
+ round_shift);
+ res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
+ round_shift);
+ res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ i++;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ } while (i < h);
+ j += 8;
+ } while (j < w);
+ }
+}
+
+void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i temp_lo, temp_hi;
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
+ const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
+ temp_lo = _mm_srli_si128(src_lo, 4);
+ temp_hi = _mm_slli_si128(src_hi, 12);
+ const __m128i src_2 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ temp_lo = _mm_srli_si128(src_lo, 8);
+ temp_hi = _mm_slli_si128(src_hi, 8);
+ const __m128i src_4 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ temp_lo = _mm_srli_si128(src_lo, 12);
+ temp_hi = _mm_slli_si128(src_hi, 4);
+ const __m128i src_6 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ temp_lo = _mm_srli_si128(src_lo, 2);
+ temp_hi = _mm_slli_si128(src_hi, 14);
+ const __m128i src_1 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ temp_lo = _mm_srli_si128(src_lo, 6);
+ temp_hi = _mm_slli_si128(src_hi, 10);
+ const __m128i src_3 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ temp_lo = _mm_srli_si128(src_lo, 10);
+ temp_hi = _mm_slli_si128(src_hi, 6);
+ const __m128i src_5 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ temp_lo = _mm_srli_si128(src_lo, 14);
+ temp_hi = _mm_slli_si128(src_hi, 2);
+ const __m128i src_7 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+
+ if (w > 4)
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ else
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
new file mode 100644
index 0000000000..f6bf67815d
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_dist_wtd_convolve_2d_ssse3(
+ const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
+ const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
+ const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+
+ if (w > 4)
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ else
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c
new file mode 100644
index 0000000000..71fab7a577
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_avx2.c
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "av1/common/blockd.h"
+
+static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0,
+ const __m256i s1) {
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1));
+ return _mm256_abs_epi16(
+ _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4)));
+ // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54)
+}
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ int h, int w) {
+ const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+ const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
+ int i = 0;
+ if (4 == w) {
+ do {
+ const __m128i s0A = xx_loadl_32(src0);
+ const __m128i s0B = xx_loadl_32(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3);
+ const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+ const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
+ const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
+ const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);
+
+ const __m128i s1A = xx_loadl_32(src1);
+ const __m128i s1B = xx_loadl_32(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3);
+ const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+ const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
+ const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
+ const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD);
+ const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ const __m128i x_m8 =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
+ xx_storeu_128(mask, x_m8);
+ src0 += (src0_stride << 2);
+ src1 += (src1_stride << 2);
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (8 == w) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
+ const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
+ const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
+ const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w);
+ const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w);
+ const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);
+ yy_storeu_256(mask, m8);
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (16 == w) {
+ do {
+ const __m128i s0A = xx_load_128(src0);
+ const __m128i s0B = xx_load_128(src0 + src0_stride);
+ const __m128i s1A = xx_load_128(src1);
+ const __m128i s1B = xx_load_128(src1 + src1_stride);
+ const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
+ const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
+ const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
+ const __m256i s1BL = _mm256_cvtepu8_epi16(s1B);
+
+ const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL);
+ const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL);
+
+ const __m256i m8 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
+ yy_storeu_256(mask, m8);
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else {
+ do {
+ int j = 0;
+ do {
+ const __m256i s0 = yy_loadu_256(src0 + j);
+ const __m256i s1 = yy_loadu_256(src1 + j);
+ const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0));
+ const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1));
+ const __m256i s0H =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1));
+ const __m256i s1H =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1));
+ const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L);
+ const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H);
+ const __m256i m8 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8);
+ yy_storeu_256(mask + j, m8);
+ j += 32;
+ } while (j < w);
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0,
+ const __m256i *data_src1,
+ const __m256i *round_const,
+ const __m256i *mask_base_16,
+ const __m256i *clip_diff, int round) {
+ const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
+ const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
+ const __m256i diff = _mm256_max_epu16(diffa, diffb);
+ const __m256i diff_round =
+ _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
+ const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
+ const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
+ return diff_clamp;
+}
+
+static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0,
+ const __m256i *data_src1,
+ const __m256i *round_const,
+ const __m256i *mask_base_16,
+ const __m256i *clip_diff,
+ int round) {
+ const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
+ const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
+ const __m256i diff = _mm256_max_epu16(diffa, diffb);
+ const __m256i diff_round =
+ _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
+ const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
+ const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
+ const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp);
+ return diff_const_16;
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_avx2(
+ uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+ const int mask_base = 38;
+ const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
+ const __m256i y38 = _mm256_set1_epi16(mask_base);
+ const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ int i = 0;
+ if (w == 4) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+ _mm_unpacklo_epi64(s0A, s0B));
+ const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+ _mm_unpacklo_epi64(s1A, s1B));
+ const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ xx_storeu_128(mask,
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (w == 8) {
+ do {
+ const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+ const __m256i s0CD =
+ yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+ const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+ const __m256i s1CD =
+ yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+ const __m256i m16AB =
+ calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+ const __m256i m16CD =
+ calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (w == 16) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else if (w == 32) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 32;
+ i += 1;
+ } while (i < h);
+ } else if (w == 64) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 64;
+ i += 1;
+ } while (i < h);
+ } else {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s0E = yy_loadu_256(src0 + 64);
+ const __m256i s0F = yy_loadu_256(src0 + 80);
+ const __m256i s0G = yy_loadu_256(src0 + 96);
+ const __m256i s0H = yy_loadu_256(src0 + 112);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i s1E = yy_loadu_256(src1 + 64);
+ const __m256i s1F = yy_loadu_256(src1 + 80);
+ const __m256i s1G = yy_loadu_256(src1 + 96);
+ const __m256i s1H = yy_loadu_256(src1 + 112);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m16E =
+ calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+ const __m256i m16F =
+ calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+ const __m256i m16G =
+ calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+ const __m256i m16H =
+ calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+ const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+ yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 128;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_inv_avx2(
+ uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+ const int mask_base = 38;
+ const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
+ const __m256i y38 = _mm256_set1_epi16(mask_base);
+ const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ int i = 0;
+ if (w == 4) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+ _mm_unpacklo_epi64(s0A, s0B));
+ const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+ _mm_unpacklo_epi64(s1A, s1B));
+ const __m256i m16 =
+ calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ xx_storeu_128(mask,
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (w == 8) {
+ do {
+ const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+ const __m256i s0CD =
+ yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+ const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+ const __m256i s1CD =
+ yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+ const __m256i m16AB =
+ calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+ const __m256i m16CD =
+ calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (w == 16) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else if (w == 32) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 32;
+ i += 1;
+ } while (i < h);
+ } else if (w == 64) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 64;
+ i += 1;
+ } while (i < h);
+ } else {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s0E = yy_loadu_256(src0 + 64);
+ const __m256i s0F = yy_loadu_256(src0 + 80);
+ const __m256i s0G = yy_loadu_256(src0 + 96);
+ const __m256i s0H = yy_loadu_256(src0 + 112);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i s1E = yy_loadu_256(src1 + 64);
+ const __m256i s1F = yy_loadu_256(src1 + 80);
+ const __m256i s1G = yy_loadu_256(src1 + 96);
+ const __m256i s1H = yy_loadu_256(src1 + 112);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m16E =
+ calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+ const __m256i m16F =
+ calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+ const __m256i m16G =
+ calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+ const __m256i m16H =
+ calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+ const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+ yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 128;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+void av1_build_compound_diffwtd_mask_d16_avx2(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ const int shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ // When rounding constant is added, there is a possibility of overflow.
+ // However that much precision is not required. Code should very well work for
+ // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But
+ // there is a possibility of corner case bugs.
+ assert(DIFF_FACTOR_LOG2 == 4);
+ assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+
+ if (mask_type == DIFFWTD_38) {
+ build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1,
+ src1_stride, h, w, shift);
+ } else {
+ build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1,
+ src1_stride, h, w, shift);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+void av1_build_compound_diffwtd_mask_highbd_avx2(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ if (w < 16) {
+ av1_build_compound_diffwtd_mask_highbd_ssse3(
+ mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd);
+ } else {
+ assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV);
+ assert(bd >= 8);
+ assert((w % 16) == 0);
+ const __m256i y0 = _mm256_setzero_si256();
+ const __m256i yAOM_BLEND_A64_MAX_ALPHA =
+ _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const int mask_base = 38;
+ const __m256i ymask_base = _mm256_set1_epi16(mask_base);
+ const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0);
+ const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1);
+ if (bd == 8) {
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_srai_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_srai_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {
+ const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_sra_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_sra_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+ }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/reconinter_sse4.c b/third_party/aom/av1/common/x86/reconinter_sse4.c
new file mode 100644
index 0000000000..eb4a4d1da3
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_sse4.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "av1/common/blockd.h"
+#include "config/av1_rtcd.h"
+
+static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0,
+ const __m128i s1) {
+ const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1));
+ return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4)));
+ // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54)
+}
+
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ int h, int w) {
+ const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+ const __m128i mask_base = _mm_set1_epi16(38 - mb);
+ int i = 0;
+ if (4 == w) {
+ do {
+ const __m128i s0A = _mm_cvtsi32_si128(*(int *)src0);
+ const __m128i s0B = _mm_cvtsi32_si128(*(int *)(src0 + stride0));
+ const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+ const __m128i s0 = _mm_cvtepu8_epi16(s0AB);
+
+ const __m128i s1A = _mm_cvtsi32_si128(*(int *)src1);
+ const __m128i s1B = _mm_cvtsi32_si128(*(int *)(src1 + stride1));
+ const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+ const __m128i s1 = _mm_cvtepu8_epi16(s1AB);
+
+ const __m128i m16 = calc_mask(mask_base, s0, s1);
+ const __m128i m8 = _mm_packus_epi16(m16, m16);
+
+ *(int *)mask = _mm_cvtsi128_si32(m8);
+ *(int *)(mask + w) = _mm_extract_epi32(m8, 1);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += 8;
+ i += 2;
+ } while (i < h);
+ } else if (8 == w) {
+ do {
+ __m128i s0 = _mm_loadl_epi64((__m128i const *)src0);
+ __m128i s1 = _mm_loadl_epi64((__m128i const *)src1);
+ s0 = _mm_cvtepu8_epi16(s0);
+ s1 = _mm_cvtepu8_epi16(s1);
+ const __m128i m16 = calc_mask(mask_base, s0, s1);
+ const __m128i m8 = _mm_packus_epi16(m16, m16);
+ _mm_storel_epi64((__m128i *)mask, m8);
+ src0 += stride0;
+ src1 += stride1;
+ mask += 8;
+ i += 1;
+ } while (i < h);
+ } else {
+ const __m128i zero = _mm_setzero_si128();
+ do {
+ int j = 0;
+ do {
+ const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j));
+ const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j));
+ const __m128i s0L = _mm_cvtepu8_epi16(s0);
+ const __m128i s1L = _mm_cvtepu8_epi16(s1);
+ const __m128i s0H = _mm_unpackhi_epi8(s0, zero);
+ const __m128i s1H = _mm_unpackhi_epi8(s1, zero);
+
+ const __m128i m16L = calc_mask(mask_base, s0L, s1L);
+ const __m128i m16H = calc_mask(mask_base, s0H, s1H);
+
+ const __m128i m8 = _mm_packus_epi16(m16L, m16H);
+ _mm_store_si128((__m128i *)(mask + j), m8);
+ j += 16;
+ } while (j < w);
+ src0 += stride0;
+ src1 += stride1;
+ mask += w;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+void av1_build_compound_diffwtd_mask_d16_sse4_1(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1;
+ const int mask_base = 38;
+ int round =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ const __m128i round_const = _mm_set1_epi16((1 << round) >> 1);
+ const __m128i mask_base_16 = _mm_set1_epi16(mask_base);
+ const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i add_const =
+ _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0));
+ const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1));
+
+ int i, j;
+ // When rounding constant is added, there is a possibility of overflow.
+ // However that much precision is not required. Code should very well work for
+ // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But
+ // there is a possibility of corner case bugs.
+ assert(DIFF_FACTOR_LOG2 == 4);
+ assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data_src0 =
+ _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]);
+ const __m128i data_src1 =
+ _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]);
+
+ const __m128i diffa = _mm_subs_epu16(data_src0, data_src1);
+ const __m128i diffb = _mm_subs_epu16(data_src1, data_src0);
+ const __m128i diff = _mm_max_epu16(diffa, diffb);
+ const __m128i diff_round =
+ _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round);
+ const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16);
+ __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff);
+ // clamp to 0 can be skipped since we are using add and saturate
+ // instruction
+
+ const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign);
+ const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const);
+
+ // 8 bit conversion and saturation to uint8
+ const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16);
+
+ // Store values into the destination buffer
+ __m128i *const dst = (__m128i *)&mask[i * w + j];
+
+ if ((w - j) > 4) {
+ _mm_storel_epi64(dst, res_8);
+ } else { // w==4
+ *(int *)dst = _mm_cvtsi128_si32(res_8);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_ssse3.c b/third_party/aom/av1/common/x86/reconinter_ssse3.c
new file mode 100644
index 0000000000..c9a3709a62
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_ssse3.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+#include <tmmintrin.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/blockd.h"
+
+void av1_build_compound_diffwtd_mask_highbd_ssse3(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ if (w < 8) {
+ av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride,
+ src1, src1_stride, h, w, bd);
+ } else {
+ assert(bd >= 8);
+ assert((w % 8) == 0);
+ assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV);
+ const __m128i x0 = _mm_setzero_si128();
+ const __m128i xAOM_BLEND_A64_MAX_ALPHA =
+ _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const int mask_base = 38;
+ const __m128i xmask_base = _mm_set1_epi16(mask_base);
+ const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0);
+ const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1);
+ if (bd == 8) {
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)),
+ DIFF_FACTOR_LOG2);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)),
+ DIFF_FACTOR_LOG2);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {
+ const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff =
+ _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff =
+ _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+ }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/resize_ssse3.c b/third_party/aom/av1/common/x86/resize_ssse3.c
new file mode 100644
index 0000000000..a7fdb5a9a4
--- /dev/null
+++ b/third_party/aom/av1/common/x86/resize_ssse3.c
@@ -0,0 +1,974 @@
+/*
+ *
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h> // SSSE3
+#include "config/av1_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_ssse3.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "av1/common/resize.h"
+
+static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
+ const uint8_t *const src, const __m128i *const mask) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0]));
+ const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16]));
+ const __m128i a_and = _mm_and_si128(a, *mask);
+ const __m128i b_and = _mm_and_si128(b, *mask);
+ return _mm_packus_epi16(a_and, b_and);
+}
+
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+ __m128i *const f) {
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+ // half of f[0] and f[4].
+ assert(filter[3] >= 0 && filter[3] < 256);
+ f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
+ f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
+ f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
+ f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
+ f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
+}
+
+static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ // compensate the subtracted 64 in f[1]. x4 is always non negative.
+ const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
+ // add and saturate the results together
+ __m128i temp = _mm_adds_epi16(x0, x3);
+ temp = _mm_adds_epi16(temp, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x4);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
+static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+ // compensate the subtracted 64 in f[2]. x5 is always non negative.
+ const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
+ __m128i temp;
+
+ // add and saturate the results together
+ temp = _mm_adds_epi16(x0, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x3);
+ temp = _mm_adds_epi16(temp, x4);
+ temp = _mm_adds_epi16(temp, x5);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
+static void scale_plane_2_to_1_phase_0(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h) {
+ const int max_width = (dst_w + 15) & ~15;
+ const __m128i mask = _mm_set1_epi16(0x00FF);
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
+ _mm_storeu_si128((__m128i *)dst, d);
+ src += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 2 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static void scale_plane_4_to_1_phase_0(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h) {
+ const int max_width = (dst_w + 15) & ~15;
+ const __m128i mask = _mm_set1_epi32(0x000000FF);
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask);
+ const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask);
+ const __m128i d2 = _mm_packus_epi16(d0, d1);
+ _mm_storeu_si128((__m128i *)dst, d2);
+ src += 64;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 4 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
+ const __m128i c0c1) {
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1);
+ const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1);
+ // round and shift by 7 bit each 16 bit
+ const __m128i t2 = _mm_adds_epi16(t0, k_64);
+ const __m128i t3 = _mm_adds_epi16(t1, k_64);
+ const __m128i t4 = _mm_srai_epi16(t2, 7);
+ const __m128i t5 = _mm_srai_epi16(t3, 7);
+ return _mm_packus_epi16(t4, t5);
+}
+
+static void scale_plane_2_to_1_bilinear(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h,
+ const __m128i c0c1) {
+ const int max_width = (dst_w + 15) & ~15;
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ __m128i s[2], d[2];
+
+ // Horizontal
+ // Even rows
+ s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
+ s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
+ d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+ // odd rows
+ s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+ s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+ d[1] = scale_plane_bilinear_kernel(s, c0c1);
+
+ // Vertical
+ s[0] = _mm_unpacklo_epi8(d[0], d[1]);
+ s[1] = _mm_unpackhi_epi8(d[0], d[1]);
+ d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+ _mm_storeu_si128((__m128i *)dst, d[0]);
+ src += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 2 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static void scale_plane_4_to_1_bilinear(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h,
+ const __m128i c0c1) {
+ const int max_width = (dst_w + 15) & ~15;
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ __m128i s[8], d[8];
+
+ // Note: Using _mm_packus_epi32() in SSE4.1 could be faster.
+ // Here we tried to not use shuffle instructions which would be slow
+ // on some x86 CPUs.
+
+ // Horizontal
+ // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx
+ // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx
+ // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx
+ // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx
+ // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx
+ // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx
+ // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx
+ // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx
+ s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));
+ s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
+ s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
+ s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
+ s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+ s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+ s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
+ s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));
+
+ // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx
+ // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx
+ // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx
+ // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx
+ // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx
+ // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx
+ // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx
+ // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx
+ d[0] = _mm_unpacklo_epi16(s[0], s[4]);
+ d[1] = _mm_unpackhi_epi16(s[0], s[4]);
+ d[2] = _mm_unpacklo_epi16(s[1], s[5]);
+ d[3] = _mm_unpackhi_epi16(s[1], s[5]);
+ d[4] = _mm_unpacklo_epi16(s[2], s[6]);
+ d[5] = _mm_unpackhi_epi16(s[2], s[6]);
+ d[6] = _mm_unpacklo_epi16(s[3], s[7]);
+ d[7] = _mm_unpackhi_epi16(s[3], s[7]);
+
+ // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx
+ // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx
+ // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx
+ // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx
+ // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx
+ // 024 025 124 125 02C 02D 12C 12D xx xx xx xx xx xx xx xx
+ // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx
+ // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx
+ s[0] = _mm_unpacklo_epi32(d[0], d[1]);
+ s[1] = _mm_unpackhi_epi32(d[0], d[1]);
+ s[2] = _mm_unpacklo_epi32(d[2], d[3]);
+ s[3] = _mm_unpackhi_epi32(d[2], d[3]);
+ s[4] = _mm_unpacklo_epi32(d[4], d[5]);
+ s[5] = _mm_unpackhi_epi32(d[4], d[5]);
+ s[6] = _mm_unpacklo_epi32(d[6], d[7]);
+ s[7] = _mm_unpackhi_epi32(d[6], d[7]);
+
+ // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D
+ // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D
+ // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D
+ // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D
+ d[0] = _mm_unpacklo_epi32(s[0], s[1]);
+ d[1] = _mm_unpacklo_epi32(s[2], s[3]);
+ d[2] = _mm_unpacklo_epi32(s[4], s[5]);
+ d[3] = _mm_unpacklo_epi32(s[6], s[7]);
+
+ d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);
+ d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);
+
+ // Vertical
+ d[0] = scale_plane_bilinear_kernel(d, c0c1);
+
+ _mm_storeu_si128((__m128i *)dst, d[0]);
+ src += 64;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 4 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ const int width_hor = (w + 1) & ~1;
+ const int width_ver = (w + 7) & ~7;
+ const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+ const int height_ver = (h + 1) & ~1;
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[11], d[4];
+ __m128i f[4];
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef, f);
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+ // horizontal 2x8
+ do {
+ load_8bit_8x8(src + 4, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped)
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped)
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[2]);
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+ transpose_16bit_4x8(&s[2], &s[2]);
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70
+ d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71
+
+ // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ d[0] = _mm_packus_epi16(d[0], d[0]);
+ d[1] = _mm_packus_epi16(d[1], d[1]);
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ d[0] = _mm_unpacklo_epi16(d[0], d[1]);
+ store_8bit_4x4_sse2(d[0], t, 2 * width_hor);
+
+ s[0] = s[4];
+ s[1] = s[5];
+
+ t += 4;
+ x -= 2;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor;
+ t += 6 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x2
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+ s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+ t += 4 * width_hor;
+ y = height_ver;
+
+ do {
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77
+ // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77
+ loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
+ t += 8 * width_hor;
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07
+ d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+ _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+
+ s[0] = s[4];
+ s[1] = s[5];
+
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ t -= width_hor * (4 * height_ver + 4);
+ t += 16;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ const int width_hor = (w + 3) & ~3;
+ const int width_ver = (w + 7) & ~7;
+ const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+ const int height_ver = (h + 3) & ~3;
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[11], d[4];
+ __m128i f[4];
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef, f);
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+ // horizontal 4x8
+ do {
+ load_8bit_8x8(src + 2, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped)
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[3]);
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+ // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D
+ transpose_16bit_4x8(&s[3], &s[3]);
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70
+ d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71
+ d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72
+ d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 43 53 63 73
+
+ // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
+ // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
+ d[0] = _mm_packus_epi16(d[0], d[2]);
+ d[1] = _mm_packus_epi16(d[1], d[3]);
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73
+ d[2] = _mm_unpacklo_epi16(d[0], d[1]);
+ d[3] = _mm_unpackhi_epi16(d[0], d[1]);
+ // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33
+ // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73
+ d[0] = _mm_unpacklo_epi32(d[2], d[3]);
+ d[1] = _mm_unpackhi_epi32(d[2], d[3]);
+ store_8bit_8x4_from_16x2(d, t, 2 * width_hor);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+
+ t += 8;
+ x -= 4;
+ } while (x);
+ src += 8 * src_stride - 2 * width_hor;
+ t += 6 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x4
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+ s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+ s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
+ t += 6 * width_hor;
+ y = height_ver;
+
+ do {
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77
+ // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77
+ // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 77
+ loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
+ t += 8 * width_hor;
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07
+ d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17
+ d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27
+ d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ d[1] = _mm_packus_epi16(d[2], d[3]);
+ store_8bit_8x4_from_16x2(d, dst, dst_stride);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+
+ dst += 4 * dst_stride;
+ y -= 4;
+ } while (y);
+ t -= width_hor * (2 * height_ver + 6);
+ t += 16;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
+ __m128i *const f);
+
+typedef __m128i (*convolve8_funcs)(const __m128i *const s,
+ const __m128i *const f);
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const InterpKernel *const coef,
+ const int phase,
+ uint8_t *const temp_buffer) {
+ static const int step_q4 = 16 * 4 / 3;
+ const int width_hor = (w + 5) - ((w + 5) % 6);
+ const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels
+ const int width_ver = (w + 7) & ~7;
+ // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+ // above and (SUBPEL_TAPS / 2) extra rows below.
+ const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ const int height_ver = (h + 5) - ((h + 5) % 6);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[12], d[6], dd[4];
+ __m128i f0[4], f1[5], f2[5];
+ // The offset of the first row is always less than 1 pixel.
+ const int offset1_q4 = phase + 1 * step_q4;
+ const int offset2_q4 = phase + 2 * step_q4;
+ // offset_idxx indicates the pixel offset is even (0) or odd (1).
+ // It's used to choose the src offset and filter coefficient offset.
+ const int offset_idx1 = (offset1_q4 >> 4) & 1;
+ const int offset_idx2 = (offset2_q4 >> 4) & 1;
+ static const shuffle_filter_funcs shuffle_filter_func_list[2] = {
+ shuffle_filter_ssse3, shuffle_filter_odd_ssse3
+ };
+ static const convolve8_funcs convolve8_func_list[2] = {
+ convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
+ };
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef[(phase + 0 * step_q4) & SUBPEL_MASK], f0);
+ shuffle_filter_func_list[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
+ shuffle_filter_func_list[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
+
+ // Sub 64 to avoid overflow.
+ // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
+ // Coef 128 is in either fx[1] or fx[2] depending on the phase idx.
+ // When filter phase idx is 1, the two biggest coefficients are shuffled
+ // together, and the sum of them are always no less than 128. Sub 64 here.
+ // After the subtraction, when the sum of all positive coefficients are no
+ // larger than 128, and the sum of all negative coefficients are no
+ // less than -128, there will be no overflow in the convolve8 functions.
+ f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
+ f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
+ f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;
+
+ // horizontal 6x8
+ do {
+ load_8bit_8x8(src, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[4]);
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+ // OC 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D
+ // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F
+ transpose_16bit_4x8(&s[4], &s[4]);
+
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+ d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
+ d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
+ d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+ d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+ d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+ // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
+ // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx
+ // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx
+ dd[0] = _mm_packus_epi16(d[0], d[2]);
+ dd[1] = _mm_packus_epi16(d[1], d[3]);
+ dd[2] = _mm_packus_epi16(d[4], d[4]);
+ dd[3] = _mm_packus_epi16(d[5], d[5]);
+
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73
+ // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75
+ d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
+ d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
+ d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);
+
+ // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33
+ // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73
+ // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx
+ // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx
+ dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
+ dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
+ dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
+ dd[3] = _mm_unpackhi_epi32(d[2], d[2]);
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx
+ // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx
+ // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx
+ // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx
+ d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
+ d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
+ d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
+ d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);
+
+ // store 4 extra pixels
+ storeu_8bit_16x4(d, t, stride_hor);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+ s[3] = s[7];
+
+ t += 12;
+ x -= 6;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor / 3;
+ t += 3 * stride_hor + 4;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x6
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ loadu_8bit_16x4(t, stride_hor, s);
+ y = height_ver;
+
+ do {
+ // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
+ // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+ // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7
+ // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7
+ t += 4 * stride_hor;
+ loadu_8bit_16x4(t, stride_hor, &s[4]);
+
+ d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+ d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
+ d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
+ d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+ d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+ d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ d[2] = _mm_packus_epi16(d[2], d[3]);
+ d[4] = _mm_packus_epi16(d[4], d[5]);
+
+ _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+ _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
+ _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
+ _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+ s[3] = s[7];
+
+ dst += 6 * dst_stride;
+ y -= 6;
+ } while (y);
+ t -= stride_hor * 2 * height_ver / 3;
+ t += 16;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
+ const __m128i *const f) {
+ __m128i ss[4], temp;
+
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+ temp = convolve8_8_ssse3(ss, f);
+ return _mm_packus_epi16(temp, temp);
+}
+
+// Only calculate odd columns since even columns are just src pixels' copies.
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst,
+ const int w, const __m128i *const f) {
+ int x = w;
+
+ do {
+ __m128i s[8], temp;
+ s[0] = _mm_loadl_epi64((const __m128i *)(src + 0));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src + 1));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src + 2));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src + 3));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src + 4));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src + 5));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src + 6));
+ s[7] = _mm_loadl_epi64((const __m128i *)(src + 7));
+ temp = scale_1_to_2_phase_0_kernel(s, f);
+ _mm_storel_epi64((__m128i *)dst, temp);
+ src += 8;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+static void scale_plane_1_to_2_phase_0(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int src_w, const int src_h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ int max_width;
+ int y;
+ uint8_t *tmp[9];
+ __m128i f[4];
+
+ max_width = (src_w + 7) & ~7;
+ tmp[0] = temp_buffer + 0 * max_width;
+ tmp[1] = temp_buffer + 1 * max_width;
+ tmp[2] = temp_buffer + 2 * max_width;
+ tmp[3] = temp_buffer + 3 * max_width;
+ tmp[4] = temp_buffer + 4 * max_width;
+ tmp[5] = temp_buffer + 5 * max_width;
+ tmp[6] = temp_buffer + 6 * max_width;
+ tmp[7] = temp_buffer + 7 * max_width;
+
+ shuffle_filter_ssse3(coef, f);
+
+ scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f);
+ scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f);
+ scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f);
+ scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f);
+ scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f);
+ scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f);
+ scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f);
+
+ y = src_h;
+ do {
+ int x;
+ scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f);
+ for (x = 0; x < max_width; x += 8) {
+ __m128i s[8], C, D, CD;
+
+ // Even rows
+ const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x));
+ const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+ const __m128i ab = _mm_unpacklo_epi8(a, b);
+ _mm_storeu_si128((__m128i *)(dst + 2 * x), ab);
+
+ // Odd rows
+ // Even columns
+ load_8bit_8x8(src + x - 3 * src_stride, src_stride, s);
+ C = scale_1_to_2_phase_0_kernel(s, f);
+
+ // Odd columns
+ s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x));
+ s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x));
+ s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x));
+ s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+ s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x));
+ s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x));
+ s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x));
+ s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x));
+ D = scale_1_to_2_phase_0_kernel(s, f);
+
+ CD = _mm_unpacklo_epi8(C, D);
+ _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD);
+ }
+
+ src += src_stride;
+ dst += 2 * dst_stride;
+ tmp[8] = tmp[0];
+ tmp[0] = tmp[1];
+ tmp[1] = tmp[2];
+ tmp[2] = tmp[3];
+ tmp[3] = tmp[4];
+ tmp[4] = tmp[5];
+ tmp[5] = tmp[6];
+ tmp[6] = tmp[7];
+ tmp[7] = tmp[8];
+ } while (--y);
+}
+
+// There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling and 2x upscaling
+// in SSSE3.
+static INLINE bool has_normative_scaler_ssse3(const int src_width,
+ const int src_height,
+ const int dst_width,
+ const int dst_height) {
+ const bool has_normative_scaler =
+ (2 * dst_width == src_width && 2 * dst_height == src_height) ||
+ (4 * dst_width == src_width && 4 * dst_height == src_height) ||
+ (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) ||
+ (dst_width == src_width * 2 && dst_height == src_height * 2);
+
+ return has_normative_scaler;
+}
+
+void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ const InterpFilter filter,
+ const int phase, const int num_planes) {
+ bool has_normative_scaler =
+ has_normative_scaler_ssse3(src->y_crop_width, src->y_crop_height,
+ dst->y_crop_width, dst->y_crop_height);
+
+ if (num_planes > 1) {
+ has_normative_scaler =
+ has_normative_scaler &&
+ has_normative_scaler_ssse3(src->uv_crop_width, src->uv_crop_height,
+ dst->uv_crop_width, dst->uv_crop_height);
+ }
+
+ if (!has_normative_scaler) {
+ av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
+ return;
+ }
+
+ // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+ // the static analysis warnings.
+ int malloc_failed = 0;
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+ const int is_uv = i > 0;
+ const int src_w = src->crop_widths[is_uv];
+ const int src_h = src->crop_heights[is_uv];
+ const int src_y_w = (src->crop_widths[0] + 1) & ~1;
+ const int dst_w = dst->crop_widths[is_uv];
+ const int dst_h = dst->crop_heights[is_uv];
+ const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
+ const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
+
+ if (2 * dst_w == src_w && 2 * dst_h == src_h) {
+ // 2 to 1
+ if (phase == 0) {
+ scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h);
+ } else if (filter == BILINEAR) {
+ const int16_t c0 = av1_bilinear_filters[phase][3];
+ const int16_t c1 = av1_bilinear_filters[phase][4];
+ const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0
+ scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, c0c1);
+ } else {
+ const int buffer_stride = (dst_y_w + 3) & ~3;
+ const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height);
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
+ }
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, interp_kernel[phase], temp_buffer);
+ free(temp_buffer);
+ }
+ } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+ // 4 to 1
+ if (phase == 0) {
+ scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h);
+ } else if (filter == BILINEAR) {
+ const int16_t c0 = av1_bilinear_filters[phase][3];
+ const int16_t c1 = av1_bilinear_filters[phase][4];
+ const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0
+ scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, c0c1);
+ } else {
+ const int buffer_stride = (dst_y_w + 1) & ~1;
+ const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+ // When dst_w is 1 or 2, we need extra padding to avoid heap read
+ // overflow
+ const int extra_padding = 16;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
+ }
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, interp_kernel[phase], temp_buffer);
+ free(temp_buffer);
+ }
+ } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+ // 4 to 3
+ const int buffer_stride_hor = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
+ const int buffer_stride_ver = (dst_y_w + 7) & ~7;
+ const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ // When the vertical filter reads more pixels than the horizontal filter
+ // generated in each row, we need extra padding to avoid heap read
+ // overflow. For example, the horizontal filter generates 18 pixels but
+ // the vertical filter reads 24 pixels in a row. The difference is
+ // multiplied by 2 since two rows are interlaced together in the
+ // optimization.
+ const int extra_padding =
+ (buffer_stride_ver > buffer_stride_hor)
+ ? 2 * (buffer_stride_ver - buffer_stride_hor)
+ : 0;
+ const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
+ uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
+ }
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, interp_kernel, phase, temp_buffer);
+ free(temp_buffer);
+ } else {
+ assert(dst_w == src_w * 2 && dst_h == src_h * 2);
+ // 1 to 2
+ uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_y_w + 7) & ~7));
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
+ }
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_1_to_2_phase_0(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], src_w,
+ src_h, interp_kernel[8], temp_buffer);
+ free(temp_buffer);
+ }
+ }
+
+ if (malloc_failed) {
+ av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
+ } else {
+ aom_extend_frame_borders(dst, num_planes);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c
new file mode 100644
index 0000000000..5ab6c46f8a
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_avx2.c
@@ -0,0 +1,724 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/restoration.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to
+// 32-bit precision and return them in an AVX2 register.
+static __m256i yy256_load_extend_8_32(const void *p) {
+ return _mm256_cvtepu8_epi32(xx_loadl_64(p));
+}
+
+// Load 8 halfwords from the possibly-misaligned pointer p, extend each
+// halfword to 32-bit precision and return them in an AVX2 register.
+static __m256i yy256_load_extend_16_32(const void *p) {
+ return _mm256_cvtepu16_epi32(xx_loadu_128(p));
+}
+
+// Compute the scan of an AVX2 register holding 8 32-bit integers. If the
+// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ...,
+// x0+x1+...+x7
+//
+// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers
+// (assumed small enough to be able to add them without overflow).
+//
+// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a.
+//
+// x = [h g f e][d c b a]
+// x01 = [g f e 0][c b a 0]
+// x02 = [g+h f+g e+f e][c+d b+c a+b a]
+// x03 = [e+f e 0 0][a+b a 0 0]
+// x04 = [e->h e->g e->f e][a->d a->c a->b a]
+// s = a->d
+// s01 = [a->d a->d a->d a->d]
+// s02 = [a->d a->d a->d a->d][0 0 0 0]
+// ret = [a->h a->g a->f a->e][a->d a->c a->b a]
+static __m256i scan_32(__m256i x) {
+ const __m256i x01 = _mm256_slli_si256(x, 4);
+ const __m256i x02 = _mm256_add_epi32(x, x01);
+ const __m256i x03 = _mm256_slli_si256(x02, 8);
+ const __m256i x04 = _mm256_add_epi32(x02, x03);
+ const int32_t s = _mm256_extract_epi32(x04, 3);
+ const __m128i s01 = _mm_set1_epi32(s);
+ const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1);
+ return _mm256_add_epi32(x04, s02);
+}
+
+// Compute two integral images from src. B sums elements; A sums their
+// squares. The images are offset by one pixel, so will have width and height
+// equal to width + 1, height + 1 and the first row and column will be zero.
+//
+// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple
+// of 8.
+
+static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) {
+ unsigned int i = 0;
+ for (i = 0; i < (count & 0xffffffe0); i += 32) {
+ _mm256_storeu_si256((__m256i *)(dest + i), *zero);
+ _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero);
+ _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero);
+ _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero);
+ }
+ for (; i < (count & 0xfffffff8); i += 8) {
+ _mm256_storeu_si256((__m256i *)(dest + i), *zero);
+ }
+ for (; i < count; i++) {
+ dest[i] = 0;
+ }
+ return dest;
+}
+
+static void integral_images(const uint8_t *src, int src_stride, int width,
+ int height, int32_t *A, int32_t *B,
+ int buf_stride) {
+ const __m256i zero = _mm256_setzero_si256();
+ // Write out the zero top row
+ memset_zero_avx(A, &zero, (width + 8));
+ memset_zero_avx(B, &zero, (width + 8));
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the eight lanes.
+ __m256i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 8) {
+ const int ABj = 1 + j;
+
+ const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
+ const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
+
+ const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride);
+ const __m256i x2 = _mm256_madd_epi16(x1, x1);
+
+ const __m256i sc1 = scan_32(x1);
+ const __m256i sc2 = scan_32(x2);
+
+ const __m256i row1 =
+ _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
+ const __m256i row2 =
+ _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
+
+ yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
+ yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
+ ldiff2 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
+ }
+ }
+}
+
+// Compute two integral images from src. B sums elements; A sums their squares
+//
+// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8.
+static void integral_images_highbd(const uint16_t *src, int src_stride,
+ int width, int height, int32_t *A,
+ int32_t *B, int buf_stride) {
+ const __m256i zero = _mm256_setzero_si256();
+ // Write out the zero top row
+ memset_zero_avx(A, &zero, (width + 8));
+ memset_zero_avx(B, &zero, (width + 8));
+
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the eight lanes.
+ __m256i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 8) {
+ const int ABj = 1 + j;
+
+ const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
+ const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
+
+ const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride);
+ const __m256i x2 = _mm256_madd_epi16(x1, x1);
+
+ const __m256i sc1 = scan_32(x1);
+ const __m256i sc2 = scan_32(x2);
+
+ const __m256i row1 =
+ _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
+ const __m256i row2 =
+ _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
+
+ yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
+ yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
+ ldiff2 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
+ }
+ }
+}
+
+// Compute 8 values of boxsum from the given integral image. ii should point
+// at the middle of the box (for the first value). r is the box radius.
+static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) {
+ const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride);
+ const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride);
+ const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride);
+ const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride);
+ const __m256i u = _mm256_sub_epi32(tr, tl);
+ const __m256i v = _mm256_sub_epi32(br, bl);
+ return _mm256_sub_epi32(v, u);
+}
+
+static __m256i round_for_shift(unsigned shift) {
+ return _mm256_set1_epi32((1 << shift) >> 1);
+}
+
+static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) {
+ __m256i an, bb;
+ if (bit_depth > 8) {
+ const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8));
+ const __m256i rounding_b = round_for_shift(bit_depth - 8);
+ const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
+ const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
+ const __m256i a =
+ _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a);
+ const __m256i b =
+ _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b);
+ // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
+ // mullo to square it
+ bb = _mm256_madd_epi16(b, b);
+ an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb);
+ } else {
+ bb = _mm256_madd_epi16(sum1, sum1);
+ an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n));
+ }
+ return _mm256_sub_epi32(an, bb);
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
+ int width, int height, int buf_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
+ // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+ const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);
+
+ const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks
+ const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
+ __m256i mask[8];
+ for (int idx = 0; idx < 8; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
+ mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; ++i) {
+ for (int j = -1; j < width + 1; j += 8) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(8, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 8) {
+ sum1 = _mm256_and_si256(mask[idx], sum1);
+ sum2 = _mm256_and_si256(mask[idx], sum2);
+ }
+
+ const __m256i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m256i z = _mm256_min_epi32(
+ _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm256_set1_epi32(255));
+
+ const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);
+
+ yy_storeu_256(A + i * buf_stride + j, a_res);
+
+ const __m256i a_complement =
+ _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // are both less than 256, so we can multiply them first.
+ const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
+ const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
+ const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
+ SGRPROJ_RECIP_BITS);
+
+ yy_storeu_256(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
+// where the outer four corners have weight 3 and all other pixels have weight
+// 4.
+//
+// Pixels are indexed as follows:
+// xtl xt xtr
+// xl x xr
+// xbl xb xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+// = 4 * (fours + threes) - threes
+// = (fours + threes) << 2 - threes
+static INLINE __m256i cross_sum(const int32_t *buf, int stride) {
+ const __m256i xtl = yy_loadu_256(buf - 1 - stride);
+ const __m256i xt = yy_loadu_256(buf - stride);
+ const __m256i xtr = yy_loadu_256(buf + 1 - stride);
+ const __m256i xl = yy_loadu_256(buf - 1);
+ const __m256i x = yy_loadu_256(buf);
+ const __m256i xr = yy_loadu_256(buf + 1);
+ const __m256i xbl = yy_loadu_256(buf - 1 + stride);
+ const __m256i xb = yy_loadu_256(buf + stride);
+ const __m256i xbr = yy_loadu_256(buf + 1 + stride);
+
+ const __m256i fours = _mm256_add_epi32(
+ xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x))));
+ const __m256i threes =
+ _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
+
+ return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2),
+ threes);
+}
+
+// The final filter for self-guided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum implementation above).
+static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride, const void *dgd8,
+ int dgd_stride, int width, int height, int highbd) {
+ const int nb = 5;
+ const __m256i rounding =
+ round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
+ const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);
+
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+ __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
+ SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ }
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+ const int32_t *D, int width, int height,
+ int buf_stride, int bit_depth, int sgr_params_idx,
+ int radius_idx) {
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
+ // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+ const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);
+
+ const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks
+ const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
+ __m256i mask[8];
+ for (int idx = 0; idx < 8; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
+ mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; i += 2) {
+ for (int j = -1; j < width + 1; j += 8) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(8, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 8) {
+ sum1 = _mm256_and_si256(mask[idx], sum1);
+ sum2 = _mm256_and_si256(mask[idx], sum2);
+ }
+
+ const __m256i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m256i z = _mm256_min_epi32(
+ _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm256_set1_epi32(255));
+
+ const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);
+
+ yy_storeu_256(A + i * buf_stride + j, a_res);
+
+ const __m256i a_complement =
+ _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // are both less than 256, so we can multiply them first.
+ const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
+ const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
+ const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
+ SGRPROJ_RECIP_BITS);
+
+ yy_storeu_256(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 8 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// - buf -
+// xbl xb xbr
+//
+// Pixels are weighted like this:
+// 5 6 5
+// 0 0 0
+// 5 6 5
+//
+// fives = xtl + xtr + xbl + xbr
+// sixes = xt + xb
+// cross_sum = 6 * sixes + 5 * fives
+// = 5 * (fives + sixes) - sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+ const __m256i xtl = yy_loadu_256(buf - 1 - stride);
+ const __m256i xt = yy_loadu_256(buf - stride);
+ const __m256i xtr = yy_loadu_256(buf + 1 - stride);
+ const __m256i xbl = yy_loadu_256(buf - 1 + stride);
+ const __m256i xb = yy_loadu_256(buf + stride);
+ const __m256i xbr = yy_loadu_256(buf + 1 + stride);
+
+ const __m256i fives =
+ _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
+ const __m256i sixes = _mm256_add_epi32(xt, xb);
+ const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
+
+ return _mm256_add_epi32(
+ _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
+ fives_plus_sixes),
+ sixes);
+}
+
+// Calculate 8 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl x xr
+//
+// Pixels are weighted like this:
+// 5 6 5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+// = 4 * (fives + sixes) + (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) {
+ const __m256i xl = yy_loadu_256(buf - 1);
+ const __m256i x = yy_loadu_256(buf);
+ const __m256i xr = yy_loadu_256(buf + 1);
+
+ const __m256i fives = _mm256_add_epi32(xl, xr);
+ const __m256i sixes = x;
+
+ const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
+
+ return _mm256_add_epi32(
+ _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
+ fives_plus_sixes),
+ sixes);
+}
+
+// The final filter for the self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
+static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride,
+ const void *dgd8, int dgd_stride, int width,
+ int height, int highbd) {
+ const int nb0 = 5;
+ const int nb1 = 4;
+
+ const __m256i rounding0 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+ const __m256i rounding1 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (int j = 0; j < width; j += 8) {
+ const __m256i a =
+ cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
+ const __m256i b =
+ cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
+
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+ __m256i w =
+ _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
+ SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ } else { // odd row
+ for (int j = 0; j < width; j += 8) {
+ const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+ const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
+
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+ __m256i w =
+ _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
+ SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ }
+ }
+}
+
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0,
+ int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth,
+ int highbd) {
+ // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
+ // Ctl and Dtl is 32-byte aligned.
+ const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
+
+ int32_t *buf = aom_memalign(
+ 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3));
+ if (!buf) return -1;
+
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 32 bytes for efficiency.
+ int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);
+
+ // The "tl" pointers point at the top-left of the initialised data for the
+ // array.
+ int32_t *Atl = buf + 0 * buf_elts + 7;
+ int32_t *Btl = buf + 1 * buf_elts + 7;
+ int32_t *Ctl = buf + 2 * buf_elts + 7;
+ int32_t *Dtl = buf + 3 * buf_elts + 7;
+
+ // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
+ // there's a zero row and column in A, B (integral images), so we move down
+ // and right one for them.
+ const int buf_diag_border =
+ SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
+
+ int32_t *A0 = Atl + 1 + buf_stride;
+ int32_t *B0 = Btl + 1 + buf_stride;
+ int32_t *C0 = Ctl + 1 + buf_stride;
+ int32_t *D0 = Dtl + 1 + buf_stride;
+
+ // Finally, A, B, C, D point at position (0, 0).
+ int32_t *A = A0 + buf_diag_border;
+ int32_t *B = B0 + buf_diag_border;
+ int32_t *C = C0 + buf_diag_border;
+ int32_t *D = D0 + buf_diag_border;
+
+ const int dgd_diag_border =
+ SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
+ const uint8_t *dgd0 = dgd8 - dgd_diag_border;
+
+ // Generate integral images from the input. C will contain sums of squares; D
+ // will contain just sums
+ if (highbd)
+ integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+ height_ext, Ctl, Dtl, buf_stride);
+ else
+ integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
+ buf_stride);
+
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+ // Write to flt0 and flt1
+ // If params->r == 0 we skip the corresponding filter. We only allow one of
+ // the radii to be 0, as having both equal to 0 would be equivalent to
+ // skipping SGR entirely.
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+ assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+ assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+ if (params->r[0] > 0) {
+ calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
+ sgr_params_idx, 0);
+ final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+ width, height, highbd);
+ }
+
+ if (params->r[1] > 0) {
+ calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
+ 1);
+ final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+ height, highbd);
+ }
+ aom_free(buf);
+ return 0;
+}
+
+int av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ const int ret = av1_selfguided_restoration_avx2(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ if (ret != 0) return ret;
+ const sgr_params_type *const params = &av1_sgr_params[eps];
+ int xq[2];
+ av1_decode_xq(xqd, xq, params);
+
+ __m256i xq0 = _mm256_set1_epi32(xq[0]);
+ __m256i xq1 = _mm256_set1_epi32(xq[1]);
+
+ for (int i = 0; i < height; ++i) {
+ // Calculate output in batches of 16 pixels
+ for (int j = 0; j < width; j += 16) {
+ const int k = i * width + j;
+ const int m = i * dst_stride + j;
+
+ const uint8_t *dat8ij = dat8 + i * stride + j;
+ __m256i ep_0, ep_1;
+ __m128i src_0, src_1;
+ if (highbd) {
+ src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+ src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
+ ep_0 = _mm256_cvtepu16_epi32(src_0);
+ ep_1 = _mm256_cvtepu16_epi32(src_1);
+ } else {
+ src_0 = xx_loadu_128(dat8ij);
+ ep_0 = _mm256_cvtepu8_epi32(src_0);
+ ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8));
+ }
+
+ const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
+ const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
+
+ __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
+ __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
+
+ if (params->r[0] > 0) {
+ const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0);
+ v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));
+
+ const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1);
+ v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
+ }
+
+ if (params->r[1] > 0) {
+ const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0);
+ v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));
+
+ const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1);
+ v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
+ }
+
+ const __m256i rounding =
+ round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m256i w_0 = _mm256_srai_epi32(
+ _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m256i w_1 = _mm256_srai_epi32(
+ _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ if (highbd) {
+ // Pack into 16 bits and clamp to [0, 2^bit_depth)
+ // Note that packing into 16 bits messes up the order of the bits,
+ // so we use a permute function to correct this
+ const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
+ const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+ const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
+ const __m256i res = _mm256_min_epi16(tmp2, max);
+ yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
+ } else {
+ // Pack into 8 bits and clamp to [0, 256)
+ // Note that each pack messes up the order of the bits,
+ // so we use a permute function to correct this
+ const __m256i tmp = _mm256_packs_epi32(w_0, w_1);
+ const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+ const __m256i res =
+ _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */);
+ const __m128i res2 =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8));
+ xx_storeu_128(dst8 + m, res2);
+ }
+ }
+ }
+ return 0;
+}
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
new file mode 100644
index 0000000000..ac850f5691
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -0,0 +1,662 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/restoration.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Load 4 bytes from the possibly-misaligned pointer p, extend each byte to
+// 32-bit precision and return them in an SSE register.
+static __m128i xx_load_extend_8_32(const void *p) {
+ return _mm_cvtepu8_epi32(xx_loadl_32(p));
+}
+
+// Load 4 halfwords from the possibly-misaligned pointer p, extend each
+// halfword to 32-bit precision and return them in an SSE register.
+static __m128i xx_load_extend_16_32(const void *p) {
+ return _mm_cvtepu16_epi32(xx_loadl_64(p));
+}
+
+// Compute the scan of an SSE register holding 4 32-bit integers. If the
+// register holds x0..x3 then the scan will hold x0, x0+x1, x0+x1+x2,
+// x0+x1+x2+x3
+static __m128i scan_32(__m128i x) {
+ const __m128i x01 = _mm_add_epi32(x, _mm_slli_si128(x, 4));
+ return _mm_add_epi32(x01, _mm_slli_si128(x01, 8));
+}
+
+// Compute two integral images from src. B sums elements; A sums their
+// squares. The images are offset by one pixel, so will have width and height
+// equal to width + 1, height + 1 and the first row and column will be zero.
+//
+// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple
+// of 4.
+static void integral_images(const uint8_t *src, int src_stride, int width,
+ int height, int32_t *A, int32_t *B,
+ int buf_stride) {
+ // Write out the zero top row
+ memset(A, 0, sizeof(*A) * (width + 1));
+ memset(B, 0, sizeof(*B) * (width + 1));
+
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the four lanes.
+ __m128i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 4) {
+ const int ABj = 1 + j;
+
+ const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);
+ const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);
+
+ const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride);
+ const __m128i x2 = _mm_madd_epi16(x1, x1);
+
+ const __m128i sc1 = scan_32(x1);
+ const __m128i sc2 = scan_32(x2);
+
+ const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
+ const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
+
+ xx_store_128(B + ABj + (i + 1) * buf_stride, row1);
+ xx_store_128(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);
+ ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
+ }
+ }
+}
+
+// Compute two integral images from src. B sums elements; A sums their squares
+//
+// A and B should be aligned to 16 bytes. buf_stride should be a multiple of 4.
+static void integral_images_highbd(const uint16_t *src, int src_stride,
+ int width, int height, int32_t *A,
+ int32_t *B, int buf_stride) {
+ // Write out the zero top row
+ memset(A, 0, sizeof(*A) * (width + 1));
+ memset(B, 0, sizeof(*B) * (width + 1));
+
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the four lanes.
+ __m128i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 4) {
+ const int ABj = 1 + j;
+
+ const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);
+ const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);
+
+ const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride);
+ const __m128i x2 = _mm_madd_epi16(x1, x1);
+
+ const __m128i sc1 = scan_32(x1);
+ const __m128i sc2 = scan_32(x2);
+
+ const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
+ const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
+
+ xx_store_128(B + ABj + (i + 1) * buf_stride, row1);
+ xx_store_128(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);
+ ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
+ }
+ }
+}
+
+// Compute 4 values of boxsum from the given integral image. ii should point
+// at the middle of the box (for the first value). r is the box radius.
+static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) {
+ const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride);
+ const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride);
+ const __m128i bl = xx_loadu_128(ii - (r + 1) + r * stride);
+ const __m128i br = xx_loadu_128(ii + (r + 0) + r * stride);
+ const __m128i u = _mm_sub_epi32(tr, tl);
+ const __m128i v = _mm_sub_epi32(br, bl);
+ return _mm_sub_epi32(v, u);
+}
+
+static __m128i round_for_shift(unsigned shift) {
+ return _mm_set1_epi32((1 << shift) >> 1);
+}
+
+static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) {
+ __m128i an, bb;
+ if (bit_depth > 8) {
+ const __m128i rounding_a = round_for_shift(2 * (bit_depth - 8));
+ const __m128i rounding_b = round_for_shift(bit_depth - 8);
+ const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
+ const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
+ const __m128i a = _mm_srl_epi32(_mm_add_epi32(sum2, rounding_a), shift_a);
+ const __m128i b = _mm_srl_epi32(_mm_add_epi32(sum1, rounding_b), shift_b);
+ // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
+ // mullo to square it
+ bb = _mm_madd_epi16(b, b);
+ an = _mm_max_epi32(_mm_mullo_epi32(a, _mm_set1_epi32(n)), bb);
+ } else {
+ bb = _mm_madd_epi16(sum1, sum1);
+ an = _mm_mullo_epi32(sum2, _mm_set1_epi32(n));
+ }
+ return _mm_sub_epi32(an, bb);
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
+ int width, int height, int buf_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+ // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+ const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]);
+
+ const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks
+ const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
+ __m128i mask[4];
+ for (int idx = 0; idx < 4; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
+ mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; ++i) {
+ for (int j = -1; j < width + 1; j += 4) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(4, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 4) {
+ sum1 = _mm_and_si128(mask[idx], sum1);
+ sum2 = _mm_and_si128(mask[idx], sum2);
+ }
+
+ const __m128i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m128i z = _mm_min_epi32(
+ _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm_set1_epi32(255));
+
+ // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+ // gather using scalar loads.
+ const __m128i a_res =
+ _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)],
+ av1_x_by_xplus1[_mm_extract_epi32(z, 2)],
+ av1_x_by_xplus1[_mm_extract_epi32(z, 1)],
+ av1_x_by_xplus1[_mm_extract_epi32(z, 0)]);
+
+ xx_storeu_128(A + i * buf_stride + j, a_res);
+
+ const __m128i a_complement =
+ _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // are both less than 256, so we can multiply them first.
+ const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+ const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+ const __m128i b_res =
+ _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
+
+ xx_storeu_128(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter
+// where the outer four corners have weight 3 and all other pixels have weight
+// 4.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// xl x xr
+// xbl xb xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+// = 4 * (fours + threes) - threes
+// = (fours + threes) << 2 - threes
+static INLINE __m128i cross_sum(const int32_t *buf, int stride) {
+ const __m128i xtl = xx_loadu_128(buf - 1 - stride);
+ const __m128i xt = xx_loadu_128(buf - stride);
+ const __m128i xtr = xx_loadu_128(buf + 1 - stride);
+ const __m128i xl = xx_loadu_128(buf - 1);
+ const __m128i x = xx_loadu_128(buf);
+ const __m128i xr = xx_loadu_128(buf + 1);
+ const __m128i xbl = xx_loadu_128(buf - 1 + stride);
+ const __m128i xb = xx_loadu_128(buf + stride);
+ const __m128i xbr = xx_loadu_128(buf + 1 + stride);
+
+ const __m128i fours = _mm_add_epi32(
+ xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x))));
+ const __m128i threes =
+ _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl)));
+
+ return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes);
+}
+
+// The final filter for self-guided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum implementation above).
+static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride, const void *dgd8,
+ int dgd_stride, int width, int height, int highbd) {
+ const int nb = 5;
+ const __m128i rounding =
+ round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride);
+ const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
+ SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
+ }
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+ const int32_t *D, int width, int height,
+ int buf_stride, int bit_depth, int sgr_params_idx,
+ int radius_idx) {
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+ // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+ const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]);
+
+ const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks
+ const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
+ __m128i mask[4];
+ for (int idx = 0; idx < 4; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
+ mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; i += 2) {
+ for (int j = -1; j < width + 1; j += 4) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(4, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 4) {
+ sum1 = _mm_and_si128(mask[idx], sum1);
+ sum2 = _mm_and_si128(mask[idx], sum2);
+ }
+
+ const __m128i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m128i z = _mm_min_epi32(
+ _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm_set1_epi32(255));
+
+ // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+ // gather using scalar loads.
+ const __m128i a_res =
+ _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)],
+ av1_x_by_xplus1[_mm_extract_epi32(z, 2)],
+ av1_x_by_xplus1[_mm_extract_epi32(z, 1)],
+ av1_x_by_xplus1[_mm_extract_epi32(z, 0)]);
+
+ xx_storeu_128(A + i * buf_stride + j, a_res);
+
+ const __m128i a_complement =
+ _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // are both less than 256, so we can multiply them first.
+ const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+ const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+ const __m128i b_res =
+ _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
+
+ xx_storeu_128(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// - buf -
+// xbl xb xbr
+//
+// Pixels are weighted like this:
+// 5 6 5
+// 0 0 0
+// 5 6 5
+//
+// fives = xtl + xtr + xbl + xbr
+// sixes = xt + xb
+// cross_sum = 6 * sixes + 5 * fives
+// = 5 * (fives + sixes) - sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+ const __m128i xtl = xx_loadu_128(buf - 1 - stride);
+ const __m128i xt = xx_loadu_128(buf - stride);
+ const __m128i xtr = xx_loadu_128(buf + 1 - stride);
+ const __m128i xbl = xx_loadu_128(buf - 1 + stride);
+ const __m128i xb = xx_loadu_128(buf + stride);
+ const __m128i xbr = xx_loadu_128(buf + 1 + stride);
+
+ const __m128i fives =
+ _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl)));
+ const __m128i sixes = _mm_add_epi32(xt, xb);
+ const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes);
+
+ return _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes),
+ sixes);
+}
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl x xr
+//
+// Pixels are weighted like this:
+// 5 6 5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+// = 4 * (fives + sixes) + (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) {
+ const __m128i xl = xx_loadu_128(buf - 1);
+ const __m128i x = xx_loadu_128(buf);
+ const __m128i xr = xx_loadu_128(buf + 1);
+
+ const __m128i fives = _mm_add_epi32(xl, xr);
+ const __m128i sixes = x;
+
+ const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes);
+
+ return _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes),
+ sixes);
+}
+
+// The final filter for the self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
+static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride,
+ const void *dgd8, int dgd_stride, int width,
+ int height, int highbd) {
+ const int nb0 = 5;
+ const int nb1 = 4;
+
+ const __m128i rounding0 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+ const __m128i rounding1 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (int j = 0; j < width; j += 4) {
+ const __m128i a =
+ cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
+ const __m128i b =
+ cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0),
+ SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
+ } else { // odd row
+ for (int j = 0; j < width; j += 4) {
+ const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+ const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1),
+ SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
+ }
+ }
+}
+
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
+ int height, int dgd_stride, int32_t *flt0,
+ int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth,
+ int highbd) {
+ int32_t *buf = (int32_t *)aom_memalign(
+ 16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
+ if (!buf) return -1;
+ memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
+
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes for efficiency.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+
+ // The "tl" pointers point at the top-left of the initialised data for the
+ // array. Adding 3 here ensures that column 1 is 16-byte aligned.
+ int32_t *Atl = buf + 0 * RESTORATION_PROC_UNIT_PELS + 3;
+ int32_t *Btl = buf + 1 * RESTORATION_PROC_UNIT_PELS + 3;
+ int32_t *Ctl = buf + 2 * RESTORATION_PROC_UNIT_PELS + 3;
+ int32_t *Dtl = buf + 3 * RESTORATION_PROC_UNIT_PELS + 3;
+
+ // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
+ // there's a zero row and column in A, B (integral images), so we move down
+ // and right one for them.
+ const int buf_diag_border =
+ SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
+
+ int32_t *A0 = Atl + 1 + buf_stride;
+ int32_t *B0 = Btl + 1 + buf_stride;
+ int32_t *C0 = Ctl + 1 + buf_stride;
+ int32_t *D0 = Dtl + 1 + buf_stride;
+
+ // Finally, A, B, C, D point at position (0, 0).
+ int32_t *A = A0 + buf_diag_border;
+ int32_t *B = B0 + buf_diag_border;
+ int32_t *C = C0 + buf_diag_border;
+ int32_t *D = D0 + buf_diag_border;
+
+ const int dgd_diag_border =
+ SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
+ const uint8_t *dgd0 = dgd8 - dgd_diag_border;
+
+ // Generate integral images from the input. C will contain sums of squares; D
+ // will contain just sums
+ if (highbd)
+ integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+ height_ext, Ctl, Dtl, buf_stride);
+ else
+ integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
+ buf_stride);
+
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+ // Write to flt0 and flt1
+ // If params->r == 0 we skip the corresponding filter. We only allow one of
+ // the radii to be 0, as having both equal to 0 would be equivalent to
+ // skipping SGR entirely.
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+ assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+ assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+ if (params->r[0] > 0) {
+ calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
+ sgr_params_idx, 0);
+ final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+ width, height, highbd);
+ }
+
+ if (params->r[1] > 0) {
+ calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
+ 1);
+ final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+ height, highbd);
+ }
+ aom_free(buf);
+ return 0;
+}
+
+int av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ const int ret = av1_selfguided_restoration_sse4_1(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ if (ret != 0) return ret;
+ const sgr_params_type *const params = &av1_sgr_params[eps];
+ int xq[2];
+ av1_decode_xq(xqd, xq, params);
+
+ __m128i xq0 = _mm_set1_epi32(xq[0]);
+ __m128i xq1 = _mm_set1_epi32(xq[1]);
+
+ for (int i = 0; i < height; ++i) {
+ // Calculate output in batches of 8 pixels
+ for (int j = 0; j < width; j += 8) {
+ const int k = i * width + j;
+ const int m = i * dst_stride + j;
+
+ const uint8_t *dat8ij = dat8 + i * stride + j;
+ __m128i src;
+ if (highbd) {
+ src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+ } else {
+ src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij));
+ }
+
+ const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS);
+ const __m128i u_0 = _mm_cvtepu16_epi32(u);
+ const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8));
+
+ __m128i v_0 = _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
+ __m128i v_1 = _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
+
+ if (params->r[0] > 0) {
+ const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt0[k]), u_0);
+ v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq0, f1_0));
+
+ const __m128i f1_1 = _mm_sub_epi32(xx_loadu_128(&flt0[k + 4]), u_1);
+ v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq0, f1_1));
+ }
+
+ if (params->r[1] > 0) {
+ const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0);
+ v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq1, f2_0));
+
+ const __m128i f2_1 = _mm_sub_epi32(xx_loadu_128(&flt1[k + 4]), u_1);
+ v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq1, f2_1));
+ }
+
+ const __m128i rounding =
+ round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
+ SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
+ SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ if (highbd) {
+ // Pack into 16 bits and clamp to [0, 2^bit_depth)
+ const __m128i tmp = _mm_packus_epi32(w_0, w_1);
+ const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
+ const __m128i res = _mm_min_epi16(tmp, max);
+ xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res);
+ } else {
+ // Pack into 8 bits and clamp to [0, 256)
+ const __m128i tmp = _mm_packs_epi32(w_0, w_1);
+ const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
+ xx_storel_64(dst8 + m, res);
+ }
+ }
+ }
+ return 0;
+}
diff --git a/third_party/aom/av1/common/x86/warp_plane_avx2.c b/third_party/aom/av1/common/x86/warp_plane_avx2.c
new file mode 100644
index 0000000000..663b8cde93
--- /dev/null
+++ b/third_party/aom/av1/common/x86/warp_plane_avx2.c
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "aom_dsp/x86/synonyms.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = {
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = {
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = {
+ 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+ 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = {
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = {
+ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = {
+ 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
+ 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = {
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
+ 5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
+ 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7,
+ 9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10,
+ 10, 12, 5, 7, 7, 9, 9, 11, 11, 13 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4,
+ 6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
+ 7, 9, 2, 4, 4, 6, 6, 8, 8, 10 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8,
+ 10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11,
+ 11, 13, 6, 8, 8, 10, 10, 12, 12, 14 };
+
+static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out,
+ __m256i *coeff,
+ const __m256i *shuffle_src,
+ const __m256i *round_const,
+ const __m128i *shift, int row) {
+ const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]);
+ const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]);
+ const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]);
+ const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]);
+
+ const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]);
+ const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]);
+ const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]);
+ const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]);
+
+ const __m256i res_even = _mm256_add_epi16(res_02, res_46);
+ const __m256i res_odd = _mm256_add_epi16(res_13, res_57);
+ const __m256i res =
+ _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const);
+ horz_out[row] = _mm256_srl_epi16(res, *shift);
+}
+
+static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta,
+ int sx,
+ __m256i *coeff) {
+ __m128i tmp_0 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_1 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_2 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_3 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+
+ __m128i tmp_4 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_5 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_6 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_7 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+
+ __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0);
+ __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2);
+ __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1);
+ __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3);
+
+ __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4);
+ __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6);
+ __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5);
+ __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7);
+
+ __m128i tmp_8 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1);
+
+ __m128i tmp_9 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1);
+
+ __m128i tmp_10 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1);
+
+ __m128i tmp_11 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1);
+
+ tmp_2 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1);
+
+ tmp_3 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1);
+
+ tmp_6 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1);
+
+ tmp_7 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1);
+
+ const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256);
+ const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256);
+ const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256);
+ const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256);
+
+ const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
+ const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
+ const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
+ const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
+
+ coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
+ coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
+ coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
+ coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
+}
+
+static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx,
+ __m256i *coeff) {
+ __m128i tmp_0 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_1 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_2 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_3 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_4 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_5 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_6 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_7 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+ tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+ tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+ tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+ tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+ const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0);
+ const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1);
+ const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4);
+ const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5);
+
+ const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
+ const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
+ const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
+ const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
+
+ coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
+ coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
+ coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
+ coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
+}
+
+static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx,
+ __m256i *coeff) {
+ const __m128i tmp_0 =
+ _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_1 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]);
+
+ const __m256i res_0 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);
+
+ coeff[0] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
+ coeff[1] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
+ coeff[2] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
+ coeff[3] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
+}
+
+static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out,
+ int sx, int alpha, int beta, int row,
+ const __m256i *shuffle_src,
+ const __m256i *round_const,
+ const __m128i *shift) {
+ __m256i coeff[4];
+ prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff);
+ filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift,
+ row);
+}
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+ __m256i *coeff) {
+ const __m128i tmp_0 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_1 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_2 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_3 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_4 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_5 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_6 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_7 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+ const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+ const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+ const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+ const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+ const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
+ const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
+
+ coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
+ coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
+ coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
+ coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
+}
+
+static INLINE void warp_horizontal_filter_avx2(
+ const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const __m256i *round_const, const __m128i *shift,
+ const __m256i *shuffle_src) {
+ int k, iy, sx, row = 0;
+ __m256i coeff[4];
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src_0 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src_1 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m256i src_01 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
+ sx = sx4 + beta * (k + 4);
+ horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
+ round_const, shift);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+ sx = sx4 + beta * (k + 4);
+ prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+}
+
+static INLINE void warp_horizontal_filter_alpha0_avx2(
+ const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const __m256i *round_const, const __m128i *shift,
+ const __m256i *shuffle_src) {
+ (void)alpha;
+ int k, iy, sx, row = 0;
+ __m256i coeff[4];
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src_0 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src_1 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m256i src_01 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
+ sx = sx4 + beta * (k + 4);
+ prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+ sx = sx4 + beta * (k + 4);
+ prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+}
+
+static INLINE void warp_horizontal_filter_beta0_avx2(
+ const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const __m256i *round_const, const __m128i *shift,
+ const __m256i *shuffle_src) {
+ (void)beta;
+ int k, iy, row = 0;
+ __m256i coeff[4];
+ prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src_0 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src_1 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m256i src_01 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+}
+
+static INLINE void warp_horizontal_filter_alpha0_beta0_avx2(
+ const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const __m256i *round_const, const __m128i *shift,
+ const __m256i *shuffle_src) {
+ (void)alpha;
+ int k, iy, row = 0;
+ __m256i coeff[4];
+ prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src0 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src1 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m256i src_01 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+}
+
+static INLINE void unpack_weights_and_set_round_const_avx2(
+ ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+ __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) {
+ *res_sub_const =
+ _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi16((short)w0);
+ const __m256i wt1 = _mm256_set1_epi16((short)w1);
+ *wt = _mm256_unpacklo_epi16(wt0, wt1);
+}
+
+static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta,
+ int sy,
+ __m256i *coeffs) {
+ __m128i filt_00 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_01 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_02 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_03 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ __m128i filt_10 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_11 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_12 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_13 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ __m256i filt_0 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
+ __m256i filt_1 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
+ __m256i filt_2 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
+ __m256i filt_3 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
+
+ __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+ __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+ __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+ __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+ coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
+ coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
+ coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
+ coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
+
+ filt_00 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_01 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_02 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_03 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ filt_10 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_11 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_12 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_13 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ filt_0 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
+ filt_1 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
+ filt_2 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
+ filt_3 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
+
+ res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+ res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+ res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+ res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+ coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
+ coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
+ coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
+ coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
+}
+
+static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy,
+ __m256i *coeffs) {
+ __m128i filt_00 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_01 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_02 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_03 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
+ __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
+ __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
+ __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);
+
+ __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+ __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+ __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+ __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+ coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
+ coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
+ coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
+ coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
+
+ filt_00 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_01 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_02 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_03 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ filt_0 = _mm256_broadcastsi128_si256(filt_00);
+ filt_1 = _mm256_broadcastsi128_si256(filt_01);
+ filt_2 = _mm256_broadcastsi128_si256(filt_02);
+ filt_3 = _mm256_broadcastsi128_si256(filt_03);
+
+ res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+ res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+ res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+ res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+ coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
+ coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
+ coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
+ coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
+}
+
+static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy,
+ __m256i *coeffs) {
+ const __m128i filt_0 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+ const __m128i filt_1 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));
+
+ __m256i res_0 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);
+
+ coeffs[0] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
+ coeffs[1] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
+ coeffs[2] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
+ coeffs[3] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));
+
+ coeffs[4] = coeffs[0];
+ coeffs[5] = coeffs[1];
+ coeffs[6] = coeffs[2];
+ coeffs[7] = coeffs[3];
+}
+
+static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out,
+ __m256i *src,
+ __m256i *coeffs,
+ __m256i *res_lo,
+ __m256i *res_hi, int row) {
+ const __m256i src_6 = horz_out[row + 3];
+ const __m256i src_7 =
+ _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);
+
+ src[6] = _mm256_unpacklo_epi16(src_6, src_7);
+
+ const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
+ const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
+ const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
+ const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);
+
+ const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2),
+ _mm256_add_epi32(res_4, res_6));
+
+ src[7] = _mm256_unpackhi_epi16(src_6, src_7);
+
+ const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
+ const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
+ const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
+ const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);
+
+ const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
+ _mm256_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ *res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+ *res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+}
+
+static INLINE void store_vertical_filter_output_avx2(
+ const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const,
+ const __m256i *wt, const __m256i *res_sub_const,
+ const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params,
+ int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width,
+ const int round_bits) {
+ __m256i res_lo_1 = *res_lo;
+ __m256i res_hi_1 = *res_hi;
+
+ if (conv_params->is_compound) {
+ __m128i *const p_0 =
+ (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&conv_params
+ ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];
+
+ res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
+ reduce_bits_vert);
+
+ const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
+ __m256i res_lo_16;
+ if (conv_params->do_average) {
+ __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i *const dst8_1 =
+ (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
+ const __m128i p_16_0 = _mm_loadl_epi64(p_0);
+ const __m128i p_16_1 = _mm_loadl_epi64(p_1);
+ const __m256i p_16 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
+ if (conv_params->use_dist_wtd_comp_avg) {
+ const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
+ const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
+ const __m256i shifted_32 =
+ _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
+ }
+ res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
+ res_lo_16 = _mm256_srai_epi16(
+ _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
+ const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
+ const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
+ const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
+ *(int *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
+ *(int *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
+ } else {
+ const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
+ const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
+ _mm_storel_epi64(p_0, temp_lo_16_0);
+ _mm_storel_epi64(p_1, temp_lo_16_1);
+ }
+ if (p_width > 4) {
+ __m128i *const p4_0 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+ __m128i *const p4_1 =
+ (__m128i *)&conv_params
+ ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
+ res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
+ reduce_bits_vert);
+ const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
+ __m256i res_hi_16;
+ if (conv_params->do_average) {
+ __m128i *const dst8_4_0 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ __m128i *const dst8_4_1 =
+ (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
+ const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
+ const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
+ const __m256i p4_16 = _mm256_inserti128_si256(
+ _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
+ if (conv_params->use_dist_wtd_comp_avg) {
+ const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
+ const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
+ const __m256i shifted_32 =
+ _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+ res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
+ }
+ res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
+ res_hi_16 = _mm256_srai_epi16(
+ _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
+ __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
+ const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
+ const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
+ *(int *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
+ *(int *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
+ } else {
+ const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
+ const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
+ _mm_storel_epi64(p4_0, temp_hi_16_0);
+ _mm_storel_epi64(p4_1, temp_hi_16_1);
+ }
+ }
+ } else {
+ const __m256i res_lo_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
+ const __m256i res_hi_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
+
+ const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
+ const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
+ const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
+ const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
+
+ if (p_width == 4) {
+ *(int *)p = _mm_cvtsi128_si32(res_8bit0);
+ *(int *)p1 = _mm_cvtsi128_si32(res_8bit1);
+ } else {
+ _mm_storel_epi64(p, res_8bit0);
+ _mm_storel_epi64(p1, res_8bit1);
+ }
+ }
+}
+
+static INLINE void warp_vertical_filter_avx2(
+ uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+ int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+ int i, int j, int sy4, const int reduce_bits_vert,
+ const __m256i *res_add_const, const int round_bits,
+ const __m256i *res_sub_const, const __m256i *round_bits_const,
+ const __m256i *wt) {
+ int k, row = 0;
+ __m256i src[8];
+ const __m256i src_0 = horz_out[0];
+ const __m256i src_1 =
+ _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
+ const __m256i src_2 = horz_out[1];
+ const __m256i src_3 =
+ _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
+ const __m256i src_4 = horz_out[2];
+ const __m256i src_5 =
+ _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
+
+ src[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ src[2] = _mm256_unpacklo_epi16(src_2, src_3);
+ src[4] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ src[1] = _mm256_unpackhi_epi16(src_0, src_1);
+ src[3] = _mm256_unpackhi_epi16(src_2, src_3);
+ src[5] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
+ int sy = sy4 + delta * (k + 4);
+ __m256i coeffs[8];
+ prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
+ __m256i res_lo, res_hi;
+ filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+ row);
+ store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+ res_sub_const, round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ src[0] = src[2];
+ src[2] = src[4];
+ src[4] = src[6];
+ src[1] = src[3];
+ src[3] = src[5];
+ src[5] = src[7];
+
+ row += 1;
+ }
+}
+
+static INLINE void warp_vertical_filter_gamma0_avx2(
+ uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+ int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+ int i, int j, int sy4, const int reduce_bits_vert,
+ const __m256i *res_add_const, const int round_bits,
+ const __m256i *res_sub_const, const __m256i *round_bits_const,
+ const __m256i *wt) {
+ (void)gamma;
+ int k, row = 0;
+ __m256i src[8];
+ const __m256i src_0 = horz_out[0];
+ const __m256i src_1 =
+ _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
+ const __m256i src_2 = horz_out[1];
+ const __m256i src_3 =
+ _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
+ const __m256i src_4 = horz_out[2];
+ const __m256i src_5 =
+ _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
+
+ src[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ src[2] = _mm256_unpacklo_epi16(src_2, src_3);
+ src[4] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ src[1] = _mm256_unpackhi_epi16(src_0, src_1);
+ src[3] = _mm256_unpackhi_epi16(src_2, src_3);
+ src[5] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
+ int sy = sy4 + delta * (k + 4);
+ __m256i coeffs[8];
+ prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
+ __m256i res_lo, res_hi;
+ filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+ row);
+ store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+ res_sub_const, round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ src[0] = src[2];
+ src[2] = src[4];
+ src[4] = src[6];
+ src[1] = src[3];
+ src[3] = src[5];
+ src[5] = src[7];
+ row += 1;
+ }
+}
+
+static INLINE void warp_vertical_filter_delta0_avx2(
+ uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+ int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+ int i, int j, int sy4, const int reduce_bits_vert,
+ const __m256i *res_add_const, const int round_bits,
+ const __m256i *res_sub_const, const __m256i *round_bits_const,
+ const __m256i *wt) {
+ (void)delta;
+ int k, row = 0;
+ __m256i src[8], coeffs[8];
+ const __m256i src_0 = horz_out[0];
+ const __m256i src_1 =
+ _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
+ const __m256i src_2 = horz_out[1];
+ const __m256i src_3 =
+ _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
+ const __m256i src_4 = horz_out[2];
+ const __m256i src_5 =
+ _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
+
+ src[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ src[2] = _mm256_unpacklo_epi16(src_2, src_3);
+ src[4] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ src[1] = _mm256_unpackhi_epi16(src_0, src_1);
+ src[3] = _mm256_unpackhi_epi16(src_2, src_3);
+ src[5] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);
+
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
+ __m256i res_lo, res_hi;
+ filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+ row);
+ store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+ res_sub_const, round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ src[0] = src[2];
+ src[2] = src[4];
+ src[4] = src[6];
+ src[1] = src[3];
+ src[3] = src[5];
+ src[5] = src[7];
+ row += 1;
+ }
+}
+
+static INLINE void warp_vertical_filter_gamma0_delta0_avx2(
+ uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+ int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+ int i, int j, int sy4, const int reduce_bits_vert,
+ const __m256i *res_add_const, const int round_bits,
+ const __m256i *res_sub_const, const __m256i *round_bits_const,
+ const __m256i *wt) {
+ (void)gamma;
+ int k, row = 0;
+ __m256i src[8], coeffs[8];
+ const __m256i src_0 = horz_out[0];
+ const __m256i src_1 =
+ _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
+ const __m256i src_2 = horz_out[1];
+ const __m256i src_3 =
+ _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
+ const __m256i src_4 = horz_out[2];
+ const __m256i src_5 =
+ _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
+
+ src[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ src[2] = _mm256_unpacklo_epi16(src_2, src_3);
+ src[4] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ src[1] = _mm256_unpackhi_epi16(src_0, src_1);
+ src[3] = _mm256_unpackhi_epi16(src_2, src_3);
+ src[5] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);
+
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
+ __m256i res_lo, res_hi;
+ filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+ row);
+ store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+ res_sub_const, round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ src[0] = src[2];
+ src[2] = src[4];
+ src[4] = src[6];
+ src[1] = src[3];
+ src[3] = src[5];
+ src[5] = src[7];
+ row += 1;
+ }
+}
+
+static INLINE void prepare_warp_vertical_filter_avx2(
+ uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+ int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+ int i, int j, int sy4, const int reduce_bits_vert,
+ const __m256i *res_add_const, const int round_bits,
+ const __m256i *res_sub_const, const __m256i *round_bits_const,
+ const __m256i *wt) {
+ if (gamma == 0 && delta == 0)
+ warp_vertical_filter_gamma0_delta0_avx2(
+ pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
+ i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
+ round_bits_const, wt);
+ else if (gamma == 0 && delta != 0)
+ warp_vertical_filter_gamma0_avx2(
+ pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
+ i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
+ round_bits_const, wt);
+ else if (gamma != 0 && delta == 0)
+ warp_vertical_filter_delta0_avx2(
+ pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
+ i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
+ round_bits_const, wt);
+ else
+ warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
+ p_height, p_stride, p_width, i, j, sy4,
+ reduce_bits_vert, res_add_const, round_bits,
+ res_sub_const, round_bits_const, wt);
+}
+
+static INLINE void prepare_warp_horizontal_filter_avx2(
+ const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const __m256i *round_const, const __m128i *shift,
+ const __m256i *shuffle_src) {
+ if (alpha == 0 && beta == 0)
+ warp_horizontal_filter_alpha0_beta0_avx2(
+ ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ round_const, shift, shuffle_src);
+ else if (alpha == 0 && beta != 0)
+ warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
+ alpha, beta, p_height, height, i,
+ round_const, shift, shuffle_src);
+ else if (alpha != 0 && beta == 0)
+ warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
+ alpha, beta, p_height, height, i,
+ round_const, shift, shuffle_src);
+ else
+ warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i, round_const, shift,
+ shuffle_src);
+}
+
+void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ __m256i horz_out[8];
+ int i, j, k;
+ const int bd = 8;
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const __m256i reduce_bits_vert_const =
+ _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ const __m256i round_const = _mm256_set1_epi16(
+ (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
+ const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);
+
+ __m256i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const,
+ &wt);
+
+ __m256i res_add_const_1;
+ if (conv_params->is_compound == 1) {
+ res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const);
+ } else {
+ res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+ }
+ const int32_t const1 = alpha * (-4) + beta * (-4) +
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ const int32_t const2 = gamma * (-4) + delta * (-4) +
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1));
+ const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));
+
+ __m256i shuffle_src[4];
+ shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
+ shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
+ shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
+ shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += const1;
+ sy4 += const2;
+
+ sx4 &= ~const3;
+ sy4 &= ~const3;
+
+ // Horizontal filter
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+
+ if (ix4 <= -7) {
+ int iy, row = 0;
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i temp_0 =
+ _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i temp_1 =
+ _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
+ horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
+ } else if (ix4 >= width + 6) {
+ int iy, row = 0;
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i temp_0 = _mm256_set1_epi16(
+ const4 + ref[iy * stride + (width - 1)] * const5);
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i temp_1 = _mm256_set1_epi16(
+ const4 + ref[iy * stride + (width - 1)] * const5);
+ horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ horz_out[row] =
+ _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5);
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+ int iy, sx, row = 0;
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ __m128i src0 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ __m128i src1 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src0 = _mm_shuffle_epi8(src0, shuffle_reg_left);
+ src1 = _mm_shuffle_epi8(src1, shuffle_reg_left);
+ }
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src0 = _mm_shuffle_epi8(src0, shuffle_reg_right);
+ src1 = _mm_shuffle_epi8(src1, shuffle_reg_right);
+ }
+ sx = sx4 + beta * (k + 4);
+ const __m256i src_01 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
+ horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row,
+ shuffle_src, &round_const, &shift);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_left);
+ }
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right =
+ _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_right);
+ }
+ sx = sx4 + beta * (k + 4);
+ const __m256i src_01 = _mm256_castsi128_si256(src);
+ __m256i coeff[4];
+ prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src,
+ &round_const, &shift, row);
+ } else {
+ prepare_warp_horizontal_filter_avx2(
+ ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
+ i, &round_const, &shift, shuffle_src);
+ }
+
+ // Vertical filter
+ prepare_warp_vertical_filter_avx2(
+ pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
+ p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits,
+ &res_sub_const, &round_bits_const, &wt);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c
new file mode 100644
index 0000000000..4c05555ff7
--- /dev/null
+++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c
@@ -0,0 +1,908 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
+ * Each coefficient is stored in 8 bits instead of 16 bits
+ * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
+
+ This is done in order to avoid overflow: Since the tap with the largest
+ coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+ order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+ convolve functions.
+
+ Instead, we use the summation order
+ ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+ The rearrangement of coefficients in this table is so that we can get the
+ coefficients into the correct order more quickly.
+*/
+/* clang-format off */
+DECLARE_ALIGNED(8, const int8_t,
+ av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
+ { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
+ { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
+ { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
+ { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
+ { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
+ { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
+ { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
+ { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
+ { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
+ { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
+ { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
+ { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
+ { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
+ { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
+ { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
+ { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
+ { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
+ {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
+ {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
+ {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
+ {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
+ {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
+ {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
+ {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
+ {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
+ {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
+ {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
+ {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
+ {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
+ {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
+ {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
+ { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
+ { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
+ { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
+ { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
+ { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
+ { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
+ { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
+ { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
+ { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
+ { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
+ { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
+ { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
+ { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
+ { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
+ { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
+ { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
+ { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
+ // dummy (replicate row index 191)
+ { 0, 0, 2, -1, 0, 0, 127, 0},
+};
+/* clang-format on */
+
+// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
+// in an SSE register into two sequences:
+// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
+// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
+DECLARE_ALIGNED(16, static const uint8_t,
+ even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 0 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ odd_mask[16]) = { 1, 3, 3, 5, 5, 7, 7, 9,
+ 9, 11, 11, 13, 13, 15, 15, 0 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
+ 2, 3, 2, 3, 2, 3, 2, 3 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
+ 4, 5, 4, 5, 4, 5, 4, 5 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
+ 6, 7, 6, 7, 6, 7, 6, 7 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
+ 4, 5, 6, 7, 4, 5, 6, 7 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
+ 8, 9, 10, 11, 8, 9, 10, 11 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
+ 12, 13, 14, 15, 12, 13, 14, 15 };
+
+static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz, int k) {
+ const __m128i src_even =
+ _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
+ const __m128i src_odd =
+ _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
+ // The pixel order we need for 'src' is:
+ // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
+ const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
+ const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
+ // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
+ const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
+ _mm_srli_si128(src_odd, 4));
+ const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
+ // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
+ const __m128i src_13 =
+ _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
+ const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
+ // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
+ const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
+ _mm_srli_si128(src_even, 6));
+ const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);
+
+ const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
+ ((1 << reduce_bits_horiz) >> 1));
+
+ // Note: The values res_02 + res_46 and res_13 + res_57 both
+ // fit into int16s at this point, but their sum may be too wide to fit
+ // into an int16. However, once we also add round_const, the sum of
+ // all of these fits into a uint16.
+ //
+ // The wrapping behaviour of _mm_add_* is used here to make sure we
+ // get the correct result despite converting between different
+ // (implicit) types.
+ const __m128i res_even = _mm_add_epi16(res_02, res_46);
+ const __m128i res_odd = _mm_add_epi16(res_13, res_57);
+ const __m128i res =
+ _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
+ tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+}
+
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+ __m128i *coeff) {
+ // Filter even-index pixels
+ const __m128i tmp_0 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_1 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_2 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_3 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_4 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_5 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_6 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_7 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
+ const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
+ const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
+ const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
+ const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+ // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
+ const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
+ // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
+ // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
+ const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
+ // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
+
+ // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+ coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+ coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+ // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+ coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+ coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
+ __m128i *coeff) {
+ // Filter even-index pixels
+ const __m128i tmp_0 =
+ _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
+
+ // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+ coeff[0] =
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
+ // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+ coeff[1] =
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
+ // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+ coeff[2] =
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
+ // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+ coeff[3] =
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
+}
+
+static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
+ int alpha, int k,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+}
+
+static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
+ int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta,
+ int p_height, int height, int i,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+}
+
+static INLINE void warp_horizontal_filter_alpha0(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)alpha;
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void warp_horizontal_filter_beta0(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ int k;
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff(alpha, sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void warp_horizontal_filter_alpha0_beta0(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ (void)alpha;
+ int k;
+
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void unpack_weights_and_set_round_const(
+ ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+ __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
+ *res_sub_const =
+ _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16((int16_t)w0);
+ const __m128i wt1 = _mm_set1_epi16((int16_t)w1);
+ *wt = _mm_unpacklo_epi16(wt0, wt1);
+}
+
+static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
+ __m128i *coeffs) {
+ const __m128i tmp_0 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ // even coeffs
+ coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i tmp_1 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ // odd coeffs
+ coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
+ __m128i *coeffs) {
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+ // even coeffs
+ coeffs[0] =
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
+ coeffs[1] =
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
+ coeffs[2] =
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
+ coeffs[3] =
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));
+
+ // odd coeffs
+ coeffs[4] = coeffs[0];
+ coeffs[5] = coeffs[1];
+ coeffs[6] = coeffs[2];
+ coeffs[7] = coeffs[3];
+}
+
+static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
+ __m128i *res_lo, __m128i *res_hi,
+ int k) {
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ const __m128i *src = tmp + (k + 4);
+ const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+
+ const __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);
+
+ const __m128i res_odd =
+ _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+}
+
+static INLINE void store_vertical_filter_output(
+ __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
+ const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
+ uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
+ const int reduce_bits_vert, int p_stride, int p_width,
+ const int round_bits) {
+ __m128i res_lo_1 = *res_lo;
+ __m128i res_hi_1 = *res_hi;
+
+ if (conv_params->is_compound) {
+ __m128i *const p =
+ (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
+ res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
+ reduce_bits_vert);
+ const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
+ __m128i res_lo_16;
+ if (conv_params->do_average) {
+ __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ const __m128i p_16 = _mm_loadl_epi64(p);
+
+ if (conv_params->use_dist_wtd_comp_avg) {
+ const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
+ const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
+ }
+
+ res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);
+
+ res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
+ round_bits);
+ __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
+ *(int *)dst8 = _mm_cvtsi128_si32(res_8_lo);
+ } else {
+ _mm_storel_epi64(p, temp_lo_16);
+ }
+ if (p_width > 4) {
+ __m128i *const p4 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+ res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
+ reduce_bits_vert);
+ const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
+ __m128i res_hi_16;
+
+ if (conv_params->do_average) {
+ __m128i *const dst8_4 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ const __m128i p4_16 = _mm_loadl_epi64(p4);
+
+ if (conv_params->use_dist_wtd_comp_avg) {
+ const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
+ const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+ res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
+ }
+ res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);
+
+ res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
+ round_bits);
+ __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
+ *(int *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
+
+ } else {
+ _mm_storel_epi64(p4, temp_hi_16);
+ }
+ }
+ } else {
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
+
+ const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ *(int *)p = _mm_cvtsi128_si32(res_8bit);
+ } else {
+ _mm_storel_epi64(p, res_8bit);
+ }
+ }
+}
+
+static INLINE void warp_vertical_filter(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ int k;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs(gamma, sy, coeffs);
+
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void warp_vertical_filter_gamma0(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ int k;
+ (void)gamma;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs_gamma0(sy, coeffs);
+
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void warp_vertical_filter_delta0(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ (void)delta;
+ int k;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void warp_vertical_filter_gamma0_delta0(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ (void)delta;
+ (void)gamma;
+ int k;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void prepare_warp_vertical_filter(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ if (gamma == 0 && delta == 0)
+ warp_vertical_filter_gamma0_delta0(
+ pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
+ sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
+ else if (gamma == 0 && delta != 0)
+ warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
+ p_stride, p_width, i, j, sy4, reduce_bits_vert,
+ res_add_const, round_bits, offset_bits);
+ else if (gamma != 0 && delta == 0)
+ warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
+ p_stride, p_width, i, j, sy4, reduce_bits_vert,
+ res_add_const, round_bits, offset_bits);
+ else
+ warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
+ p_stride, p_width, i, j, sy4, reduce_bits_vert,
+ res_add_const, round_bits, offset_bits);
+}
+
+static INLINE void prepare_warp_horizontal_filter(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ if (alpha == 0 && beta == 0)
+ warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ else if (alpha == 0 && beta != 0)
+ warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+ else if (alpha != 0 && beta == 0)
+ warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+ else
+ warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+}
+
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ __m128i tmp[15];
+ int i, j, k;
+ const int bd = 8;
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const __m128i reduce_bits_vert_const =
+ _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+ __m128i res_add_const_1;
+ if (conv_params->is_compound == 1) {
+ res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
+ } else {
+ res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+ }
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (ix4 >= width + 6) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] =
+ _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_left);
+ }
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_right);
+ }
+ horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+ } else {
+ prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+
+ // Vertical filter
+ prepare_warp_vertical_filter(
+ pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
+ j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
new file mode 100644
index 0000000000..3de630f203
--- /dev/null
+++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
+// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
+// on the left.
+// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
+// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
+
+// Exploiting the range of wiener filter coefficients,
+// horizontal filtering can be done in 16 bit intermediate precision.
+// The details are as follows :
+// Consider the horizontal wiener filter coefficients of the following form :
+// [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0]
+// Subtracting 2^(FILTER_BITS) from the centre tap we get the following :
+// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0]
+// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3
+// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit
+// precision. Finally, after rounding the above result by round_0, we multiply
+// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the
+// horizontal filter output.
+
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h,
+ const WienerConvolveParams *conv_params) {
+ const int bd = 8;
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w & 7));
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]);
+ int im_h = h + SUBPEL_TAPS - 2;
+ int im_stride = 8;
+ memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE);
+ int i, j;
+ const int center_tap = (SUBPEL_TAPS - 1) / 2;
+ const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+ __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center;
+
+ assert(conv_params->round_0 > 0);
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2);
+
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x);
+ const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs_h[0] =
+ _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u));
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs_h[1] =
+ _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u));
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs_h[2] =
+ _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u));
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs_h[3] =
+ _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu));
+
+ const __m256i round_const_h =
+ _mm256_set1_epi16((1 << (conv_params->round_0 - 1)));
+ const __m256i round_const_horz =
+ _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1)));
+ const __m256i clamp_low = _mm256_setzero_si256();
+ const __m256i clamp_high =
+ _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0);
+
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i zero_128 = _mm_setzero_si128();
+ const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+ const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
+
+ const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff);
+
+ const __m256i round_const_v =
+ _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (j = 0; j < w; j += 8) {
+ for (i = 0; i < im_h; i += 2) {
+ __m256i data = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+ // Load the next line
+ if (i + 1 < im_h)
+ data = _mm256_inserti128_si256(
+ data,
+ _mm_loadu_si128(
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
+ 1);
+
+ __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
+
+ res =
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
+
+ __m256i data_0 = _mm256_shuffle_epi8(data, filt_center);
+
+ // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to
+ // the result
+ data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
+ res = _mm256_add_epi16(res, data_0);
+ res = _mm256_add_epi16(res, round_const_horz);
+ const __m256i res_clamped =
+ _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped);
+ }
+
+ /* Vertical filter */
+ {
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ __m256i s[8];
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3);
+ s[2] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ s[4] = _mm256_unpackhi_epi16(src_0, src_1);
+ s[5] = _mm256_unpackhi_epi16(src_2, src_3);
+ s[6] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ for (i = 0; i < h - 1; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ __m256i res_a = convolve(s, coeffs_v);
+ __m256i res_b = convolve(s + 4, coeffs_v);
+
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+
+ /* rounding code */
+ // 16 bit conversion
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ // 8 bit conversion and saturation to uint8
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+ // Store values into the destination buffer
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
+
+ _mm_storel_epi64(p_0, res_0);
+ _mm_storel_epi64(p_1, res_1);
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ if (h - i) {
+ s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20);
+ s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20);
+ s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20);
+
+ const int16_t *data = &im_block[i * im_stride];
+ const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride));
+ const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
+
+ __m128i s3 = _mm_unpacklo_epi16(s6_, s7_);
+ __m128i s7 = _mm_unpackhi_epi16(s6_, s7_);
+
+ s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1);
+ __m256i convolveres = convolve(s, coeffs_v);
+
+ const __m256i res_round = _mm256_sra_epi32(
+ _mm256_add_epi32(convolveres, round_const_v), round_shift_v);
+
+ /* rounding code */
+ // 16 bit conversion
+ __m128i reslo = _mm256_castsi256_si128(res_round);
+ __m128i reshi = _mm256_extracti128_si256(res_round, 1);
+ const __m128i res_16bit = _mm_packus_epi32(reslo, reshi);
+
+ // 8 bit conversion and saturation to uint8
+ const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit);
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storel_epi64(p_0, res_8b);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
new file mode 100644
index 0000000000..1c039e80c6
--- /dev/null
+++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h,
+ const WienerConvolveParams *conv_params) {
+ const int bd = 8;
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w & 7));
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 2;
+ memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
+ int i, j;
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+ const __m128i zero = _mm_setzero_si128();
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
+
+ /* Horizontal filter */
+ {
+ const __m128i coeffs_x =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (i = 0; i < intermediate_height; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ conv_params->round_0);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ conv_params->round_0);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ res = _mm_min_epi16(
+ _mm_max_epi16(res, zero),
+ _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1));
+ _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const __m128i coeffs_y =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storel_epi64(p, res_8bit);
+ }
+ }
+ }
+}