Diffstat (limited to 'media/libvpx/libvpx/vpx_dsp/x86')
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm | 88
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c | 482
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c | 577
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c | 69
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm | 130
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h | 44
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm | 90
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h | 56
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/convolve.h | 279
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h | 161
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h | 88
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h | 112
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm | 432
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h | 2930
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h | 3130
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c | 399
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h | 1015
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c | 272
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h | 371
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 361
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c | 1495
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c | 355
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c | 349
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c | 782
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c | 765
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c | 160
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c | 47
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c | 213
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c | 210
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c | 534
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c | 930
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm | 453
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h | 404
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h | 112
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c | 1140
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 260
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 155
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c | 462
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm | 326
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c | 522
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm | 416
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm | 1021
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm | 315
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c | 608
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm | 860
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm | 871
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c | 626
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c | 1235
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h | 710
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c | 364
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h | 110
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm | 103
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c | 913
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c | 1779
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h | 154
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c | 141
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c | 258
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c | 291
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c | 116
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h | 127
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c | 232
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h | 51
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c | 184
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c | 83
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm | 278
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c | 208
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm | 332
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm | 219
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm | 1467
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c | 203
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm | 127
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c | 105
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h | 367
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h | 32
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c | 872
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c | 565
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm | 226
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm | 964
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm | 496
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 1161
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 1458
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 1087
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm | 989
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm | 803
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm | 450
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm | 420
86 files changed, 45157 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm
new file mode 100644
index 0000000000..f51718cf99
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm
@@ -0,0 +1,88 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise,
+; int blackclamp, int whiteclamp,
+; int width, int height, int pitch)
+globalsym(vpx_plane_add_noise_sse2)
+sym(vpx_plane_add_noise_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+
+ mov rdx, 0x01010101
+ mov rax, arg(2)
+ mul rdx
+ movq xmm3, rax
+ pshufd xmm3, xmm3, 0 ; xmm3 is 16 copies of char in blackclamp
+
+ mov rdx, 0x01010101
+ mov rax, arg(3)
+ mul rdx
+ movq xmm4, rax
+ pshufd xmm4, xmm4, 0 ; xmm4 is 16 copies of char in whiteclamp
+
+ movdqu xmm5, xmm3 ; both clamp = black clamp + white clamp
+ paddusb xmm5, xmm4
+
+.addnoise_loop:
+ call sym(LIBVPX_RAND) WRT_PLT
+ mov rcx, arg(1) ;noise
+ and rax, 0xff
+ add rcx, rax
+
+ mov rdi, rcx
+ movsxd rcx, dword arg(4) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax, rax
+
+.addnoise_nextset:
+ movdqu xmm1,[rsi+rax] ; get the source
+
+ psubusb xmm1, xmm3 ; subtract black clamp
+ paddusb xmm1, xmm5 ; add both clamp
+ psubusb xmm1, xmm4 ; subtract whiteclamp
+
+ movdqu xmm2,[rdi+rax] ; get the noise for this line
+ paddb xmm1,xmm2 ; add it in
+ movdqu [rsi+rax],xmm1 ; store the result
+
+ add rax,16 ; advance 16 bytes within the row
+
+ cmp rax, rcx
+ jl .addnoise_nextset
+
+ movsxd rax, dword arg(6) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(5), 1 ; Height -= 1
+ jg .addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+rd42:
+ times 8 dw 0x04
+four8s:
+ times 4 dd 8
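
For orientation, here is a minimal scalar sketch of what the routine above vectorizes: each row is clamped into [blackclamp, 255 - whiteclamp] (the psubusb/paddusb/psubusb sequence) and a randomly offset slice of the noise buffer is added with 8-bit wraparound. The helper name and the plain rand() standing in for LIBVPX_RAND are illustrative assumptions, not libvpx API.

```c
#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch only; assumes the noise buffer is at least width + 255
 * bytes so the random per-row offset stays in range. */
static void plane_add_noise_sketch(uint8_t *start, const int8_t *noise,
                                   int blackclamp, int whiteclamp, int width,
                                   int height, int pitch) {
  int i, j;
  for (i = 0; i < height; ++i) {
    const int8_t *row_noise = noise + (rand() & 0xff); /* per-row offset */
    for (j = 0; j < width; ++j) {
      int v = start[j];
      if (v < blackclamp) v = blackclamp;
      if (v > 255 - whiteclamp) v = 255 - whiteclamp;
      start[j] = (uint8_t)(v + row_noise[j]); /* wraps like paddb */
    }
    start += pitch;
  }
}
```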
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
new file mode 100644
index 0000000000..b2e01319d3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+#include "vpx_ports/mem.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ __m256i b0 = _mm256_add_epi32(a0, a1);
+ __m256i b1 = _mm256_sub_epi32(a0, a1);
+ __m256i b2 = _mm256_add_epi32(a2, a3);
+ __m256i b3 = _mm256_sub_epi32(a2, a3);
+ __m256i b4 = _mm256_add_epi32(a4, a5);
+ __m256i b5 = _mm256_sub_epi32(a4, a5);
+ __m256i b6 = _mm256_add_epi32(a6, a7);
+ __m256i b7 = _mm256_sub_epi32(a6, a7);
+
+ a0 = _mm256_add_epi32(b0, b2);
+ a1 = _mm256_add_epi32(b1, b3);
+ a2 = _mm256_sub_epi32(b0, b2);
+ a3 = _mm256_sub_epi32(b1, b3);
+ a4 = _mm256_add_epi32(b4, b6);
+ a5 = _mm256_add_epi32(b5, b7);
+ a6 = _mm256_sub_epi32(b4, b6);
+ a7 = _mm256_sub_epi32(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm256_add_epi32(a0, a4);
+ b7 = _mm256_add_epi32(a1, a5);
+ b3 = _mm256_add_epi32(a2, a6);
+ b4 = _mm256_add_epi32(a3, a7);
+ b2 = _mm256_sub_epi32(a0, a4);
+ b6 = _mm256_sub_epi32(a1, a5);
+ b1 = _mm256_sub_epi32(a2, a6);
+ b5 = _mm256_sub_epi32(a3, a7);
+
+ a0 = _mm256_unpacklo_epi32(b0, b1);
+ a1 = _mm256_unpacklo_epi32(b2, b3);
+ a2 = _mm256_unpackhi_epi32(b0, b1);
+ a3 = _mm256_unpackhi_epi32(b2, b3);
+ a4 = _mm256_unpacklo_epi32(b4, b5);
+ a5 = _mm256_unpacklo_epi32(b6, b7);
+ a6 = _mm256_unpackhi_epi32(b4, b5);
+ a7 = _mm256_unpackhi_epi32(b6, b7);
+
+ b0 = _mm256_unpacklo_epi64(a0, a1);
+ b1 = _mm256_unpacklo_epi64(a4, a5);
+ b2 = _mm256_unpackhi_epi64(a0, a1);
+ b3 = _mm256_unpackhi_epi64(a4, a5);
+ b4 = _mm256_unpacklo_epi64(a2, a3);
+ b5 = _mm256_unpacklo_epi64(a6, a7);
+ b6 = _mm256_unpackhi_epi64(a2, a3);
+ b7 = _mm256_unpackhi_epi64(a6, a7);
+
+ in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
+ in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
+ in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
+ in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
+ in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
+ in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
+ in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
+ in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
+ } else {
+ in[0] = _mm256_add_epi32(a0, a4);
+ in[7] = _mm256_add_epi32(a1, a5);
+ in[3] = _mm256_add_epi32(a2, a6);
+ in[4] = _mm256_add_epi32(a3, a7);
+ in[2] = _mm256_sub_epi32(a0, a4);
+ in[6] = _mm256_sub_epi32(a1, a5);
+ in[1] = _mm256_sub_epi32(a2, a6);
+ in[5] = _mm256_sub_epi32(a3, a7);
+ }
+}
+
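
The column transform above is the standard three-stage, 8-point Hadamard butterfly applied to eight columns at once (one 32-bit lane per column). Below is a scalar sketch of the butterfly for a single column; the permuted output order and the transpose that the iter == 0 branch performs are omitted, and the helper name is illustrative.

```c
#include <stdint.h>
#include <string.h>

/* One unnormalized 8-point Hadamard butterfly (scalar sketch). The real
 * code emits the coefficients in a permuted order to feed the transpose. */
static void hadamard8_1d_sketch(int32_t v[8]) {
  int32_t b[8];
  int i;
  for (i = 0; i < 8; i += 2) { /* stage 1: pairs */
    b[i] = v[i] + v[i + 1];
    b[i + 1] = v[i] - v[i + 1];
  }
  for (i = 0; i < 8; i += 4) { /* stage 2: groups of four */
    v[i] = b[i] + b[i + 2];
    v[i + 1] = b[i + 1] + b[i + 3];
    v[i + 2] = b[i] - b[i + 2];
    v[i + 3] = b[i + 1] - b[i + 3];
  }
  for (i = 0; i < 4; ++i) { /* stage 3: halves */
    b[i] = v[i] + v[i + 4];
    b[i + 4] = v[i] - v[i + 4];
  }
  memcpy(v, b, sizeof(b));
}
```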
+void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ __m128i src16[8];
+ __m256i src32[8];
+
+ src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
+ src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride));
+
+ src32[0] = _mm256_cvtepi16_epi32(src16[0]);
+ src32[1] = _mm256_cvtepi16_epi32(src16[1]);
+ src32[2] = _mm256_cvtepi16_epi32(src16[2]);
+ src32[3] = _mm256_cvtepi16_epi32(src16[3]);
+ src32[4] = _mm256_cvtepi16_epi32(src16[4]);
+ src32[5] = _mm256_cvtepi16_epi32(src16[5]);
+ src32[6] = _mm256_cvtepi16_epi32(src16[6]);
+ src32[7] = _mm256_cvtepi16_epi32(src16[7]);
+
+ highbd_hadamard_col8_avx2(src32, 0);
+ highbd_hadamard_col8_avx2(src32, 1);
+
+ _mm256_storeu_si256((__m256i *)coeff, src32[0]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[1]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[2]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[3]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[4]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[5]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[6]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[7]);
+}
+
+void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int idx;
+ tran_low_t *t_coeff = coeff;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi32(b0, 1);
+ b1 = _mm256_srai_epi32(b1, 1);
+ b2 = _mm256_srai_epi32(b2, 1);
+ b3 = _mm256_srai_epi32(b3, 1);
+
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+
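
The 16x16 transform above reuses four 8x8 transforms stored back to back (64 coefficients apart) and then combines matching coefficients of the four sub-blocks with one more butterfly, shifting right by 1 to keep the dynamic range. A scalar sketch of that combine step, processing one coefficient per iteration instead of eight (the name is illustrative):

```c
#include <stdint.h>

/* Combine coefficient k of four 8x8 Hadamard sub-blocks, laid out 64
 * entries apart, into the 16x16 result (scalar sketch, in place). */
static void hadamard_16x16_combine_sketch(int32_t *coeff /* 256 entries */) {
  int k;
  for (k = 0; k < 64; ++k) {
    const int32_t b0 = (coeff[k] + coeff[k + 64]) >> 1;
    const int32_t b1 = (coeff[k] - coeff[k + 64]) >> 1;
    const int32_t b2 = (coeff[k + 128] + coeff[k + 192]) >> 1;
    const int32_t b3 = (coeff[k + 128] - coeff[k + 192]) >> 1;
    coeff[k] = b0 + b2;
    coeff[k + 64] = b1 + b3;
    coeff[k + 128] = b0 - b2;
    coeff[k + 192] = b1 - b3;
  }
}
```

The 32x32 version that follows has the same shape but combines 16x16 sub-blocks stored 256 coefficients apart and shifts by 2.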
+void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int idx;
+ tran_low_t *t_coeff = coeff;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
+ }
+
+ for (idx = 0; idx < 256; idx += 8) {
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi32(b0, 2);
+ b1 = _mm256_srai_epi32(b1, 2);
+ b2 = _mm256_srai_epi32(b2, 2);
+ b3 = _mm256_srai_epi32(b3, 2);
+
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static void hadamard_col8x2_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ __m256i b0 = _mm256_add_epi16(a0, a1);
+ __m256i b1 = _mm256_sub_epi16(a0, a1);
+ __m256i b2 = _mm256_add_epi16(a2, a3);
+ __m256i b3 = _mm256_sub_epi16(a2, a3);
+ __m256i b4 = _mm256_add_epi16(a4, a5);
+ __m256i b5 = _mm256_sub_epi16(a4, a5);
+ __m256i b6 = _mm256_add_epi16(a6, a7);
+ __m256i b7 = _mm256_sub_epi16(a6, a7);
+
+ a0 = _mm256_add_epi16(b0, b2);
+ a1 = _mm256_add_epi16(b1, b3);
+ a2 = _mm256_sub_epi16(b0, b2);
+ a3 = _mm256_sub_epi16(b1, b3);
+ a4 = _mm256_add_epi16(b4, b6);
+ a5 = _mm256_add_epi16(b5, b7);
+ a6 = _mm256_sub_epi16(b4, b6);
+ a7 = _mm256_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm256_add_epi16(a0, a4);
+ b7 = _mm256_add_epi16(a1, a5);
+ b3 = _mm256_add_epi16(a2, a6);
+ b4 = _mm256_add_epi16(a3, a7);
+ b2 = _mm256_sub_epi16(a0, a4);
+ b6 = _mm256_sub_epi16(a1, a5);
+ b1 = _mm256_sub_epi16(a2, a6);
+ b5 = _mm256_sub_epi16(a3, a7);
+
+ a0 = _mm256_unpacklo_epi16(b0, b1);
+ a1 = _mm256_unpacklo_epi16(b2, b3);
+ a2 = _mm256_unpackhi_epi16(b0, b1);
+ a3 = _mm256_unpackhi_epi16(b2, b3);
+ a4 = _mm256_unpacklo_epi16(b4, b5);
+ a5 = _mm256_unpacklo_epi16(b6, b7);
+ a6 = _mm256_unpackhi_epi16(b4, b5);
+ a7 = _mm256_unpackhi_epi16(b6, b7);
+
+ b0 = _mm256_unpacklo_epi32(a0, a1);
+ b1 = _mm256_unpacklo_epi32(a4, a5);
+ b2 = _mm256_unpackhi_epi32(a0, a1);
+ b3 = _mm256_unpackhi_epi32(a4, a5);
+ b4 = _mm256_unpacklo_epi32(a2, a3);
+ b5 = _mm256_unpacklo_epi32(a6, a7);
+ b6 = _mm256_unpackhi_epi32(a2, a3);
+ b7 = _mm256_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm256_unpacklo_epi64(b0, b1);
+ in[1] = _mm256_unpackhi_epi64(b0, b1);
+ in[2] = _mm256_unpacklo_epi64(b2, b3);
+ in[3] = _mm256_unpackhi_epi64(b2, b3);
+ in[4] = _mm256_unpacklo_epi64(b4, b5);
+ in[5] = _mm256_unpackhi_epi64(b4, b5);
+ in[6] = _mm256_unpacklo_epi64(b6, b7);
+ in[7] = _mm256_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm256_add_epi16(a0, a4);
+ in[7] = _mm256_add_epi16(a1, a5);
+ in[3] = _mm256_add_epi16(a2, a6);
+ in[4] = _mm256_add_epi16(a3, a7);
+ in[2] = _mm256_sub_epi16(a0, a4);
+ in[6] = _mm256_sub_epi16(a1, a5);
+ in[1] = _mm256_sub_epi16(a2, a6);
+ in[5] = _mm256_sub_epi16(a3, a7);
+ }
+}
+
+static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ __m256i src[8];
+ src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
+ src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));
+
+ hadamard_col8x2_avx2(src, 0);
+ hadamard_col8x2_avx2(src, 1);
+
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[0], src[1], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[2], src[3], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[4], src[5], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[6], src[7], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[0], src[1], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[2], src[3], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[4], src[5], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[6], src[7], 0x31));
+}
+
+static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+#else
+ int16_t *t_coeff = coeff;
+#endif
+ int16_t *coeff16 = (int16_t *)coeff;
+ int idx;
+ for (idx = 0; idx < 2; ++idx) {
+ const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
+ hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
+ }
+
+ for (idx = 0; idx < 64; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi16(b0, 1);
+ b1 = _mm256_srai_epi16(b1, 1);
+ b2 = _mm256_srai_epi16(b2, 1);
+ b3 = _mm256_srai_epi16(b3, 1);
+ if (is_final) {
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
+ coeff += 16;
+ } else {
+ _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
+ coeff16 += 16;
+ }
+ t_coeff += 16;
+ }
+}
+
+void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
+}
+
+void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+ int16_t *t_coeff = temp_coeff;
+#else
+ int16_t *t_coeff = coeff;
+#endif
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ hadamard_16x16_avx2(src_ptr, src_stride,
+ (tran_low_t *)(t_coeff + idx * 256), 0);
+ }
+
+ for (idx = 0; idx < 256; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+ __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi16(b0, 2);
+ b1 = _mm256_srai_epi16(b1, 2);
+ b2 = _mm256_srai_epi16(b2, 2);
+ b3 = _mm256_srai_epi16(b3, 2);
+
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);
+
+ coeff += 16;
+ t_coeff += 16;
+ }
+}
+
+int vpx_satd_avx2(const tran_low_t *coeff, int length) {
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i accum = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < length; i += 16) {
+ const __m256i src_line = load_tran_low(coeff);
+ const __m256i abs = _mm256_abs_epi16(src_line);
+ const __m256i sum = _mm256_madd_epi16(abs, one);
+ accum = _mm256_add_epi32(accum, sum);
+ coeff += 16;
+ }
+
+ { // 32 bit horizontal add
+ const __m256i a = _mm256_srli_si256(accum, 8);
+ const __m256i b = _mm256_add_epi32(accum, a);
+ const __m256i c = _mm256_srli_epi64(b, 32);
+ const __m256i d = _mm256_add_epi32(b, c);
+ const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+ _mm256_extractf128_si256(d, 1));
+ return _mm_cvtsi128_si32(accum_128);
+ }
+}
+
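
vpx_satd_avx2() computes the sum of absolute values of the transformed coefficients, accumulating 32-bit partials with madd against a vector of ones and reducing at the end. A scalar reference of the same computation (tran_low_t is configuration dependent, so plain int32_t is assumed here):

```c
#include <stdint.h>
#include <stdlib.h>

/* Scalar reference: sum of |coeff[i]| (SATD of already-transformed data). */
static int satd_sketch(const int32_t *coeff, int length) {
  int i, sum = 0;
  for (i = 0; i < length; ++i) sum += abs(coeff[i]);
  return sum;
}
```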
+#if CONFIG_VP9_HIGHBITDEPTH
+int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
+ __m256i accum = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < length; i += 8, coeff += 8) {
+ const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i abs = _mm256_abs_epi32(src_line);
+ accum = _mm256_add_epi32(accum, abs);
+ }
+
+ { // 32 bit horizontal add
+ const __m256i a = _mm256_srli_si256(accum, 8);
+ const __m256i b = _mm256_add_epi32(accum, a);
+ const __m256i c = _mm256_srli_epi64(b, 32);
+ const __m256i d = _mm256_add_epi32(b, c);
+ const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+ _mm256_extractf128_si256(d, 1));
+ return _mm_cvtsi128_si32(accum_128);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
new file mode 100644
index 0000000000..015c11a1f3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_ports/mem.h"
+
+void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+ u0 = _mm_setzero_si128();
+ // Row 0
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff0 = _mm_max_epi16(diff, negdiff);
+ // Row 1
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+ minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+ // Row 2
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 3
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 4
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 5
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 6
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 7
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+ *max = _mm_extract_epi16(maxabsdiff, 0);
+
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+ *min = _mm_extract_epi16(minabsdiff, 0);
+}
+
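
The unrolled code above computes the minimum and maximum absolute source/destination difference over an 8x8 block. A scalar sketch (helper name illustrative):

```c
#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of the 8x8 min/max absolute difference. */
static void minmax_8x8_sketch(const uint8_t *s, int p, const uint8_t *d,
                              int dp, int *min, int *max) {
  int i, j;
  *min = 255;
  *max = 0;
  for (i = 0; i < 8; ++i, s += p, d += dp) {
    for (j = 0; j < 8; ++j) {
      const int diff = abs(s[j] - d[j]);
      if (diff < *min) *min = diff;
      if (diff > *max) *max = diff;
    }
  }
}
```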
+unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
+ __m128i s0, s1, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+ avg = _mm_extract_epi16(s0, 0);
+ return (avg + 32) >> 6;
+}
+
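
The averaging helpers reduce to a rounded mean of the block: sum the pixels, add half the divisor, and shift. A scalar sketch of the 8x8 case (the 4x4 variant below is the same with (sum + 8) >> 4):

```c
#include <stdint.h>

/* Rounded average of an 8x8 block of 8-bit pixels (scalar sketch). */
static unsigned int avg_8x8_sketch(const uint8_t *s, int p) {
  int i, j, sum = 0;
  for (i = 0; i < 8; ++i, s += p) {
    for (j = 0; j < 8; ++j) sum += s[j];
  }
  return (unsigned int)((sum + 32) >> 6);
}
```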
+unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
+ __m128i s0, s1, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+ avg = _mm_extract_epi16(s0, 0);
+ return (avg + 8) >> 4;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) {
+ __m128i s0, s1;
+ unsigned int avg;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ const __m128i zero = _mm_setzero_si128();
+ s0 = _mm_loadu_si128((const __m128i *)(s));
+ s1 = _mm_loadu_si128((const __m128i *)(s + p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadu_si128((const __m128i *)(s + 2 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadu_si128((const __m128i *)(s + 3 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadu_si128((const __m128i *)(s + 4 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadu_si128((const __m128i *)(s + 5 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadu_si128((const __m128i *)(s + 6 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadu_si128((const __m128i *)(s + 7 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpackhi_epi16(s0, zero);
+ s0 = _mm_unpacklo_epi16(s0, zero);
+ s0 = _mm_add_epi32(s0, s1);
+ s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8));
+ s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4));
+ avg = (unsigned int)_mm_cvtsi128_si32(s0);
+
+ return (avg + 32) >> 6;
+}
+
+unsigned int vpx_highbd_avg_4x4_sse2(const uint8_t *s8, int p) {
+ __m128i s0, s1;
+ unsigned int avg;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ s0 = _mm_loadl_epi64((const __m128i *)(s));
+ s1 = _mm_loadl_epi64((const __m128i *)(s + p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadl_epi64((const __m128i *)(s + 2 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_loadl_epi64((const __m128i *)(s + 3 * p));
+ s0 = _mm_adds_epu16(s0, s1);
+ s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 4));
+ s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 2));
+ avg = _mm_extract_epi16(s0, 0);
+
+ return (avg + 8) >> 4;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static void hadamard_col8_sse2(__m128i *in, int iter) {
+ __m128i a0 = in[0];
+ __m128i a1 = in[1];
+ __m128i a2 = in[2];
+ __m128i a3 = in[3];
+ __m128i a4 = in[4];
+ __m128i a5 = in[5];
+ __m128i a6 = in[6];
+ __m128i a7 = in[7];
+
+ __m128i b0 = _mm_add_epi16(a0, a1);
+ __m128i b1 = _mm_sub_epi16(a0, a1);
+ __m128i b2 = _mm_add_epi16(a2, a3);
+ __m128i b3 = _mm_sub_epi16(a2, a3);
+ __m128i b4 = _mm_add_epi16(a4, a5);
+ __m128i b5 = _mm_sub_epi16(a4, a5);
+ __m128i b6 = _mm_add_epi16(a6, a7);
+ __m128i b7 = _mm_sub_epi16(a6, a7);
+
+ a0 = _mm_add_epi16(b0, b2);
+ a1 = _mm_add_epi16(b1, b3);
+ a2 = _mm_sub_epi16(b0, b2);
+ a3 = _mm_sub_epi16(b1, b3);
+ a4 = _mm_add_epi16(b4, b6);
+ a5 = _mm_add_epi16(b5, b7);
+ a6 = _mm_sub_epi16(b4, b6);
+ a7 = _mm_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm_add_epi16(a0, a4);
+ b7 = _mm_add_epi16(a1, a5);
+ b3 = _mm_add_epi16(a2, a6);
+ b4 = _mm_add_epi16(a3, a7);
+ b2 = _mm_sub_epi16(a0, a4);
+ b6 = _mm_sub_epi16(a1, a5);
+ b1 = _mm_sub_epi16(a2, a6);
+ b5 = _mm_sub_epi16(a3, a7);
+
+ a0 = _mm_unpacklo_epi16(b0, b1);
+ a1 = _mm_unpacklo_epi16(b2, b3);
+ a2 = _mm_unpackhi_epi16(b0, b1);
+ a3 = _mm_unpackhi_epi16(b2, b3);
+ a4 = _mm_unpacklo_epi16(b4, b5);
+ a5 = _mm_unpacklo_epi16(b6, b7);
+ a6 = _mm_unpackhi_epi16(b4, b5);
+ a7 = _mm_unpackhi_epi16(b6, b7);
+
+ b0 = _mm_unpacklo_epi32(a0, a1);
+ b1 = _mm_unpacklo_epi32(a4, a5);
+ b2 = _mm_unpackhi_epi32(a0, a1);
+ b3 = _mm_unpackhi_epi32(a4, a5);
+ b4 = _mm_unpacklo_epi32(a2, a3);
+ b5 = _mm_unpacklo_epi32(a6, a7);
+ b6 = _mm_unpackhi_epi32(a2, a3);
+ b7 = _mm_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm_unpacklo_epi64(b0, b1);
+ in[1] = _mm_unpackhi_epi64(b0, b1);
+ in[2] = _mm_unpacklo_epi64(b2, b3);
+ in[3] = _mm_unpackhi_epi64(b2, b3);
+ in[4] = _mm_unpacklo_epi64(b4, b5);
+ in[5] = _mm_unpackhi_epi64(b4, b5);
+ in[6] = _mm_unpacklo_epi64(b6, b7);
+ in[7] = _mm_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm_add_epi16(a0, a4);
+ in[7] = _mm_add_epi16(a1, a5);
+ in[3] = _mm_add_epi16(a2, a6);
+ in[4] = _mm_add_epi16(a3, a7);
+ in[2] = _mm_sub_epi16(a0, a4);
+ in[6] = _mm_sub_epi16(a1, a5);
+ in[1] = _mm_sub_epi16(a2, a6);
+ in[5] = _mm_sub_epi16(a3, a7);
+ }
+}
+
+static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+ __m128i src[8];
+ src[0] = _mm_load_si128((const __m128i *)src_diff);
+ src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
+
+ hadamard_col8_sse2(src, 0);
+ hadamard_col8_sse2(src, 1);
+
+ if (is_final) {
+ store_tran_low(src[0], coeff);
+ coeff += 8;
+ store_tran_low(src[1], coeff);
+ coeff += 8;
+ store_tran_low(src[2], coeff);
+ coeff += 8;
+ store_tran_low(src[3], coeff);
+ coeff += 8;
+ store_tran_low(src[4], coeff);
+ coeff += 8;
+ store_tran_low(src[5], coeff);
+ coeff += 8;
+ store_tran_low(src[6], coeff);
+ coeff += 8;
+ store_tran_low(src[7], coeff);
+ } else {
+ int16_t *coeff16 = (int16_t *)coeff;
+ _mm_store_si128((__m128i *)coeff16, src[0]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[1]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[2]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[3]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[4]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[5]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[6]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[7]);
+ }
+}
+
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
+}
+
+static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+#else
+ int16_t *t_coeff = coeff;
+#endif
+ int16_t *coeff16 = (int16_t *)coeff;
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
+ 0);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm_srai_epi16(b0, 1);
+ b1 = _mm_srai_epi16(b1, 1);
+ b2 = _mm_srai_epi16(b2, 1);
+ b3 = _mm_srai_epi16(b3, 1);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+
+ if (is_final) {
+ store_tran_low(coeff0, coeff);
+ store_tran_low(coeff1, coeff + 64);
+ store_tran_low(coeff2, coeff + 128);
+ store_tran_low(coeff3, coeff + 192);
+ coeff += 8;
+ } else {
+ _mm_store_si128((__m128i *)coeff16, coeff0);
+ _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
+ _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
+ _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
+ coeff16 += 8;
+ }
+
+ t_coeff += 8;
+ }
+}
+
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
+}
+
+void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+ int16_t *t_coeff = temp_coeff;
+#else
+ int16_t *t_coeff = coeff;
+#endif
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ hadamard_16x16_sse2(src_ptr, src_stride,
+ (tran_low_t *)(t_coeff + idx * 256), 0);
+ }
+
+ for (idx = 0; idx < 256; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm_srai_epi16(b0, 2);
+ b1 = _mm_srai_epi16(b1, 2);
+ b2 = _mm_srai_epi16(b2, 2);
+ b3 = _mm_srai_epi16(b3, 2);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ store_tran_low(coeff0, coeff);
+ store_tran_low(coeff1, coeff + 256);
+
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+ store_tran_low(coeff2, coeff + 512);
+ store_tran_low(coeff3, coeff + 768);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+
+int vpx_satd_sse2(const tran_low_t *coeff, int length) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i accum = zero;
+
+ for (i = 0; i < length; i += 8) {
+ const __m128i src_line = load_tran_low(coeff);
+ const __m128i inv = _mm_sub_epi16(zero, src_line);
+ const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line)
+ const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
+ const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
+ const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
+ accum = _mm_add_epi32(accum, sum);
+ coeff += 8;
+ }
+
+ { // cascading summation of accum
+ __m128i hi = _mm_srli_si128(accum, 8);
+ accum = _mm_add_epi32(accum, hi);
+ hi = _mm_srli_epi64(accum, 32);
+ accum = _mm_add_epi32(accum, hi);
+ }
+
+ return _mm_cvtsi128_si32(accum);
+}
+
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
+ const int ref_stride, const int height) {
+ int idx;
+ __m128i zero = _mm_setzero_si128();
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+ __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+ __m128i t0, t1;
+ int height_1 = height - 1;
+ ref += ref_stride;
+
+ for (idx = 1; idx < height_1; idx += 2) {
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref += ref_stride;
+
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref += ref_stride;
+ }
+
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+
+ if (height == 64) {
+ s0 = _mm_srai_epi16(s0, 5);
+ s1 = _mm_srai_epi16(s1, 5);
+ } else if (height == 32) {
+ s0 = _mm_srai_epi16(s0, 4);
+ s1 = _mm_srai_epi16(s1, 4);
+ } else {
+ s0 = _mm_srai_epi16(s0, 3);
+ s1 = _mm_srai_epi16(s1, 3);
+ }
+
+ _mm_storeu_si128((__m128i *)hbuf, s0);
+ hbuf += 8;
+ _mm_storeu_si128((__m128i *)hbuf, s1);
+}
+
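
vpx_int_pro_row() produces 16 per-column sums of a 16-pixel-wide strip, scaled by a height-dependent shift (5 for height 64, 4 for 32, 3 otherwise) so the results fit comfortably in 16 bits. A scalar sketch that ignores the saturating adds of the SIMD path:

```c
#include <stdint.h>

/* Per-column sums of a 16-wide strip, scaled down as in the SSE2 code. */
static void int_pro_row_sketch(int16_t hbuf[16], const uint8_t *ref,
                               int ref_stride, int height) {
  const int shift = (height == 64) ? 5 : (height == 32) ? 4 : 3;
  int sums[16] = { 0 };
  int i, j;
  for (i = 0; i < height; ++i, ref += ref_stride) {
    for (j = 0; j < 16; ++j) sums[j] += ref[j];
  }
  for (j = 0; j < 16; ++j) hbuf[j] = (int16_t)(sums[j] >> shift);
}
```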
+int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s0 = _mm_sad_epu8(src_line, zero);
+ __m128i s1;
+ int i;
+
+ for (i = 16; i < width; i += 16) {
+ ref += 16;
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ s1 = _mm_sad_epu8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, s1);
+ }
+
+ s1 = _mm_srli_si128(s0, 8);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ return _mm_extract_epi16(s0, 0);
+}
+
+int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl) {
+ int idx;
+ int width = 4 << bwl;
+ int16_t mean;
+ __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
+ __m128i v1 = _mm_load_si128((const __m128i *)src);
+ __m128i diff = _mm_subs_epi16(v0, v1);
+ __m128i sum = diff;
+ __m128i sse = _mm_madd_epi16(diff, diff);
+
+ ref += 8;
+ src += 8;
+
+ for (idx = 8; idx < width; idx += 8) {
+ v0 = _mm_loadu_si128((const __m128i *)ref);
+ v1 = _mm_load_si128((const __m128i *)src);
+ diff = _mm_subs_epi16(v0, v1);
+
+ sum = _mm_add_epi16(sum, diff);
+ v0 = _mm_madd_epi16(diff, diff);
+ sse = _mm_add_epi32(sse, v0);
+
+ ref += 8;
+ src += 8;
+ }
+
+ v0 = _mm_srli_si128(sum, 8);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi64(sum, 32);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi32(sum, 16);
+ sum = _mm_add_epi16(sum, v0);
+
+ v1 = _mm_srli_si128(sse, 8);
+ sse = _mm_add_epi32(sse, v1);
+ v1 = _mm_srli_epi64(sse, 32);
+ sse = _mm_add_epi32(sse, v1);
+
+ mean = (int16_t)_mm_extract_epi16(sum, 0);
+
+ return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
+}
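
vpx_vector_var() returns the sum of squared differences between the two 16-bit vectors minus a squared-mean correction, over width = 4 << bwl elements. A scalar sketch (the SIMD path keeps the running sum in 16 bits, which this ignores):

```c
#include <stdint.h>

/* Scalar sketch: SSE minus the (sum^2 >> (bwl + 2)) correction term. */
static int vector_var_sketch(const int16_t *ref, const int16_t *src, int bwl) {
  const int width = 4 << bwl;
  int i, sum = 0, sse = 0;
  for (i = 0; i < width; ++i) {
    const int diff = ref[i] - src[i];
    sum += diff;
    sse += diff * diff;
  }
  return sse - ((sum * sum) >> (bwl + 2));
}
```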
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c
new file mode 100644
index 0000000000..c6e70f744e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ /* comp_pred and pred must be 16 byte aligned. */
+ assert(((intptr_t)comp_pred & 0xf) == 0);
+ assert(((intptr_t)pred & 0xf) == 0);
+ if (width > 8) {
+ int x, y;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; x += 16) {
+ const __m128i p = _mm_load_si128((const __m128i *)(pred + x));
+ const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x));
+ const __m128i avg = _mm_avg_epu8(p, r);
+ _mm_store_si128((__m128i *)(comp_pred + x), avg);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else { // width must be 4 or 8.
+ int i;
+ // Process 16 elements at a time. comp_pred and pred have width == stride
+    // and therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are
+ // all divisible by 16 so just ref needs to be massaged when loading.
+ for (i = 0; i < width * height; i += 16) {
+ const __m128i p = _mm_load_si128((const __m128i *)pred);
+ __m128i r;
+ __m128i avg;
+ if (width == ref_stride) {
+ r = _mm_loadu_si128((const __m128i *)ref);
+ ref += 16;
+ } else if (width == 4) {
+ r = _mm_set_epi32(loadu_int32(ref + 3 * ref_stride),
+ loadu_int32(ref + 2 * ref_stride),
+ loadu_int32(ref + ref_stride), loadu_int32(ref));
+
+ ref += 4 * ref_stride;
+ } else {
+ const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
+ assert(width == 8);
+ r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0),
+ (const __m64 *)(ref + ref_stride)));
+
+ ref += 2 * ref_stride;
+ }
+ avg = _mm_avg_epu8(p, r);
+ _mm_store_si128((__m128i *)comp_pred, avg);
+
+ pred += 16;
+ comp_pred += 16;
+ }
+ }
+}
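
Both branches of the function above implement the rounded average that _mm_avg_epu8 provides, (a + b + 1) >> 1; they differ only in how ref rows are gathered into 16-byte registers. A scalar sketch of the operation (helper name illustrative):

```c
#include <stdint.h>

/* Scalar sketch: comp_pred = rounded average of pred and ref. pred and
 * comp_pred use width as their stride, matching the SSE2 code. */
static void comp_avg_pred_sketch(uint8_t *comp_pred, const uint8_t *pred,
                                 int width, int height, const uint8_t *ref,
                                 int ref_stride) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      comp_pred[x] = (uint8_t)((pred[x] + ref[x] + 1) >> 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
```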
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
new file mode 100644
index 0000000000..9122b5a401
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -0,0 +1,130 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
+
+SECTION .text
+
+%if VPX_ARCH_X86_64
+; matrix transpose
+%macro TRANSPOSE8X8 10
+ ; stage 1
+ punpcklwd m%9, m%1, m%2
+ punpcklwd m%10, m%3, m%4
+ punpckhwd m%1, m%2
+ punpckhwd m%3, m%4
+
+ punpcklwd m%2, m%5, m%6
+ punpcklwd m%4, m%7, m%8
+ punpckhwd m%5, m%6
+ punpckhwd m%7, m%8
+
+ ; stage 2
+ punpckldq m%6, m%9, m%10
+ punpckldq m%8, m%1, m%3
+ punpckhdq m%9, m%10
+ punpckhdq m%1, m%3
+
+ punpckldq m%10, m%2, m%4
+ punpckldq m%3, m%5, m%7
+ punpckhdq m%2, m%4
+ punpckhdq m%5, m%7
+
+ ; stage 3
+ punpckhqdq m%4, m%9, m%2 ; out3
+ punpcklqdq m%9, m%2 ; out2
+ punpcklqdq m%7, m%1, m%5 ; out6
+ punpckhqdq m%1, m%5 ; out7
+
+ punpckhqdq m%2, m%6, m%10 ; out1
+ punpcklqdq m%6, m%10 ; out0
+ punpcklqdq m%5, m%8, m%3 ; out4
+ punpckhqdq m%8, m%3 ; out5
+
+ SWAP %6, %1
+ SWAP %3, %9
+ SWAP %8, %6
+%endmacro
+
+%macro HMD8_1D 0
+ psubw m8, m0, m1
+ psubw m9, m2, m3
+ paddw m0, m1
+ paddw m2, m3
+ SWAP 1, 8
+ SWAP 3, 9
+ psubw m8, m4, m5
+ psubw m9, m6, m7
+ paddw m4, m5
+ paddw m6, m7
+ SWAP 5, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m2
+ psubw m9, m1, m3
+ paddw m0, m2
+ paddw m1, m3
+ SWAP 2, 8
+ SWAP 3, 9
+ psubw m8, m4, m6
+ psubw m9, m5, m7
+ paddw m4, m6
+ paddw m5, m7
+ SWAP 6, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m4
+ psubw m9, m1, m5
+ paddw m0, m4
+ paddw m1, m5
+ SWAP 4, 8
+ SWAP 5, 9
+ psubw m8, m2, m6
+ psubw m9, m3, m7
+ paddw m2, m6
+ paddw m3, m7
+ SWAP 6, 8
+ SWAP 7, 9
+%endmacro
+
+
+INIT_XMM ssse3
+cglobal hadamard_8x8, 3, 5, 11, input, stride, output
+ lea r3, [2 * strideq]
+ lea r4, [4 * strideq]
+
+ mova m0, [inputq]
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ HMD8_1D
+ TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
+ HMD8_1D
+
+ STORE_TRAN_LOW 0, outputq, 0, 8, 9
+ STORE_TRAN_LOW 1, outputq, 8, 8, 9
+ STORE_TRAN_LOW 2, outputq, 16, 8, 9
+ STORE_TRAN_LOW 3, outputq, 24, 8, 9
+ STORE_TRAN_LOW 4, outputq, 32, 8, 9
+ STORE_TRAN_LOW 5, outputq, 40, 8, 9
+ STORE_TRAN_LOW 6, outputq, 48, 8, 9
+ STORE_TRAN_LOW 7, outputq, 56, 8, 9
+
+ RET
+%endif
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
new file mode 100644
index 0000000000..c02b47a3eb
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
+#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
+
+#include <immintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Load 16 16-bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE __m256i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i a_low = _mm256_loadu_si256((const __m256i *)a);
+ const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8));
+ return _mm256_packs_epi32(a_low, a_high);
+#else
+ return _mm256_loadu_si256((const __m256i *)a);
+#endif
+}
+
+static INLINE void store_tran_low(__m256i a, tran_low_t *b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i a_hi = _mm256_mulhi_epi16(a, one);
+ const __m256i a_lo = _mm256_mullo_epi16(a, one);
+ const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi);
+ const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi);
+ _mm256_storeu_si256((__m256i *)b, a_1);
+ _mm256_storeu_si256((__m256i *)(b + 8), a_2);
+#else
+ _mm256_storeu_si256((__m256i *)b, a);
+#endif
+}
+#endif // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
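
The store path above widens 16-bit values to 32 bits without a dedicated sign-extend instruction: multiplying by one gives the low half (mullo) and the sign-extending high half (mulhi), and interleaving the two halves reconstructs each value as a sign-extended 32-bit integer. A scalar sketch of that identity (function name illustrative; an arithmetic right shift on negative values is assumed):

```c
#include <stdint.h>

/* widen_by_mul_sketch(v) == (int32_t)v for any int16_t v, mirroring the
 * mullo/mulhi/unpack sequence in store_tran_low(). */
static int32_t widen_by_mul_sketch(int16_t v) {
  const uint16_t lo = (uint16_t)(v * 1);                  /* mullo_epi16 */
  const uint16_t hi = (uint16_t)(((int32_t)v * 1) >> 16); /* mulhi_epi16 */
  return (int32_t)((uint32_t)lo | ((uint32_t)hi << 16));  /* unpack lo/hi */
}
```

The load path reverses this with a saturating pack, so values already in the 16-bit range round-trip unchanged.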
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm
new file mode 100644
index 0000000000..aacf71f7ac
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm
@@ -0,0 +1,90 @@
+;
+; Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm.
+; vpx_config.asm is not guarded so can not be included twice. Because this will
+; be used in conjunction with x86_abi_support.asm or x86inc.asm, it must be
+; included after those files.
+
+; Increment register by sizeof() tran_low_t * 8.
+%macro INCREMENT_TRAN_LOW 1
+%if CONFIG_VP9_HIGHBITDEPTH
+ add %1, 32
+%else
+ add %1, 16
+%endif
+%endmacro
+
+; Increment %1 by sizeof() tran_low_t * %2.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+%if CONFIG_VP9_HIGHBITDEPTH
+ lea %1, [%1 + %2 * 4]
+%else
+ lea %1, [%1 + %2 * 2]
+%endif
+%endmacro
+
+; Load %2 + %3 into m%1.
+; %3 is the offset in elements, not bytes.
+; If tran_low_t is 16 bits (low bit depth configuration) then load the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
+; the values down to 16 bits.
+%macro LOAD_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova m%1, [%2 + (%3) * 4]
+ packssdw m%1, [%2 + (%3) * 4 + 16]
+%else
+ mova m%1, [%2 + (%3) * 2]
+%endif
+%endmacro
+
+; Store m%1 to %2 + %3.
+; %3 is the offset in elements, not bytes.
+; If 5 arguments are provided then m%1 is corrupted.
+; If 6 arguments are provided then m%1 is preserved.
+; If tran_low_t is 16 bits (low bit depth configuration) then store the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
+; extend the values first.
+; Uses m%4-m%6 as scratch registers for high bit depth.
+%macro STORE_TRAN_LOW 5-6
+%if CONFIG_VP9_HIGHBITDEPTH
+ pxor m%4, m%4
+ mova m%5, m%1
+ %if %0 == 6
+ mova m%6, m%1
+ %endif
+ pcmpgtw m%4, m%1
+ punpcklwd m%5, m%4
+ %if %0 == 5
+ punpckhwd m%1, m%4
+ %else
+ punpckhwd m%6, m%4
+ %endif
+ mova [%2 + (%3) * 4 + 0], m%5
+ %if %0 == 5
+ mova [%2 + (%3) * 4 + 16], m%1
+ %else
+ mova [%2 + (%3) * 4 + 16], m%6
+ %endif
+%else
+ mova [%2 + (%3) * 2], m%1
+%endif
+%endmacro
+
+; Store zeros (in m%1) to %2 + %3.
+; %3 is the offset in elements, not bytes.
+%macro STORE_ZERO_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova [%2 + (%3) * 4 + 0], m%1
+ mova [%2 + (%3) * 4 + 16], m%1
+%else
+ mova [%2 + (%3) * 2], m%1
+%endif
+%endmacro
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h
new file mode 100644
index 0000000000..74dde656b1
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
+#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
+
+#include <xmmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Load 8 16-bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i a_low = _mm_load_si128((const __m128i *)a);
+ return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
+#else
+ return _mm_load_si128((const __m128i *)a);
+#endif
+}
+
+// Store 8 16-bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a_hi = _mm_mulhi_epi16(a, one);
+ const __m128i a_lo = _mm_mullo_epi16(a, one);
+ const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
+ const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
+ _mm_store_si128((__m128i *)(b), a_1);
+ _mm_store_si128((__m128i *)(b + 4), a_2);
+#else
+ _mm_store_si128((__m128i *)(b), a);
+#endif
+}
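+
+// The multiply-by-one above is an SSE2 sign-extension idiom: a_hi from
+// _mm_mulhi_epi16(a, 1) holds the upper 16 bits of each sign-extended value
+// (0x0000 or 0xffff), a_lo is the value itself, and interleaving the two
+// rebuilds every coefficient as a sign-extended 32-bit integer.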
+
+// Zero fill 8 positions in the output buffer.
+static INLINE void store_zero_tran_low(tran_low_t *a) {
+ const __m128i zero = _mm_setzero_si128();
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm_store_si128((__m128i *)(a), zero);
+ _mm_store_si128((__m128i *)(a + 4), zero);
+#else
+ _mm_store_si128((__m128i *)(a), zero);
+#endif
+}
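+
+// Usage sketch (illustrative only; copy_8_coeffs is a hypothetical helper, not
+// part of libvpx): round-trips eight coefficients through the routines above
+// for either size of tran_low_t. Both pointers must be 16-byte aligned.
+static INLINE void copy_8_coeffs(const tran_low_t *src, tran_low_t *dst) {
+  const __m128i v = load_tran_low(src);  // narrowed to 16 bits if necessary
+  store_tran_low(v, dst);                // widened back on the 32-bit path
+}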
+#endif // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve.h
new file mode 100644
index 0000000000..c339600556
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve.h
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/compiler_attributes.h"
+
+// TODO(chiyotsai@google.com): Refactor the code here. Currently this is pretty
+// hacky and awful to read. Note that there is a filter_x[3] == 128 check in
+// HIGH_FUN_CONV_2D to avoid a segfault, because the C function assumes the
+// filter is always 8 taps.
+typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height, const int16_t *filter);
+
+// TODO(chiyotsai@google.com): Remove the is_avg argument to the macros below
+// once we have a 4-tap vertical avg filter.
+#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \
+ void vpx_convolve8_##name##_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
+ const int16_t *filter_row = filter[offset]; \
+ (void)x0_q4; \
+ (void)x_step_q4; \
+ (void)y0_q4; \
+ (void)y_step_q4; \
+ assert(filter_row[3] != 128); \
+ assert(step_q4 == 16); \
+ if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+ const int num_taps = 8; \
+ while (w >= 16) { \
+ vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ if (w == 8) { \
+ vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ } else if (w == 4) { \
+ vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ } \
+ (void)num_taps; \
+ } else if (filter_row[2] | filter_row[5]) { \
+ const int num_taps = is_avg ? 8 : 4; \
+ while (w >= 16) { \
+ vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ if (w == 8) { \
+ vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ } else if (w == 4) { \
+ vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ } \
+ (void)num_taps; \
+ } else { \
+ const int num_taps = 2; \
+ while (w >= 16) { \
+ vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ if (w == 8) { \
+ vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ } else if (w == 4) { \
+ vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter_row); \
+ } \
+ (void)num_taps; \
+ } \
+ }
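+
+// For example, an instantiation along the lines of
+//   FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
+// (illustrative, not a quote of a particular call site) defines
+// vpx_convolve8_horiz_sse2(), which dispatches to the 8-, 4- or 2-tap
+// vpx_filter_block1d{16,8,4}_h*_sse2 kernels depending on which filter taps
+// are non-zero.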
+
+#define FUN_CONV_2D(avg, opt, is_avg) \
+ void vpx_convolve8_##avg##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
+ const int16_t *filter_x = filter[x0_q4]; \
+ const int16_t *filter_y = filter[y0_q4]; \
+ (void)filter_y; \
+ assert(filter_x[3] != 128); \
+ assert(filter_y[3] != 128); \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ assert(x_step_q4 == 16); \
+ assert(y_step_q4 == 16); \
+ if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \
+ vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
+ h + 7); \
+ vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+ filter, x0_q4, x_step_q4, y0_q4, \
+ y_step_q4, w, h); \
+ } else if (filter_x[2] | filter_x[5]) { \
+ const int num_taps = is_avg ? 8 : 4; \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \
+ vpx_convolve8_horiz_##opt( \
+ src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \
+ vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \
+ dst, dst_stride, filter, x0_q4, \
+ x_step_q4, y0_q4, y_step_q4, w, h); \
+ } else { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65] VPX_UNINITIALIZED); \
+ vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \
+ x_step_q4, y0_q4, y_step_q4, w, h + 1); \
+ vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
+ h); \
+ } \
+ }
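+
+// FUN_CONV_2D (and its high bit depth counterpart below) filters horizontally
+// into a 64-pixel-wide temporary and then filters that buffer vertically, so
+// fdata2 needs w x (h + num_taps - 1) samples at a stride of 64: 64 * 71
+// covers the 8-tap case (h <= 64 plus 7 extra rows) and 64 * 65 the bilinear
+// case.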
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
+ const ptrdiff_t src_pitch,
+ uint16_t *output_ptr,
+ ptrdiff_t out_pitch,
+ unsigned int output_height,
+ const int16_t *filter, int bd);
+
+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, \
+ is_avg) \
+ void vpx_highbd_convolve8_##name##_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
+ const int16_t *filter_row = filter_kernel[offset]; \
+ if (step_q4 == 16 && filter_row[3] != 128) { \
+ if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+ const int num_taps = 8; \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ (void)num_taps; \
+ } else if (filter_row[2] | filter_row[5]) { \
+ const int num_taps = is_avg ? 8 : 4; \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ (void)num_taps; \
+ } else { \
+ const int num_taps = 2; \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ (void)num_taps; \
+ } \
+ } \
+ if (w) { \
+ vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+ filter_kernel, x0_q4, x_step_q4, y0_q4, \
+ y_step_q4, w, h, bd); \
+ } \
+ }
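+
+// In HIGH_FUN_CONV_1D, any leftover width (w not a multiple of 4), as well as
+// any call that fails the step_q4 == 16 / filter_row[3] != 128 check, falls
+// back to the corresponding C function through the trailing if (w) call.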
+
+#define HIGH_FUN_CONV_2D(avg, opt, is_avg) \
+ void vpx_highbd_convolve8_##avg##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
+ const int16_t *filter_x = filter[x0_q4]; \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \
+ filter_x[3] == 128) { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \
+ vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+ fdata2, 64, filter, x0_q4, x_step_q4, \
+ y0_q4, y_step_q4, w, h + 7, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt( \
+ fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \
+ y0_q4, y_step_q4, w, h, bd); \
+ } else if (filter_x[2] | filter_x[5]) { \
+ const int num_taps = is_avg ? 8 : 4; \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \
+ vpx_highbd_convolve8_horiz_##opt( \
+ src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \
+ bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt( \
+ fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \
+ } else { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65] VPX_UNINITIALIZED); \
+ vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, \
+ w, h + 1, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+ filter, x0_q4, x_step_q4, \
+ y0_q4, y_step_q4, w, h, bd); \
+ } \
+ } else { \
+ vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \
+ bd); \
+ } \
+ }
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_X86_CONVOLVE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h
new file mode 100644
index 0000000000..ebee964b18
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
+
+#include <immintrin.h> // AVX2
+
+#include "./vpx_config.h"
+
+#if defined(__clang__)
+#if (__clang_major__ > 0 && __clang_major__ < 3) || \
+ (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+ (defined(__APPLE__) && defined(__apple_build_version__) && \
+ ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
+ (__clang_major__ == 5 && __clang_minor__ == 0)))
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#else // clang > 3.3, and not 5.0 on macosx.
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // clang <= 3.3
+#elif defined(__GNUC__)
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
+#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+#else // gcc > 4.7
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // gcc <= 4.6
+#else // !(gcc || clang)
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // __clang__
+
+static INLINE void shuffle_filter_avx2(const int16_t *const filter,
+ __m256i *const f) {
+ const __m256i f_values =
+ MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter));
+ // pack and duplicate the filter values
+ f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u));
+ f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u));
+ f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u));
+ f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu));
+}
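+
+// A worked example of the packing above: _mm256_set1_epi16(0x0200u) replicates
+// source bytes 0 and 2, so f[0] holds the signed 8-bit tap pair
+// (filter[0], filter[1]) in every 16-bit lane, f[1] holds
+// (filter[2], filter[3]), and so on. Each f[i] is then fed to
+// _mm256_maddubs_epi16 against interleaved pixel pairs in the convolve8
+// helpers below.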
+
+static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
+ const __m256i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m256i k_64 = _mm256_set1_epi16(1 << 6);
+ const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]);
+ const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
+ const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
+ const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
+ __m256i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that avoids
+  // out-of-range intermediate sums for all filters
+ sum1 = _mm256_add_epi16(x0, x2);
+ sum2 = _mm256_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm256_add_epi16(sum1, k_64);
+ sum1 = _mm256_adds_epi16(sum1, sum2);
+ // round and shift by 7 bit each 16 bit
+ sum1 = _mm256_srai_epi16(sum1, 7);
+ return sum1;
+}
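+
+// Both convolve8_16_avx2() above and convolve8_8_avx2() below compute
+// (p0*f0 + ... + p7*f7 + 64) >> 7, i.e. the Q7 filter sum rounded to nearest:
+// 64 = 1 << 6 is added with a plain add because those partial sums cannot yet
+// overflow, leaving a single saturating add for the final step.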
+
+static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
+ const __m256i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]),
+ _mm256_castsi256_si128(f[0]));
+ const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]),
+ _mm256_castsi256_si128(f[1]));
+ const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]),
+ _mm256_castsi256_si128(f[2]));
+ const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
+ _mm256_castsi256_si128(f[3]));
+ __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that avoids
+  // out-of-range intermediate sums for all filters
+ sum1 = _mm_add_epi16(x0, x2);
+ sum2 = _mm_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm_add_epi16(sum1, k_64);
+ sum1 = _mm_adds_epi16(sum1, sum2);
+ // shift by 7 bit each 16 bit
+ sum1 = _mm_srai_epi16(sum1, 7);
+ return sum1;
+}
+
+static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) {
+ const __m256i tmp =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo));
+ return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1);
+}
+
+static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) {
+ const __m256i tmp =
+ _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo));
+ return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1);
+}
+
+static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1,
+ __m128i *const dst_ptr_2,
+ const __m256i *const src) {
+ _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src));
+ _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
+}
+
+static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1,
+ __m128i *const dst_ptr_2,
+ const __m256i *const src) {
+ _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src));
+ _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
+}
+
+static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1,
+ __m128i *const dst_ptr_2,
+ const __m256i *const src) {
+ *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
+ *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
+}
+
+static INLINE __m256i mm256_round_epi32(const __m256i *const src,
+ const __m256i *const half_depth,
+ const int depth) {
+ const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth);
+ return _mm256_srai_epi32(nearest_src, depth);
+}
+
+static INLINE __m256i mm256_round_epi16(const __m256i *const src,
+ const __m256i *const half_depth,
+ const int depth) {
+ const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth);
+ return _mm256_srai_epi16(nearest_src, depth);
+}
+
+static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0,
+ const __m256i *const src_1,
+ const __m256i *const ker_0,
+ const __m256i *const ker_1) {
+ const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0);
+ const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1);
+ return _mm256_add_epi32(tmp_0, tmp_1);
+}
+
+#undef MM256_BROADCASTSI128_SI256
+
+#endif // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h
new file mode 100644
index 0000000000..8443546394
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+
+// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then
+// replicates the values at indices 2 and 3 to return 3 2 3 2 3 2 3 2 as
+// 16-bit words.
+static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) {
+ __m128i tmp = _mm_unpacklo_epi32(*reg, *reg);
+ return _mm_unpackhi_epi64(tmp, tmp);
+}
+
+// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then
+// replicates the values at indices 4 and 5 to return 5 4 5 4 5 4 5 4 as
+// 16-bit words.
+static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) {
+ __m128i tmp = _mm_unpackhi_epi32(*reg, *reg);
+ return _mm_unpacklo_epi64(tmp, tmp);
+}
+
+// Interprets the low eight bytes of each src as 8-bit values, zero-extends
+// them to 16-bit words, then multiplies with ker and adds adjacent products to
+// form 32-bit words. Finally adds the results for src_1 and src_2 together.
+static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1,
+ const __m128i *const src_2,
+ const __m128i *const ker_1,
+ const __m128i *const ker_2) {
+ const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128());
+ const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128());
+ const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1);
+ const __m128i madd_2 = _mm_madd_epi16(src_2_half, *ker_2);
+ return _mm_add_epi32(madd_1, madd_2);
+}
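+
+// In other words, output lane i of the helper above is
+//   src_1[2i]*ker_1[2i] + src_1[2i+1]*ker_1[2i+1] +
+//   src_2[2i]*ker_2[2i] + src_2[2i+1]*ker_2[2i+1]
+// with the src values taken from the low eight bytes of each register.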
+
+// Interprets src as 16-bit words, then multiplies with ker and adds adjacent
+// products to form 32-bit words. Finally adds the results for src_1 and src_2
+// together.
+static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1,
+ const __m128i *const src_2,
+ const __m128i *const ker_1,
+ const __m128i *const ker_2) {
+ const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1);
+ const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2);
+ return _mm_add_epi32(madd_1, madd_2);
+}
+
+static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0,
+ const __m128i *const src_1,
+ const __m128i *const ker) {
+ const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker);
+ const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker);
+ return _mm_packs_epi32(madd_1, madd_2);
+}
+
+// Interleaves the 32-bit elements of src_1 and src_2 and packs them back to
+// 16-bit words with signed saturation.
+static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1,
+ const __m128i *const src_2) {
+ const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2);
+ const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2);
+ return _mm_packs_epi32(tmp_1, tmp_2);
+}
+
+static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src,
+ const __m128i *const half_depth,
+ const int depth) {
+ const __m128i nearest_src = _mm_add_epi32(*src, *half_depth);
+ return _mm_srai_epi32(nearest_src, depth);
+}
+
+static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src,
+ const __m128i *const half_depth,
+ const int depth) {
+ const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth);
+ return _mm_srai_epi16(nearest_src, depth);
+}
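+
+// Both rounding helpers above return (*src + *half_depth) >> depth using an
+// arithmetic shift, i.e. they round src / 2^depth to nearest when half_depth
+// is 1 << (depth - 1).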
+
+#endif // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h
new file mode 100644
index 0000000000..8a4b165133
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
+
+#include <assert.h>
+#include <tmmintrin.h> // SSSE3
+
+#include "./vpx_config.h"
+
+static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
+ __m128i *const f) {
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+ f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+ f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+ f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+}
+
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+ __m128i *const f) {
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+ // half of f[0] and f[4].
+ assert(filter[3] >= 0 && filter[3] < 256);
+ f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
+ f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
+ f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
+ f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
+ f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
+}
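+
+// Concretely: because 0 <= filter[3] < 256, source byte 7 (the high byte of
+// filter[3]) is guaranteed to be zero, so f[0] becomes the byte pair
+// (0, filter[0]) and f[4] the pair (filter[7], 0). The zeroed halves let the
+// first and last taps contribute on their own in
+// convolve8_8_odd_offset_ssse3() below instead of being paired with an
+// unrelated sample.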
+
+static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that avoids
+  // out-of-range intermediate sums for all filters
+ sum1 = _mm_add_epi16(x0, x2);
+ sum2 = _mm_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm_add_epi16(sum1, k_64);
+ sum1 = _mm_adds_epi16(sum1, sum2);
+ // shift by 7 bit each 16 bit
+ sum1 = _mm_srai_epi16(sum1, 7);
+ return sum1;
+}
+
+static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  // compensate for the 64 subtracted from f[1]. x4 is always non-negative.
+ const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
+ // add and saturate the results together
+ __m128i temp = _mm_adds_epi16(x0, x3);
+ temp = _mm_adds_epi16(temp, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x4);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
+static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+  // compensate for the 64 subtracted from f[2]. x5 is always non-negative.
+ const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
+ __m128i temp;
+
+ // add and saturate the results together
+ temp = _mm_adds_epi16(x0, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x3);
+ temp = _mm_adds_epi16(temp, x4);
+ temp = _mm_adds_epi16(temp, x5);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
+#endif // VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm
new file mode 100644
index 0000000000..b3af677d2e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm
@@ -0,0 +1,432 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;macros used in the deblock functions
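+; Both row macros below compute absolute differences with unsigned saturating
+; arithmetic: psubusb gives max(a - b, 0), so paddusb of the two one-sided
+; differences yields |a - b| without any dedicated absolute-value instruction.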
+%macro FIRST_2_ROWS 0
+ movdqa xmm4, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm5, xmm1
+ pavgb xmm5, xmm3
+
+ ;calculate absolute value
+ psubusb xmm4, xmm1
+ psubusb xmm1, xmm0
+ psubusb xmm6, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm4, xmm1
+ paddusb xmm6, xmm3
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm7, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm4
+ psubusb xmm7, xmm6
+ pcmpeqb xmm2, xmm1
+ pcmpeqb xmm7, xmm1
+ por xmm7, xmm2
+%endmacro
+
+%macro SECOND_2_ROWS 0
+ movdqa xmm6, xmm0
+ movdqa xmm4, xmm0
+ movdqa xmm2, xmm1
+ pavgb xmm1, xmm3
+
+ ;calculate absolute value
+ psubusb xmm6, xmm2
+ psubusb xmm2, xmm0
+ psubusb xmm4, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm6, xmm2
+ paddusb xmm4, xmm3
+
+ pavgb xmm5, xmm1
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm3, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm6
+ psubusb xmm3, xmm4
+ pcmpeqb xmm2, xmm1
+ pcmpeqb xmm3, xmm1
+
+ por xmm7, xmm2
+ por xmm7, xmm3
+
+ pavgb xmm5, xmm0
+
+        ;decide whether or not to use the filtered value
+ pand xmm0, xmm7
+ pandn xmm7, xmm5
+ paddusb xmm0, xmm7
+%endmacro
+
+%macro UPDATE_FLIMIT 0
+ movdqu xmm2, XMMWORD PTR [rbx]
+ movdqu [rsp], xmm2
+ add rbx, 16
+%endmacro
+
+SECTION .text
+
+;void vpx_post_proc_down_and_across_mb_row_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned char *dst_ptr,
+; int src_pixels_per_line,
+; int dst_pixels_per_line,
+; int cols,
+; int *flimits,
+; int size
+;)
+globalsym(vpx_post_proc_down_and_across_mb_row_sse2)
+sym(vpx_post_proc_down_and_across_mb_row_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; put flimit on stack
+ mov rbx, arg(5) ;flimits ptr
+ UPDATE_FLIMIT
+
+%define flimit [rsp]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+
+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
+ movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
+.nextrow:
+ xor rdx, rdx ;col
+.nextcol:
+ ;load current and next 2 rows
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
+
+ FIRST_2_ROWS
+
+ ;load above 2 rows
+ neg rax
+ movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax]
+
+ SECOND_2_ROWS
+
+ movdqu XMMWORD PTR [rdi], xmm0
+
+ neg rax ; positive stride
+ add rsi, 16
+ add rdi, 16
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .downdone
+ UPDATE_FLIMIT
+ jmp .nextcol
+
+.downdone:
+        ; done with all the cols, start the across filtering in place
+ sub rsi, rdx
+ sub rdi, rdx
+
+ mov rbx, arg(5) ; flimits
+ UPDATE_FLIMIT
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rdi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ mov rdx, -8
+ movq [rdi+rdx], mm1
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(4)
+ movq mm1, [rdi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rdi+rdx], mm1
+
+ xor rdx, rdx
+ movq mm0, QWORD PTR [rdi-16];
+ movq mm1, QWORD PTR [rdi-8];
+
+.acrossnextcol:
+ movdqu xmm0, XMMWORD PTR [rdi + rdx]
+ movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
+
+ FIRST_2_ROWS
+
+ movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
+
+ SECOND_2_ROWS
+
+ movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
+ movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
+ movdq2q mm0, xmm0
+ psrldq xmm0, 8
+ movdq2q mm1, xmm0
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .acrossdone
+ UPDATE_FLIMIT
+ jmp .acrossnextcol
+
+.acrossdone:
+ ; last 16 pixels
+ movq QWORD PTR [rdi+rdx-16], mm0
+
+ cmp edx, dword arg(4)
+ jne .throw_last_8
+ movq QWORD PTR [rdi+rdx-8], mm1
+.throw_last_8:
+        ; done with this row
+ add rsi,rax ;next src line
+ mov eax, dword arg(3) ;dst_pixels_per_line
+ add rdi,rax ;next destination
+ mov eax, dword arg(2) ;src_pixels_per_line
+
+ mov rbx, arg(5) ;flimits
+ UPDATE_FLIMIT
+
+ dec rcx ;decrement count
+ jnz .nextrow ;next row
+
+ add rsp, 16
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit
+
+
+;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
+; int pitch, int rows, int cols,int flimit)
+globalsym(vpx_mbpost_proc_across_ip_sse2)
+sym(vpx_mbpost_proc_across_ip_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; create flimit4 at [rsp]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp], eax
+ mov [rsp+4], eax
+ mov [rsp+8], eax
+ mov [rsp+12], eax
+%define flimit4 [rsp]
+
+
+ ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+ xor rdx, rdx ;sumsq=0;
+ xor rcx, rcx ;sum=0;
+ mov rsi, arg(0); s
+
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rsi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+
+ mov rdi, -8
+ movq [rsi+rdi], mm1
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(3)
+ movq mm1, [rsi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rsi+rdx], mm1
+
+.ip_var_loop:
+ ;for(i=-8;i<=6;i++)
+ ;{
+ ; sumsq += s[i]*s[i];
+ ; sum += s[i];
+ ;}
+ movzx eax, byte [rsi+rdi]
+ add ecx, eax
+ mul al
+ add edx, eax
+ add rdi, 1
+ cmp rdi, 6
+ jle .ip_var_loop
+
+
+ ;mov rax, sumsq
+ ;movd xmm7, rax
+ movd xmm7, edx
+
+ ;mov rax, sum
+ ;movd xmm6, rax
+ movd xmm6, ecx
+
+ mov rsi, arg(0) ;s
+ xor rcx, rcx
+
+ movsxd rdx, dword arg(3) ;cols
+ add rdx, 8
+ pxor mm0, mm0
+ pxor mm1, mm1
+
+ pxor xmm0, xmm0
+.nextcol4:
+
+ movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
+ movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
+
+ punpcklbw xmm1, xmm0 ; expanding
+ punpcklbw xmm2, xmm0 ; expanding
+
+ punpcklwd xmm1, xmm0 ; expanding to dwords
+ punpcklwd xmm2, xmm0 ; expanding to dwords
+
+ psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
+ paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
+
+ paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
+ pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm1
+
+ pshufd xmm6, xmm6, 0 ; duplicate the last ones
+ pshufd xmm7, xmm7, 0 ; duplicate the last ones
+
+ psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
+ psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
+
+ pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
+ pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm3
+
+ pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
+ pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
+ pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ movdqa xmm3, xmm6
+ pmaddwd xmm3, xmm3
+
+ movdqa xmm5, xmm7
+ pslld xmm5, 4
+
+ psubd xmm5, xmm7
+ psubd xmm5, xmm3
+
+ psubd xmm5, flimit4
+ psrad xmm5, 31
+
+ packssdw xmm5, xmm0
+ packsswb xmm5, xmm0
+
+ movd xmm1, DWORD PTR [rsi+rcx]
+ movq xmm2, xmm1
+
+ punpcklbw xmm1, xmm0
+ punpcklwd xmm1, xmm0
+
+ paddd xmm1, xmm6
+ paddd xmm1, [GLOBAL(four8s)]
+
+ psrad xmm1, 4
+ packssdw xmm1, xmm0
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm5
+
+ pandn xmm5, xmm2
+ por xmm5, xmm1
+
+ movd [rsi+rcx-8], mm0
+ movq mm0, mm1
+
+ movdq2q mm1, xmm5
+ psrldq xmm7, 12
+
+ psrldq xmm6, 12
+ add rcx, 4
+
+ cmp rcx, rdx
+ jl .nextcol4
+
+ ;s+=pitch;
+ movsxd rax, dword arg(1)
+ add arg(0), rax
+
+ sub dword arg(2), 1 ;rows-=1
+ cmp dword arg(2), 0
+ jg .ip_row_loop
+
+ add rsp, 16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+
+SECTION_RODATA
+align 16
+four8s:
+ times 4 dd 8
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
new file mode 100644
index 0000000000..f3a8020292
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -0,0 +1,2930 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define pair256_set_epi16(a, b) \
+ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+#define pair256_set_epi32(a, b) \
+ _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
+ (int)(b), (int)(a))
+
+#if FDCT32x32_HIGH_PRECISION
+static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
+ __m256i buf0, buf1;
+ buf0 = _mm256_mul_epu32(a, b);
+ a = _mm256_srli_epi64(a, 32);
+ b = _mm256_srli_epi64(b, 32);
+ buf1 = _mm256_mul_epu32(a, b);
+ return _mm256_add_epi64(buf0, buf1);
+}
+
+static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
+ __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm256_unpacklo_epi64(buf0, buf1);
+}
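+
+// k_madd_epi32_avx2() forms, for each 64-bit lane, the sum of the products of
+// its two 32-bit halves (as unsigned 32x32->64 multiplies), much as pmaddwd
+// does for 16-bit elements, while k_packs_epi64_avx2() gathers the low 32 bits
+// of those 64-bit lanes from a and b back into a single register.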
+#endif
+
+void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
+ // Calculate pre-multiplied strides
+ const int str1 = stride;
+ const int str2 = 2 * stride;
+ const int str3 = 2 * stride + str1;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
+ // Constants
+  // When we use them, in one case, they are all the same. In all others
+  // it's a pair of them that we need to repeat eight times across the
+  // register. This is done by constructing the 32-bit constant corresponding
+  // to that pair.
+ const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64);
+ const __m256i k__cospi_p16_m16 =
+ pair256_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m256i k__cospi_m12_m20 =
+ pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ const __m256i kZero = _mm256_setzero_si256();
+ const __m256i kOne = _mm256_set1_epi16(1);
+ // Do the two transform/transpose passes
+ int pass;
+ for (pass = 0; pass < 2; ++pass) {
+ // We process sixteen columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 32; column_start += 16) {
+ __m256i step1[32];
+ __m256i step2[32];
+ __m256i step3[32];
+ __m256i out[32];
+ // Stage 1
+      // Note: even though all the loads below are aligned, using the aligned
+      // intrinsic makes the code slightly slower.
+ if (0 == pass) {
+ const int16_t *in = &input[column_start];
+ // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
+ __m256i *step1a = &step1[0];
+ __m256i *step1b = &step1[31];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
+ __m256i *step1a = &step1[4];
+ __m256i *step1b = &step1[27];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
+ __m256i *step1a = &step1[8];
+ __m256i *step1b = &step1[23];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
+ __m256i *step1a = &step1[12];
+ __m256i *step1b = &step1[19];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 =
+ _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 =
+ _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 =
+ _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 =
+ _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 =
+ _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 =
+ _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[0] = _mm256_add_epi16(ina0, inb0);
+ step1a[1] = _mm256_add_epi16(ina1, inb1);
+ step1a[2] = _mm256_add_epi16(ina2, inb2);
+ step1a[3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[0] = _mm256_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm256_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm256_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm256_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ } else {
+ int16_t *in = &intermediate[column_start];
+ // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
+ // Note: using the same approach as above to have common offset is
+ // counter-productive as all offsets can be calculated at compile
+ // time.
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32));
+ __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32));
+ __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32));
+ __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32));
+ __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
+ __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
+ __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
+ __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
+ step1[0] = _mm256_add_epi16(in00, in31);
+ step1[1] = _mm256_add_epi16(in01, in30);
+ step1[2] = _mm256_add_epi16(in02, in29);
+ step1[3] = _mm256_add_epi16(in03, in28);
+ step1[28] = _mm256_sub_epi16(in03, in28);
+ step1[29] = _mm256_sub_epi16(in02, in29);
+ step1[30] = _mm256_sub_epi16(in01, in30);
+ step1[31] = _mm256_sub_epi16(in00, in31);
+ }
+ {
+ __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32));
+ __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32));
+ __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32));
+ __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32));
+ __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
+ __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
+ __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
+ __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
+ step1[4] = _mm256_add_epi16(in04, in27);
+ step1[5] = _mm256_add_epi16(in05, in26);
+ step1[6] = _mm256_add_epi16(in06, in25);
+ step1[7] = _mm256_add_epi16(in07, in24);
+ step1[24] = _mm256_sub_epi16(in07, in24);
+ step1[25] = _mm256_sub_epi16(in06, in25);
+ step1[26] = _mm256_sub_epi16(in05, in26);
+ step1[27] = _mm256_sub_epi16(in04, in27);
+ }
+ {
+ __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32));
+ __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32));
+ __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
+ __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
+ __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
+ __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
+ __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
+ __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
+ step1[8] = _mm256_add_epi16(in08, in23);
+ step1[9] = _mm256_add_epi16(in09, in22);
+ step1[10] = _mm256_add_epi16(in10, in21);
+ step1[11] = _mm256_add_epi16(in11, in20);
+ step1[20] = _mm256_sub_epi16(in11, in20);
+ step1[21] = _mm256_sub_epi16(in10, in21);
+ step1[22] = _mm256_sub_epi16(in09, in22);
+ step1[23] = _mm256_sub_epi16(in08, in23);
+ }
+ {
+ __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
+ __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
+ __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
+ __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
+ __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
+ __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
+ __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
+ __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
+ step1[12] = _mm256_add_epi16(in12, in19);
+ step1[13] = _mm256_add_epi16(in13, in18);
+ step1[14] = _mm256_add_epi16(in14, in17);
+ step1[15] = _mm256_add_epi16(in15, in16);
+ step1[16] = _mm256_sub_epi16(in15, in16);
+ step1[17] = _mm256_sub_epi16(in14, in17);
+ step1[18] = _mm256_sub_epi16(in13, in18);
+ step1[19] = _mm256_sub_epi16(in12, in19);
+ }
+ }
+ // Stage 2
+ {
+ step2[0] = _mm256_add_epi16(step1[0], step1[15]);
+ step2[1] = _mm256_add_epi16(step1[1], step1[14]);
+ step2[2] = _mm256_add_epi16(step1[2], step1[13]);
+ step2[3] = _mm256_add_epi16(step1[3], step1[12]);
+ step2[4] = _mm256_add_epi16(step1[4], step1[11]);
+ step2[5] = _mm256_add_epi16(step1[5], step1[10]);
+ step2[6] = _mm256_add_epi16(step1[6], step1[9]);
+ step2[7] = _mm256_add_epi16(step1[7], step1[8]);
+ step2[8] = _mm256_sub_epi16(step1[7], step1[8]);
+ step2[9] = _mm256_sub_epi16(step1[6], step1[9]);
+ step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
+ step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
+ step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
+ step2[13] = _mm256_sub_epi16(step1[2], step1[13]);
+ step2[14] = _mm256_sub_epi16(step1[1], step1[14]);
+ step2[15] = _mm256_sub_epi16(step1[0], step1[15]);
+ }
+ {
+ const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]);
+ const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]);
+ const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]);
+ const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]);
+ const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]);
+ const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]);
+ const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]);
+ const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]);
+ const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s2_20_4 =
+ _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_20_5 =
+ _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_21_4 =
+ _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_21_5 =
+ _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_22_4 =
+ _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_22_5 =
+ _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_23_4 =
+ _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_23_5 =
+ _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_24_4 =
+ _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_24_5 =
+ _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_25_4 =
+ _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_25_5 =
+ _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_26_4 =
+ _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_26_5 =
+ _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_27_4 =
+ _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_27_5 =
+ _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7);
+ }
+
+#if !FDCT32x32_HIGH_PRECISION
+      // damp the magnitude by half so that the intermediate values stay
+      // within the range of 16 bits.
+ if (1 == pass) {
+ __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]);
+ __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]);
+ __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]);
+ __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]);
+ __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]);
+ __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]);
+ __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]);
+ __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]);
+ __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]);
+ __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]);
+ __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]);
+ __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]);
+ __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]);
+ __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]);
+ __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]);
+ __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]);
+ __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]);
+ __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]);
+ __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]);
+ __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]);
+ __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]);
+ __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]);
+ __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]);
+ __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]);
+ __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]);
+ __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]);
+ __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]);
+ __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]);
+ __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]);
+ __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]);
+ __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]);
+ __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]);
+
+ step2[0] = _mm256_sub_epi16(step2[0], s3_00_0);
+ step2[1] = _mm256_sub_epi16(step2[1], s3_01_0);
+ step2[2] = _mm256_sub_epi16(step2[2], s3_02_0);
+ step2[3] = _mm256_sub_epi16(step2[3], s3_03_0);
+ step2[4] = _mm256_sub_epi16(step2[4], s3_04_0);
+ step2[5] = _mm256_sub_epi16(step2[5], s3_05_0);
+ step2[6] = _mm256_sub_epi16(step2[6], s3_06_0);
+ step2[7] = _mm256_sub_epi16(step2[7], s3_07_0);
+ step2[8] = _mm256_sub_epi16(step2[8], s2_08_0);
+ step2[9] = _mm256_sub_epi16(step2[9], s2_09_0);
+ step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
+ step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
+ step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
+ step2[13] = _mm256_sub_epi16(step2[13], s3_13_0);
+ step2[14] = _mm256_sub_epi16(step2[14], s2_14_0);
+ step2[15] = _mm256_sub_epi16(step2[15], s2_15_0);
+ step1[16] = _mm256_sub_epi16(step1[16], s3_16_0);
+ step1[17] = _mm256_sub_epi16(step1[17], s3_17_0);
+ step1[18] = _mm256_sub_epi16(step1[18], s3_18_0);
+ step1[19] = _mm256_sub_epi16(step1[19], s3_19_0);
+ step2[20] = _mm256_sub_epi16(step2[20], s3_20_0);
+ step2[21] = _mm256_sub_epi16(step2[21], s3_21_0);
+ step2[22] = _mm256_sub_epi16(step2[22], s3_22_0);
+ step2[23] = _mm256_sub_epi16(step2[23], s3_23_0);
+ step2[24] = _mm256_sub_epi16(step2[24], s3_24_0);
+ step2[25] = _mm256_sub_epi16(step2[25], s3_25_0);
+ step2[26] = _mm256_sub_epi16(step2[26], s3_26_0);
+ step2[27] = _mm256_sub_epi16(step2[27], s3_27_0);
+ step1[28] = _mm256_sub_epi16(step1[28], s3_28_0);
+ step1[29] = _mm256_sub_epi16(step1[29], s3_29_0);
+ step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
+ step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
+
+ step2[0] = _mm256_add_epi16(step2[0], kOne);
+ step2[1] = _mm256_add_epi16(step2[1], kOne);
+ step2[2] = _mm256_add_epi16(step2[2], kOne);
+ step2[3] = _mm256_add_epi16(step2[3], kOne);
+ step2[4] = _mm256_add_epi16(step2[4], kOne);
+ step2[5] = _mm256_add_epi16(step2[5], kOne);
+ step2[6] = _mm256_add_epi16(step2[6], kOne);
+ step2[7] = _mm256_add_epi16(step2[7], kOne);
+ step2[8] = _mm256_add_epi16(step2[8], kOne);
+ step2[9] = _mm256_add_epi16(step2[9], kOne);
+ step2[10] = _mm256_add_epi16(step2[10], kOne);
+ step2[11] = _mm256_add_epi16(step2[11], kOne);
+ step2[12] = _mm256_add_epi16(step2[12], kOne);
+ step2[13] = _mm256_add_epi16(step2[13], kOne);
+ step2[14] = _mm256_add_epi16(step2[14], kOne);
+ step2[15] = _mm256_add_epi16(step2[15], kOne);
+ step1[16] = _mm256_add_epi16(step1[16], kOne);
+ step1[17] = _mm256_add_epi16(step1[17], kOne);
+ step1[18] = _mm256_add_epi16(step1[18], kOne);
+ step1[19] = _mm256_add_epi16(step1[19], kOne);
+ step2[20] = _mm256_add_epi16(step2[20], kOne);
+ step2[21] = _mm256_add_epi16(step2[21], kOne);
+ step2[22] = _mm256_add_epi16(step2[22], kOne);
+ step2[23] = _mm256_add_epi16(step2[23], kOne);
+ step2[24] = _mm256_add_epi16(step2[24], kOne);
+ step2[25] = _mm256_add_epi16(step2[25], kOne);
+ step2[26] = _mm256_add_epi16(step2[26], kOne);
+ step2[27] = _mm256_add_epi16(step2[27], kOne);
+ step1[28] = _mm256_add_epi16(step1[28], kOne);
+ step1[29] = _mm256_add_epi16(step1[29], kOne);
+ step1[30] = _mm256_add_epi16(step1[30], kOne);
+ step1[31] = _mm256_add_epi16(step1[31], kOne);
+
+ step2[0] = _mm256_srai_epi16(step2[0], 2);
+ step2[1] = _mm256_srai_epi16(step2[1], 2);
+ step2[2] = _mm256_srai_epi16(step2[2], 2);
+ step2[3] = _mm256_srai_epi16(step2[3], 2);
+ step2[4] = _mm256_srai_epi16(step2[4], 2);
+ step2[5] = _mm256_srai_epi16(step2[5], 2);
+ step2[6] = _mm256_srai_epi16(step2[6], 2);
+ step2[7] = _mm256_srai_epi16(step2[7], 2);
+ step2[8] = _mm256_srai_epi16(step2[8], 2);
+ step2[9] = _mm256_srai_epi16(step2[9], 2);
+ step2[10] = _mm256_srai_epi16(step2[10], 2);
+ step2[11] = _mm256_srai_epi16(step2[11], 2);
+ step2[12] = _mm256_srai_epi16(step2[12], 2);
+ step2[13] = _mm256_srai_epi16(step2[13], 2);
+ step2[14] = _mm256_srai_epi16(step2[14], 2);
+ step2[15] = _mm256_srai_epi16(step2[15], 2);
+ step1[16] = _mm256_srai_epi16(step1[16], 2);
+ step1[17] = _mm256_srai_epi16(step1[17], 2);
+ step1[18] = _mm256_srai_epi16(step1[18], 2);
+ step1[19] = _mm256_srai_epi16(step1[19], 2);
+ step2[20] = _mm256_srai_epi16(step2[20], 2);
+ step2[21] = _mm256_srai_epi16(step2[21], 2);
+ step2[22] = _mm256_srai_epi16(step2[22], 2);
+ step2[23] = _mm256_srai_epi16(step2[23], 2);
+ step2[24] = _mm256_srai_epi16(step2[24], 2);
+ step2[25] = _mm256_srai_epi16(step2[25], 2);
+ step2[26] = _mm256_srai_epi16(step2[26], 2);
+ step2[27] = _mm256_srai_epi16(step2[27], 2);
+ step1[28] = _mm256_srai_epi16(step1[28], 2);
+ step1[29] = _mm256_srai_epi16(step1[29], 2);
+ step1[30] = _mm256_srai_epi16(step1[30], 2);
+ step1[31] = _mm256_srai_epi16(step1[31], 2);
+ }
+#endif
+
+#if FDCT32x32_HIGH_PRECISION
+ if (pass == 0) {
+#endif
+ // Stage 3
+ {
+ step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
+ step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
+ step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
+ step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
+ step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
+ step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
+ step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
+ step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
+ }
+ {
+ const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+ const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+ const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+ const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+ const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
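+ // k__DCT_CONST_ROUNDING is 1 << (DCT_CONST_BITS - 1), so each add/srai pair
+ // below performs a rounding right shift of the 32-bit _mm256_madd_epi16
+ // products by DCT_CONST_BITS (the fixed-point precision of the cospi
+ // constants), and _mm256_packs_epi32 saturates the results back to 16 bits.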
+ const __m256i s3_10_4 =
+ _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_5 =
+ _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_4 =
+ _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_5 =
+ _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_4 =
+ _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_5 =
+ _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_4 =
+ _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_5 =
+ _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
+ }
+ {
+ step3[16] = _mm256_add_epi16(step2[23], step1[16]);
+ step3[17] = _mm256_add_epi16(step2[22], step1[17]);
+ step3[18] = _mm256_add_epi16(step2[21], step1[18]);
+ step3[19] = _mm256_add_epi16(step2[20], step1[19]);
+ step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
+ step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
+ step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
+ step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
+ step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
+ step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
+ step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
+ step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
+ step3[28] = _mm256_add_epi16(step2[27], step1[28]);
+ step3[29] = _mm256_add_epi16(step2[26], step1[29]);
+ step3[30] = _mm256_add_epi16(step2[25], step1[30]);
+ step3[31] = _mm256_add_epi16(step2[24], step1[31]);
+ }
+
+ // Stage 4
+ {
+ step1[0] = _mm256_add_epi16(step3[3], step3[0]);
+ step1[1] = _mm256_add_epi16(step3[2], step3[1]);
+ step1[2] = _mm256_sub_epi16(step3[1], step3[2]);
+ step1[3] = _mm256_sub_epi16(step3[0], step3[3]);
+ step1[8] = _mm256_add_epi16(step3[11], step2[8]);
+ step1[9] = _mm256_add_epi16(step3[10], step2[9]);
+ step1[10] = _mm256_sub_epi16(step2[9], step3[10]);
+ step1[11] = _mm256_sub_epi16(step2[8], step3[11]);
+ step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
+ step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
+ step1[14] = _mm256_add_epi16(step3[13], step2[14]);
+ step1[15] = _mm256_add_epi16(step3[12], step2[15]);
+ }
+ {
+ const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
+ const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
+ const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s1_05_4 =
+ _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_05_5 =
+ _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_06_4 =
+ _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_06_5 =
+ _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
+ }
+ {
+ const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
+ const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
+ const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
+ const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
+ const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
+ const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
+ const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
+ const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
+ const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m256i s1_18_4 =
+ _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_18_5 =
+ _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_19_4 =
+ _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_19_5 =
+ _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_20_4 =
+ _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_20_5 =
+ _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_21_4 =
+ _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_21_5 =
+ _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_26_4 =
+ _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_26_5 =
+ _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_27_4 =
+ _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_27_5 =
+ _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_28_4 =
+ _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_28_5 =
+ _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_29_4 =
+ _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_29_5 =
+ _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
+ }
+ // Stage 5
+ {
+ step2[4] = _mm256_add_epi16(step1[5], step3[4]);
+ step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
+ step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
+ step2[7] = _mm256_add_epi16(step1[6], step3[7]);
+ }
+ {
+ const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
+ const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
+ const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
+ const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
+ const __m256i out_00_2 =
+ _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m256i out_00_3 =
+ _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m256i out_16_2 =
+ _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m256i out_16_3 =
+ _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m256i out_08_2 =
+ _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m256i out_08_3 =
+ _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m256i out_24_2 =
+ _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m256i out_24_3 =
+ _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m256i out_00_4 =
+ _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_00_5 =
+ _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_16_4 =
+ _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_16_5 =
+ _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_08_4 =
+ _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_08_5 =
+ _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_24_4 =
+ _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_24_5 =
+ _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[0] = _mm256_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
+ out[8] = _mm256_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
+ }
+ {
+ const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]);
+ const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]);
+ const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
+ const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
+ const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m256i s2_09_4 =
+ _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_09_5 =
+ _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_10_4 =
+ _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_10_5 =
+ _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_13_4 =
+ _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_13_5 =
+ _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_14_4 =
+ _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_14_5 =
+ _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
+ }
+ {
+ step2[16] = _mm256_add_epi16(step1[19], step3[16]);
+ step2[17] = _mm256_add_epi16(step1[18], step3[17]);
+ step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
+ step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
+ step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
+ step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
+ step2[22] = _mm256_add_epi16(step1[21], step3[22]);
+ step2[23] = _mm256_add_epi16(step1[20], step3[23]);
+ step2[24] = _mm256_add_epi16(step1[27], step3[24]);
+ step2[25] = _mm256_add_epi16(step1[26], step3[25]);
+ step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
+ step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
+ step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
+ step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
+ step2[30] = _mm256_add_epi16(step1[29], step3[30]);
+ step2[31] = _mm256_add_epi16(step1[28], step3[31]);
+ }
+ // Stage 6
+ {
+ const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+ const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+ const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+ const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+ const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+ const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+ const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+ const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+ const __m256i out_04_2 =
+ _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m256i out_04_3 =
+ _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m256i out_20_2 =
+ _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m256i out_20_3 =
+ _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m256i out_12_2 =
+ _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m256i out_12_3 =
+ _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m256i out_28_2 =
+ _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m256i out_28_3 =
+ _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m256i out_04_4 =
+ _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_04_5 =
+ _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_20_4 =
+ _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_20_5 =
+ _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_12_4 =
+ _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_12_5 =
+ _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_28_4 =
+ _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_28_5 =
+ _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[4] = _mm256_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
+ }
+ {
+ step3[8] = _mm256_add_epi16(step2[9], step1[8]);
+ step3[9] = _mm256_sub_epi16(step1[8], step2[9]);
+ step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
+ step3[11] = _mm256_add_epi16(step2[10], step1[11]);
+ step3[12] = _mm256_add_epi16(step2[13], step1[12]);
+ step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
+ step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
+ step3[15] = _mm256_add_epi16(step2[14], step1[15]);
+ }
+ {
+ const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
+ const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
+ const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
+ const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
+ const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
+ const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
+ const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
+ const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
+ const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m256i s3_17_4 =
+ _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_17_5 =
+ _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_18_4 =
+ _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_18_5 =
+ _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_21_4 =
+ _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_21_5 =
+ _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_22_4 =
+ _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_22_5 =
+ _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m256i s3_25_4 =
+ _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_25_5 =
+ _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_26_4 =
+ _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_26_5 =
+ _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_29_4 =
+ _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_29_5 =
+ _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_30_4 =
+ _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_30_5 =
+ _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
+ }
+ // Stage 7
+ {
+ const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]);
+ const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]);
+ const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]);
+ const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]);
+ const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
+ const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
+ const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
+ const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
+ const __m256i out_02_2 =
+ _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m256i out_02_3 =
+ _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m256i out_18_2 =
+ _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m256i out_18_3 =
+ _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m256i out_10_2 =
+ _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m256i out_10_3 =
+ _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m256i out_26_2 =
+ _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m256i out_26_3 =
+ _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m256i out_06_2 =
+ _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m256i out_06_3 =
+ _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m256i out_22_2 =
+ _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m256i out_22_3 =
+ _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m256i out_14_2 =
+ _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m256i out_14_3 =
+ _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m256i out_30_2 =
+ _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m256i out_30_3 =
+ _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m256i out_02_4 =
+ _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_02_5 =
+ _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_18_4 =
+ _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_18_5 =
+ _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_10_4 =
+ _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_10_5 =
+ _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_26_4 =
+ _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_26_5 =
+ _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_06_4 =
+ _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_06_5 =
+ _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_22_4 =
+ _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_22_5 =
+ _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_14_4 =
+ _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_14_5 =
+ _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_30_4 =
+ _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_30_5 =
+ _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[2] = _mm256_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
+ out[6] = _mm256_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
+ }
+ {
+ step1[16] = _mm256_add_epi16(step3[17], step2[16]);
+ step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
+ step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
+ step1[19] = _mm256_add_epi16(step3[18], step2[19]);
+ step1[20] = _mm256_add_epi16(step3[21], step2[20]);
+ step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
+ step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
+ step1[23] = _mm256_add_epi16(step3[22], step2[23]);
+ step1[24] = _mm256_add_epi16(step3[25], step2[24]);
+ step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
+ step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
+ step1[27] = _mm256_add_epi16(step3[26], step2[27]);
+ step1[28] = _mm256_add_epi16(step3[29], step2[28]);
+ step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
+ step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
+ step1[31] = _mm256_add_epi16(step3[30], step2[31]);
+ }
+ // Final stage --- output indices are bit-reversed.
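+ // (e.g. the outputs driven by step1[16..19] land in out[1], out[17], out[9]
+ // and out[25], the 5-bit bit-reversals of 16..19.)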
+ {
+ const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
+ const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
+ const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
+ const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
+ const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
+ const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
+ const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
+ const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
+ const __m256i out_01_2 =
+ _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m256i out_01_3 =
+ _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m256i out_17_2 =
+ _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m256i out_17_3 =
+ _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m256i out_09_2 =
+ _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m256i out_09_3 =
+ _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m256i out_25_2 =
+ _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m256i out_25_3 =
+ _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m256i out_07_2 =
+ _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m256i out_07_3 =
+ _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m256i out_23_2 =
+ _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m256i out_23_3 =
+ _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m256i out_15_2 =
+ _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m256i out_15_3 =
+ _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m256i out_31_2 =
+ _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m256i out_31_3 =
+ _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m256i out_01_4 =
+ _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_01_5 =
+ _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_17_4 =
+ _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_17_5 =
+ _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_09_4 =
+ _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_09_5 =
+ _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_25_4 =
+ _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_25_5 =
+ _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_07_4 =
+ _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_07_5 =
+ _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_23_4 =
+ _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_23_5 =
+ _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_15_4 =
+ _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_15_5 =
+ _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_31_4 =
+ _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_31_5 =
+ _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[1] = _mm256_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
+ out[9] = _mm256_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
+ out[7] = _mm256_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
+ }
+ {
+ const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
+ const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
+ const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
+ const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
+ const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
+ const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
+ const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
+ const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
+ const __m256i out_05_2 =
+ _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m256i out_05_3 =
+ _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m256i out_21_2 =
+ _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m256i out_21_3 =
+ _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m256i out_13_2 =
+ _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m256i out_13_3 =
+ _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m256i out_29_2 =
+ _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m256i out_29_3 =
+ _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m256i out_03_2 =
+ _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m256i out_03_3 =
+ _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m256i out_19_2 =
+ _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m256i out_19_3 =
+ _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m256i out_11_2 =
+ _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m256i out_11_3 =
+ _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m256i out_27_2 =
+ _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m256i out_27_3 =
+ _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m256i out_05_4 =
+ _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_05_5 =
+ _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_21_4 =
+ _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_21_5 =
+ _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_13_4 =
+ _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_13_5 =
+ _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_29_4 =
+ _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_29_5 =
+ _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_03_4 =
+ _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_03_5 =
+ _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_19_4 =
+ _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_19_5 =
+ _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_11_4 =
+ _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_11_5 =
+ _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_27_4 =
+ _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_27_5 =
+ _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[5] = _mm256_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
+ out[3] = _mm256_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
+ }
+#if FDCT32x32_HIGH_PRECISION
+ } else {
+ __m256i lstep1[64], lstep2[64], lstep3[64];
+ __m256i u[32], v[32], sign[16];
+ const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+ const __m256i k__pOne_mOne = pair256_set_epi16(1, -1);
+ // start using 32-bit operations
+ // stage 3
+ {
+ // expanding to 32-bit length while adding and subtracting
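+ // With the 16-bit pairs interleaved, _mm256_madd_epi16 against kOne (1, 1)
+ // produces the 32-bit sum of each pair, while k__pOne_mOne (1, -1) produces
+ // the 32-bit difference (first element minus second), so the widening and
+ // the butterfly add/sub happen in a single instruction.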
+ lstep2[0] = _mm256_unpacklo_epi16(step2[0], step2[7]);
+ lstep2[1] = _mm256_unpackhi_epi16(step2[0], step2[7]);
+ lstep2[2] = _mm256_unpacklo_epi16(step2[1], step2[6]);
+ lstep2[3] = _mm256_unpackhi_epi16(step2[1], step2[6]);
+ lstep2[4] = _mm256_unpacklo_epi16(step2[2], step2[5]);
+ lstep2[5] = _mm256_unpackhi_epi16(step2[2], step2[5]);
+ lstep2[6] = _mm256_unpacklo_epi16(step2[3], step2[4]);
+ lstep2[7] = _mm256_unpackhi_epi16(step2[3], step2[4]);
+
+ lstep3[0] = _mm256_madd_epi16(lstep2[0], kOne);
+ lstep3[1] = _mm256_madd_epi16(lstep2[1], kOne);
+ lstep3[2] = _mm256_madd_epi16(lstep2[2], kOne);
+ lstep3[3] = _mm256_madd_epi16(lstep2[3], kOne);
+ lstep3[4] = _mm256_madd_epi16(lstep2[4], kOne);
+ lstep3[5] = _mm256_madd_epi16(lstep2[5], kOne);
+ lstep3[6] = _mm256_madd_epi16(lstep2[6], kOne);
+ lstep3[7] = _mm256_madd_epi16(lstep2[7], kOne);
+
+ lstep3[8] = _mm256_madd_epi16(lstep2[6], k__pOne_mOne);
+ lstep3[9] = _mm256_madd_epi16(lstep2[7], k__pOne_mOne);
+ lstep3[10] = _mm256_madd_epi16(lstep2[4], k__pOne_mOne);
+ lstep3[11] = _mm256_madd_epi16(lstep2[5], k__pOne_mOne);
+ lstep3[12] = _mm256_madd_epi16(lstep2[2], k__pOne_mOne);
+ lstep3[13] = _mm256_madd_epi16(lstep2[3], k__pOne_mOne);
+ lstep3[14] = _mm256_madd_epi16(lstep2[0], k__pOne_mOne);
+ lstep3[15] = _mm256_madd_epi16(lstep2[1], k__pOne_mOne);
+ }
+ {
+ const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+ const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+ const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+ const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+ const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s3_10_4 =
+ _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_5 =
+ _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_4 =
+ _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_5 =
+ _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_4 =
+ _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_5 =
+ _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_4 =
+ _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_5 =
+ _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ }
+ {
+ lstep1[32] = _mm256_unpacklo_epi16(step1[16], step2[23]);
+ lstep1[33] = _mm256_unpackhi_epi16(step1[16], step2[23]);
+ lstep1[34] = _mm256_unpacklo_epi16(step1[17], step2[22]);
+ lstep1[35] = _mm256_unpackhi_epi16(step1[17], step2[22]);
+ lstep1[36] = _mm256_unpacklo_epi16(step1[18], step2[21]);
+ lstep1[37] = _mm256_unpackhi_epi16(step1[18], step2[21]);
+ lstep1[38] = _mm256_unpacklo_epi16(step1[19], step2[20]);
+ lstep1[39] = _mm256_unpackhi_epi16(step1[19], step2[20]);
+
+ lstep1[56] = _mm256_unpacklo_epi16(step1[28], step2[27]);
+ lstep1[57] = _mm256_unpackhi_epi16(step1[28], step2[27]);
+ lstep1[58] = _mm256_unpacklo_epi16(step1[29], step2[26]);
+ lstep1[59] = _mm256_unpackhi_epi16(step1[29], step2[26]);
+ lstep1[60] = _mm256_unpacklo_epi16(step1[30], step2[25]);
+ lstep1[61] = _mm256_unpackhi_epi16(step1[30], step2[25]);
+ lstep1[62] = _mm256_unpacklo_epi16(step1[31], step2[24]);
+ lstep1[63] = _mm256_unpackhi_epi16(step1[31], step2[24]);
+
+ lstep3[32] = _mm256_madd_epi16(lstep1[32], kOne);
+ lstep3[33] = _mm256_madd_epi16(lstep1[33], kOne);
+ lstep3[34] = _mm256_madd_epi16(lstep1[34], kOne);
+ lstep3[35] = _mm256_madd_epi16(lstep1[35], kOne);
+ lstep3[36] = _mm256_madd_epi16(lstep1[36], kOne);
+ lstep3[37] = _mm256_madd_epi16(lstep1[37], kOne);
+ lstep3[38] = _mm256_madd_epi16(lstep1[38], kOne);
+ lstep3[39] = _mm256_madd_epi16(lstep1[39], kOne);
+
+ lstep3[40] = _mm256_madd_epi16(lstep1[38], k__pOne_mOne);
+ lstep3[41] = _mm256_madd_epi16(lstep1[39], k__pOne_mOne);
+ lstep3[42] = _mm256_madd_epi16(lstep1[36], k__pOne_mOne);
+ lstep3[43] = _mm256_madd_epi16(lstep1[37], k__pOne_mOne);
+ lstep3[44] = _mm256_madd_epi16(lstep1[34], k__pOne_mOne);
+ lstep3[45] = _mm256_madd_epi16(lstep1[35], k__pOne_mOne);
+ lstep3[46] = _mm256_madd_epi16(lstep1[32], k__pOne_mOne);
+ lstep3[47] = _mm256_madd_epi16(lstep1[33], k__pOne_mOne);
+
+ lstep3[48] = _mm256_madd_epi16(lstep1[62], k__pOne_mOne);
+ lstep3[49] = _mm256_madd_epi16(lstep1[63], k__pOne_mOne);
+ lstep3[50] = _mm256_madd_epi16(lstep1[60], k__pOne_mOne);
+ lstep3[51] = _mm256_madd_epi16(lstep1[61], k__pOne_mOne);
+ lstep3[52] = _mm256_madd_epi16(lstep1[58], k__pOne_mOne);
+ lstep3[53] = _mm256_madd_epi16(lstep1[59], k__pOne_mOne);
+ lstep3[54] = _mm256_madd_epi16(lstep1[56], k__pOne_mOne);
+ lstep3[55] = _mm256_madd_epi16(lstep1[57], k__pOne_mOne);
+
+ lstep3[56] = _mm256_madd_epi16(lstep1[56], kOne);
+ lstep3[57] = _mm256_madd_epi16(lstep1[57], kOne);
+ lstep3[58] = _mm256_madd_epi16(lstep1[58], kOne);
+ lstep3[59] = _mm256_madd_epi16(lstep1[59], kOne);
+ lstep3[60] = _mm256_madd_epi16(lstep1[60], kOne);
+ lstep3[61] = _mm256_madd_epi16(lstep1[61], kOne);
+ lstep3[62] = _mm256_madd_epi16(lstep1[62], kOne);
+ lstep3[63] = _mm256_madd_epi16(lstep1[63], kOne);
+ }
+
+ // stage 4
+ {
+ // expanding to 32-bit length prior to addition operations
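+ // _mm256_cmpgt_epi16(kZero, x) is the 16-bit sign mask of x, so interleaving
+ // x with that mask sign-extends each lane to 32 bits.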
+ sign[0] = _mm256_cmpgt_epi16(kZero, step2[8]);
+ sign[1] = _mm256_cmpgt_epi16(kZero, step2[9]);
+ sign[2] = _mm256_cmpgt_epi16(kZero, step2[14]);
+ sign[3] = _mm256_cmpgt_epi16(kZero, step2[15]);
+ lstep2[16] = _mm256_unpacklo_epi16(step2[8], sign[0]);
+ lstep2[17] = _mm256_unpackhi_epi16(step2[8], sign[0]);
+ lstep2[18] = _mm256_unpacklo_epi16(step2[9], sign[1]);
+ lstep2[19] = _mm256_unpackhi_epi16(step2[9], sign[1]);
+ lstep2[28] = _mm256_unpacklo_epi16(step2[14], sign[2]);
+ lstep2[29] = _mm256_unpackhi_epi16(step2[14], sign[2]);
+ lstep2[30] = _mm256_unpacklo_epi16(step2[15], sign[3]);
+ lstep2[31] = _mm256_unpackhi_epi16(step2[15], sign[3]);
+
+ lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]);
+ lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]);
+ lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]);
+ lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]);
+ lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]);
+ lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]);
+ lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]);
+ lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]);
+ lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
+ lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
+ lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
+ lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]);
+ lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]);
+ lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]);
+ lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]);
+ lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]);
+ lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]);
+ lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]);
+ lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]);
+ lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]);
+ lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]);
+ lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]);
+ lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]);
+ lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
+ }
+ {
+ // Stage 4: compute step1[5] and step1[6] (the cospi_16_64 rotation of
+ // step3[5] and step3[6]) in 32-bit precision; the results land in
+ // lstep1[10..13].
+ const __m256i k32_p16_p16 =
+ pair256_set_epi32(cospi_16_64, cospi_16_64);
+ const __m256i k32_p16_m16 =
+ pair256_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+ // TODO(jingning): manually inline k_madd_epi32_avx2 to further hide
+ // instruction latency.
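+ // k_madd_epi32_avx2() is the 32-bit analogue of _mm256_madd_epi16: each pair
+ // of 32-bit lanes is multiplied by the matching pair of cospi constants and
+ // the two products are summed into one 64-bit result; k_packs_epi64_avx2()
+ // then narrows those results back to 32-bit lanes ahead of the usual
+ // rounding shift.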
+ v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+ v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+ v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+ v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+ v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ }
+ {
+ const __m256i k32_m08_p24 =
+ pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_m24_m08 =
+ pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m256i k32_p24_p08 =
+ pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
+ u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
+ u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
+ u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]);
+ u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
+ u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+ v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08);
+ v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
+ v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+ v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+ v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+ v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+ v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+ v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+ v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+ v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 5
+ {
+ lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]);
+ lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]);
+ lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]);
+ lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]);
+ lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
+ lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
+ lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
+ lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
+ }
+ {
+ const __m256i k32_p16_p16 =
+ pair256_set_epi32(cospi_16_64, cospi_16_64);
+ const __m256i k32_p16_m16 =
+ pair256_set_epi32(cospi_16_64, -cospi_16_64);
+ const __m256i k32_p24_p08 =
+ pair256_set_epi32(cospi_24_64, cospi_8_64);
+ const __m256i k32_m08_p24 =
+ pair256_set_epi32(-cospi_8_64, cospi_24_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+ // TODO(jingning): manually inline k_madd_epi32_avx2 to further hide
+ // instruction latency.
+ v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+ v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+ v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+ v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+ v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+ v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+ v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+ v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+ v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+ v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
+ sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+
+ u[0] = _mm256_sub_epi32(u[0], sign[0]);
+ u[1] = _mm256_sub_epi32(u[1], sign[1]);
+ u[2] = _mm256_sub_epi32(u[2], sign[2]);
+ u[3] = _mm256_sub_epi32(u[3], sign[3]);
+ u[4] = _mm256_sub_epi32(u[4], sign[4]);
+ u[5] = _mm256_sub_epi32(u[5], sign[5]);
+ u[6] = _mm256_sub_epi32(u[6], sign[6]);
+ u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm256_add_epi32(u[0], K32One);
+ u[1] = _mm256_add_epi32(u[1], K32One);
+ u[2] = _mm256_add_epi32(u[2], K32One);
+ u[3] = _mm256_add_epi32(u[3], K32One);
+ u[4] = _mm256_add_epi32(u[4], K32One);
+ u[5] = _mm256_add_epi32(u[5], K32One);
+ u[6] = _mm256_add_epi32(u[6], K32One);
+ u[7] = _mm256_add_epi32(u[7], K32One);
+
+ u[0] = _mm256_srai_epi32(u[0], 2);
+ u[1] = _mm256_srai_epi32(u[1], 2);
+ u[2] = _mm256_srai_epi32(u[2], 2);
+ u[3] = _mm256_srai_epi32(u[3], 2);
+ u[4] = _mm256_srai_epi32(u[4], 2);
+ u[5] = _mm256_srai_epi32(u[5], 2);
+ u[6] = _mm256_srai_epi32(u[6], 2);
+ u[7] = _mm256_srai_epi32(u[7], 2);
+
+ // Combine
+ out[0] = _mm256_packs_epi32(u[0], u[1]);
+ out[16] = _mm256_packs_epi32(u[2], u[3]);
+ out[8] = _mm256_packs_epi32(u[4], u[5]);
+ out[24] = _mm256_packs_epi32(u[6], u[7]);
+ }
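For readers following the arithmetic: every unpack / k_madd_epi32_avx2 /
k_packs_epi64_avx2 / rounding / shift group in this function computes the same
thing per output lane, a dot product of an interleaved input pair with a
cosine pair, rounded by DCT_CONST_BITS. A minimal scalar sketch of that step
(the helper name below is illustrative; the DCT_CONST_BITS value assumes the
usual definition in vpx_dsp/txfm_common.h, and the AVX2 helpers themselves are
defined earlier in this header, outside this hunk):

    #include <stdint.h>

    /* Illustrative scalar model of one butterfly lane: the unpack pairs the
     * inputs, k_madd_epi32_avx2 forms x*c0 + y*c1 as a 64-bit sum,
     * k_packs_epi64_avx2 narrows it back to 32 bits, and the
     * k__DCT_CONST_ROUNDING add plus the arithmetic shift finish the
     * rounded scaling. */
    #define DCT_CONST_BITS 14 /* assumption: matches vpx_dsp/txfm_common.h */

    static int32_t butterfly_dot(int32_t x, int32_t y, int32_t c0, int32_t c1) {
      const int64_t sum = (int64_t)x * c0 + (int64_t)y * c1;
      /* The vector code narrows to 32 bits before adding the rounding
       * constant; doing everything in 64 bits here gives the same result
       * whenever the sum fits in 32 bits, which the 32-bit path assumes. */
      return (int32_t)((sum + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
    }

Each __m256i holds four such (x, y) pairs, so one k_madd_epi32_avx2 call yields
four 64-bit sums and two calls are packed into a single vector of eight 32-bit
results, which is why the v[] products are always combined in pairs above.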
+ {
+ const __m256i k32_m08_p24 =
+ pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_m24_m08 =
+ pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m256i k32_p24_p08 =
+ pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
+ v[8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+ v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+ v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+ v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+ v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+ u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS);
+ lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS);
+ lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS);
+ lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS);
+ lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS);
+ lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS);
+ lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS);
+ lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS);
+ }
+ {
+ lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
+ lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]);
+ lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]);
+ lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]);
+ lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]);
+ lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]);
+ lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
+ lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]);
+ lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]);
+ lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]);
+ lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]);
+ lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]);
+ lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]);
+ lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]);
+ lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]);
+ lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]);
+ lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]);
+ lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]);
+ lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]);
+ lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]);
+ lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]);
+ lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]);
+ lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]);
+ lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]);
+ lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]);
+ lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]);
+ lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]);
+ lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]);
+ lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]);
+ lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]);
+ lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]);
+ lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]);
+ }
+ // stage 6
+ {
+ const __m256i k32_p28_p04 =
+ pair256_set_epi32(cospi_28_64, cospi_4_64);
+ const __m256i k32_p12_p20 =
+ pair256_set_epi32(cospi_12_64, cospi_20_64);
+ const __m256i k32_m20_p12 =
+ pair256_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m256i k32_m04_p28 =
+ pair256_set_epi32(-cospi_4_64, cospi_28_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
+ u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
+ v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
+ sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+
+ u[0] = _mm256_sub_epi32(u[0], sign[0]);
+ u[1] = _mm256_sub_epi32(u[1], sign[1]);
+ u[2] = _mm256_sub_epi32(u[2], sign[2]);
+ u[3] = _mm256_sub_epi32(u[3], sign[3]);
+ u[4] = _mm256_sub_epi32(u[4], sign[4]);
+ u[5] = _mm256_sub_epi32(u[5], sign[5]);
+ u[6] = _mm256_sub_epi32(u[6], sign[6]);
+ u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm256_add_epi32(u[0], K32One);
+ u[1] = _mm256_add_epi32(u[1], K32One);
+ u[2] = _mm256_add_epi32(u[2], K32One);
+ u[3] = _mm256_add_epi32(u[3], K32One);
+ u[4] = _mm256_add_epi32(u[4], K32One);
+ u[5] = _mm256_add_epi32(u[5], K32One);
+ u[6] = _mm256_add_epi32(u[6], K32One);
+ u[7] = _mm256_add_epi32(u[7], K32One);
+
+ u[0] = _mm256_srai_epi32(u[0], 2);
+ u[1] = _mm256_srai_epi32(u[1], 2);
+ u[2] = _mm256_srai_epi32(u[2], 2);
+ u[3] = _mm256_srai_epi32(u[3], 2);
+ u[4] = _mm256_srai_epi32(u[4], 2);
+ u[5] = _mm256_srai_epi32(u[5], 2);
+ u[6] = _mm256_srai_epi32(u[6], 2);
+ u[7] = _mm256_srai_epi32(u[7], 2);
+
+ out[4] = _mm256_packs_epi32(u[0], u[1]);
+ out[20] = _mm256_packs_epi32(u[2], u[3]);
+ out[12] = _mm256_packs_epi32(u[4], u[5]);
+ out[28] = _mm256_packs_epi32(u[6], u[7]);
+ }
+ {
+ lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]);
+ lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]);
+ lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]);
+ lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]);
+ lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]);
+ lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]);
+ lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]);
+ lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]);
+ lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]);
+ lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]);
+ lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]);
+ lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]);
+ lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]);
+ lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]);
+ lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]);
+ lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
+ }
+ {
+ const __m256i k32_m04_p28 =
+ pair256_set_epi32(-cospi_4_64, cospi_28_64);
+ const __m256i k32_m28_m04 =
+ pair256_set_epi32(-cospi_28_64, -cospi_4_64);
+ const __m256i k32_m20_p12 =
+ pair256_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m256i k32_m12_m20 =
+ pair256_set_epi32(-cospi_12_64, -cospi_20_64);
+ const __m256i k32_p12_p20 =
+ pair256_set_epi32(cospi_12_64, cospi_20_64);
+ const __m256i k32_p28_p04 =
+ pair256_set_epi32(cospi_28_64, cospi_4_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
+ u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
+ u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
+ u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
+ u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
+ u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
+ u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
+ u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
+ u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
+ u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
+ u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
+ u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
+ u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
+ u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]);
+ u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
+ u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04);
+ v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
+ v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20);
+ v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20);
+ v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
+ v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28);
+ v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+ v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+ v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+ v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 7
+ {
+ const __m256i k32_p30_p02 =
+ pair256_set_epi32(cospi_30_64, cospi_2_64);
+ const __m256i k32_p14_p18 =
+ pair256_set_epi32(cospi_14_64, cospi_18_64);
+ const __m256i k32_p22_p10 =
+ pair256_set_epi32(cospi_22_64, cospi_10_64);
+ const __m256i k32_p06_p26 =
+ pair256_set_epi32(cospi_6_64, cospi_26_64);
+ const __m256i k32_m26_p06 =
+ pair256_set_epi32(-cospi_26_64, cospi_6_64);
+ const __m256i k32_m10_p22 =
+ pair256_set_epi32(-cospi_10_64, cospi_22_64);
+ const __m256i k32_m18_p14 =
+ pair256_set_epi32(-cospi_18_64, cospi_14_64);
+ const __m256i k32_m02_p30 =
+ pair256_set_epi32(-cospi_2_64, cospi_30_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
+ u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
+ u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
+ u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
+ u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
+ u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
+ u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
+ u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
+ u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
+ u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
+ u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]);
+ u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
+ u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18);
+ v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10);
+ v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14);
+ v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30);
+ v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30);
+ v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30);
+ v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+ v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
+ v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
+ v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[0] = _mm256_sub_epi32(u[0], v[0]);
+ u[1] = _mm256_sub_epi32(u[1], v[1]);
+ u[2] = _mm256_sub_epi32(u[2], v[2]);
+ u[3] = _mm256_sub_epi32(u[3], v[3]);
+ u[4] = _mm256_sub_epi32(u[4], v[4]);
+ u[5] = _mm256_sub_epi32(u[5], v[5]);
+ u[6] = _mm256_sub_epi32(u[6], v[6]);
+ u[7] = _mm256_sub_epi32(u[7], v[7]);
+ u[8] = _mm256_sub_epi32(u[8], v[8]);
+ u[9] = _mm256_sub_epi32(u[9], v[9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[2] = _mm256_packs_epi32(u[0], u[1]);
+ out[18] = _mm256_packs_epi32(u[2], u[3]);
+ out[10] = _mm256_packs_epi32(u[4], u[5]);
+ out[26] = _mm256_packs_epi32(u[6], u[7]);
+ out[6] = _mm256_packs_epi32(u[8], u[9]);
+ out[22] = _mm256_packs_epi32(u[10], u[11]);
+ out[14] = _mm256_packs_epi32(u[12], u[13]);
+ out[30] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ {
+ lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]);
+ lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]);
+ lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]);
+ lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]);
+ lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]);
+ lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]);
+ lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]);
+ lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]);
+ lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]);
+ lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]);
+ lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]);
+ lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]);
+ lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]);
+ lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]);
+ lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]);
+ lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]);
+ lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]);
+ lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]);
+ lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]);
+ lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]);
+ lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]);
+ lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]);
+ lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]);
+ lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]);
+ lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]);
+ lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]);
+ lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]);
+ lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]);
+ lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]);
+ lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]);
+ lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]);
+ lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]);
+ }
+ // stage 8
+ {
+ const __m256i k32_p31_p01 =
+ pair256_set_epi32(cospi_31_64, cospi_1_64);
+ const __m256i k32_p15_p17 =
+ pair256_set_epi32(cospi_15_64, cospi_17_64);
+ const __m256i k32_p23_p09 =
+ pair256_set_epi32(cospi_23_64, cospi_9_64);
+ const __m256i k32_p07_p25 =
+ pair256_set_epi32(cospi_7_64, cospi_25_64);
+ const __m256i k32_m25_p07 =
+ pair256_set_epi32(-cospi_25_64, cospi_7_64);
+ const __m256i k32_m09_p23 =
+ pair256_set_epi32(-cospi_9_64, cospi_23_64);
+ const __m256i k32_m17_p15 =
+ pair256_set_epi32(-cospi_17_64, cospi_15_64);
+ const __m256i k32_m01_p31 =
+ pair256_set_epi32(-cospi_1_64, cospi_31_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
+ u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
+ u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
+ u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
+ u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
+ u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
+ u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]);
+ u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
+ u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17);
+ v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09);
+ v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15);
+ v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31);
+ v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31);
+ v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31);
+ v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+ v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
+ v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
+ v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[0] = _mm256_sub_epi32(u[0], v[0]);
+ u[1] = _mm256_sub_epi32(u[1], v[1]);
+ u[2] = _mm256_sub_epi32(u[2], v[2]);
+ u[3] = _mm256_sub_epi32(u[3], v[3]);
+ u[4] = _mm256_sub_epi32(u[4], v[4]);
+ u[5] = _mm256_sub_epi32(u[5], v[5]);
+ u[6] = _mm256_sub_epi32(u[6], v[6]);
+ u[7] = _mm256_sub_epi32(u[7], v[7]);
+ u[8] = _mm256_sub_epi32(u[8], v[8]);
+ u[9] = _mm256_sub_epi32(u[9], v[9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[1] = _mm256_packs_epi32(u[0], u[1]);
+ out[17] = _mm256_packs_epi32(u[2], u[3]);
+ out[9] = _mm256_packs_epi32(u[4], u[5]);
+ out[25] = _mm256_packs_epi32(u[6], u[7]);
+ out[7] = _mm256_packs_epi32(u[8], u[9]);
+ out[23] = _mm256_packs_epi32(u[10], u[11]);
+ out[15] = _mm256_packs_epi32(u[12], u[13]);
+ out[31] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ {
+ const __m256i k32_p27_p05 =
+ pair256_set_epi32(cospi_27_64, cospi_5_64);
+ const __m256i k32_p11_p21 =
+ pair256_set_epi32(cospi_11_64, cospi_21_64);
+ const __m256i k32_p19_p13 =
+ pair256_set_epi32(cospi_19_64, cospi_13_64);
+ const __m256i k32_p03_p29 =
+ pair256_set_epi32(cospi_3_64, cospi_29_64);
+ const __m256i k32_m29_p03 =
+ pair256_set_epi32(-cospi_29_64, cospi_3_64);
+ const __m256i k32_m13_p19 =
+ pair256_set_epi32(-cospi_13_64, cospi_19_64);
+ const __m256i k32_m21_p11 =
+ pair256_set_epi32(-cospi_21_64, cospi_11_64);
+ const __m256i k32_m05_p27 =
+ pair256_set_epi32(-cospi_5_64, cospi_27_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
+ u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
+ u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
+ u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
+ u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
+ u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
+ u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]);
+ u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
+ u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21);
+ v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13);
+ v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
+ v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19);
+ v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
+ v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11);
+ v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11);
+ v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11);
+ v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11);
+ v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27);
+ v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27);
+ v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27);
+ v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+ v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+ v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+ v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+ v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+ v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+ v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+ v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+ v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
+ v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
+ v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[0] = _mm256_sub_epi32(u[0], v[0]);
+ u[1] = _mm256_sub_epi32(u[1], v[1]);
+ u[2] = _mm256_sub_epi32(u[2], v[2]);
+ u[3] = _mm256_sub_epi32(u[3], v[3]);
+ u[4] = _mm256_sub_epi32(u[4], v[4]);
+ u[5] = _mm256_sub_epi32(u[5], v[5]);
+ u[6] = _mm256_sub_epi32(u[6], v[6]);
+ u[7] = _mm256_sub_epi32(u[7], v[7]);
+ u[8] = _mm256_sub_epi32(u[8], v[8]);
+ u[9] = _mm256_sub_epi32(u[9], v[9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[5] = _mm256_packs_epi32(u[0], u[1]);
+ out[21] = _mm256_packs_epi32(u[2], u[3]);
+ out[13] = _mm256_packs_epi32(u[4], u[5]);
+ out[29] = _mm256_packs_epi32(u[6], u[7]);
+ out[3] = _mm256_packs_epi32(u[8], u[9]);
+ out[19] = _mm256_packs_epi32(u[10], u[11]);
+ out[11] = _mm256_packs_epi32(u[12], u[13]);
+ out[27] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ }
+#endif
+ // Transpose the results, do it as four 8x8 transposes.
+ {
+ int transpose_block;
+ int16_t *output_currStep, *output_nextStep;
+ if (0 == pass) {
+ output_currStep = &intermediate[column_start * 32];
+ output_nextStep = &intermediate[(column_start + 8) * 32];
+ } else {
+ output_currStep = &output_org[column_start * 32];
+ output_nextStep = &output_org[(column_start + 8) * 32];
+ }
+ for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+ __m256i *this_out = &out[8 * transpose_block];
+ // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
+ // 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
+ // 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
+ // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
+ // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
+ // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
+ // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
+ const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
+ const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
+ const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
+ const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
+ const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
+ const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
+ const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
+ const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
+ // 00 20 01 21 02 22 03 23 08 28 09 29 10 30 11 31
+ // 40 60 41 61 42 62 43 63 48 68 49 69 50 70 51 71
+ // 04 24 05 25 06 26 07 27 12 32 13 33 14 34 15 35
+ // 44 64 45 65 46 66 47 67 52 72 53 73 54 74 55 75
+ // 80 100 81 101 82 102 83 103 88 108 89 109 90 110 91 111
+ // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
+ // 84 104 85 105 86 106 87 107 92 112 93 113 94 114 95 115
+ // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
+
+ const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+ const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+ const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+ const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+ const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+ const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+ const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+ const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 20 40 60 01 21 41 61 08 28 48 68 09 29 49 69
+ // 04 24 44 64 05 25 45 65 12 32 52 72 13 33 53 73
+ // 02 22 42 62 03 23 43 63 10 30 50 70 11 31 51 71
+ // 06 26 46 66 07 27 47 67 14 34 54 74 15 35 55 75
+ // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
+ // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
+ // 82 102 122 142 83 103 123 143 90 110 130 150 91 111 131 151
+ // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
+ __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
+ // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
+ // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
+ // 03 23 43 63 83 103 123 143 11 31 51 71 91 111 131 151
+ // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
+ // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
+ // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
+ // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
+ if (0 == pass) {
+ // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+ // TODO(cd): see quality impact of only doing
+ // output[j] = (output[j] + 1) >> 2;
+ // which would remove the code between here ...
+ __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
+ __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
+ __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
+ __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
+ __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
+ __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
+ __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
+ __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
+ tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
+ tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
+ tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
+ tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
+ tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
+ tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
+ tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
+ tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
+ // ... and here.
+ // PS: also change code in vp9/encoder/vp9_dct.c
+ tr2_0 = _mm256_add_epi16(tr2_0, kOne);
+ tr2_1 = _mm256_add_epi16(tr2_1, kOne);
+ tr2_2 = _mm256_add_epi16(tr2_2, kOne);
+ tr2_3 = _mm256_add_epi16(tr2_3, kOne);
+ tr2_4 = _mm256_add_epi16(tr2_4, kOne);
+ tr2_5 = _mm256_add_epi16(tr2_5, kOne);
+ tr2_6 = _mm256_add_epi16(tr2_6, kOne);
+ tr2_7 = _mm256_add_epi16(tr2_7, kOne);
+ tr2_0 = _mm256_srai_epi16(tr2_0, 2);
+ tr2_1 = _mm256_srai_epi16(tr2_1, 2);
+ tr2_2 = _mm256_srai_epi16(tr2_2, 2);
+ tr2_3 = _mm256_srai_epi16(tr2_3, 2);
+ tr2_4 = _mm256_srai_epi16(tr2_4, 2);
+ tr2_5 = _mm256_srai_epi16(tr2_5, 2);
+ tr2_6 = _mm256_srai_epi16(tr2_6, 2);
+ tr2_7 = _mm256_srai_epi16(tr2_7, 2);
+ }
+ // Note: even though all these stores are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
+ _mm256_castsi256_si128(tr2_0));
+ _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
+ _mm256_castsi256_si128(tr2_1));
+ _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
+ _mm256_castsi256_si128(tr2_2));
+ _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
+ _mm256_castsi256_si128(tr2_3));
+ _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
+ _mm256_castsi256_si128(tr2_4));
+ _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
+ _mm256_castsi256_si128(tr2_5));
+ _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
+ _mm256_castsi256_si128(tr2_6));
+ _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
+ _mm256_castsi256_si128(tr2_7));
+
+ _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
+ _mm256_extractf128_si256(tr2_0, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
+ _mm256_extractf128_si256(tr2_1, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
+ _mm256_extractf128_si256(tr2_2, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
+ _mm256_extractf128_si256(tr2_3, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
+ _mm256_extractf128_si256(tr2_4, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
+ _mm256_extractf128_si256(tr2_5, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
+ _mm256_extractf128_si256(tr2_6, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
+ _mm256_extractf128_si256(tr2_7, 1));
+ // Process next 8x8
+ output_currStep += 8;
+ output_nextStep += 8;
+ }
+ }
+ }
+ }
+} // NOLINT
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
new file mode 100644
index 0000000000..bf350b6da0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -0,0 +1,3130 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "vpx_dsp/fwd_txfm.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+// TODO(jingning) The high bit-depth version needs re-work for performance.
+// The current SSE2 implementation also cross-references the static
+// functions in the C implementation file.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+#if FDCT32x32_HIGH_PRECISION
+static void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
+ vpx_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+}
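+// This C row pass is the fallback target (HIGH_FDCT32x32_2D_ROWS_C) used
+// when the 16-bit SIMD path detects saturation in the second pass.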
+#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c
+#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c
+#else
+static void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate,
+ tran_low_t *out) {
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
+ vpx_fdct32(temp_in, temp_out, 1);
+ for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
+ }
+}
+#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c
+#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c
+#endif // FDCT32x32_HIGH_PRECISION
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif // DCT_HIGH_BIT_DEPTH
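+// In the DCT_HIGH_BIT_DEPTH build the saturating _mm_adds/_mm_subs variants
+// pin an overflowing lane at INT16_MAX/INT16_MIN, which the
+// check_epi16_overflow_*() calls below detect before falling back to the C
+// transforms.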
+
+void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
+ // Calculate pre-multiplied strides
+ const int str1 = stride;
+ const int str2 = 2 * stride;
+ const int str3 = 2 * stride + str1;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
+ // Constants
+  // In one case all eight 16-bit lanes hold the same value; in all the
+  // others a pair of values is repeated four times, which is done by
+  // constructing the 32-bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_setzero_si128();
+ const __m128i kOne = _mm_set1_epi16(1);
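+  // Each pair_set_epi16(a, b) constant interleaves a and b across the eight
+  // 16-bit lanes, so _mm_madd_epi16 applied to _mm_unpack{lo,hi}_epi16(u, v)
+  // data yields u * a + v * b in every 32-bit lane of the result.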
+
+ // Do the two transform/transpose passes
+ int pass;
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 32; column_start += 8) {
+ __m128i step1[32];
+ __m128i step2[32];
+ __m128i step3[32];
+ __m128i out[32];
+ // Stage 1
+      // Note: even though all the loads below are aligned, using the aligned
+      // intrinsic makes the code slightly slower.
+ if (0 == pass) {
+ const int16_t *in = &input[column_start];
+ // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
+ __m128i *step1a = &step1[0];
+ __m128i *step1b = &step1[31];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
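+        // Each of these four blocks is a stage-1 butterfly on mirrored rows
+        // i and 31 - i: step1[i] = (row[i] + row[31 - i]) << 2 and
+        // step1[31 - i] = (row[i] - row[31 - i]) << 2, where the << 2 is the
+        // input scaling applied only in the first pass.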
+ {
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
+ __m128i *step1a = &step1[4];
+ __m128i *step1b = &step1[27];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
+ __m128i *step1a = &step1[8];
+ __m128i *step1b = &step1[23];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
+ __m128i *step1a = &step1[12];
+ __m128i *step1b = &step1[19];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[0] = _mm_slli_epi16(step1a[0], 2);
+ step1a[1] = _mm_slli_epi16(step1a[1], 2);
+ step1a[2] = _mm_slli_epi16(step1a[2], 2);
+ step1a[3] = _mm_slli_epi16(step1a[3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ } else {
+ int16_t *in = &intermediate[column_start];
+ // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
+        // Note: using the same approach as above to share a common offset
+        // is counter-productive, as all offsets can be calculated at
+        // compile time.
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
+ __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
+ __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
+ __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
+ __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
+ __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
+ __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
+ __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
+ step1[0] = ADD_EPI16(in00, in31);
+ step1[1] = ADD_EPI16(in01, in30);
+ step1[2] = ADD_EPI16(in02, in29);
+ step1[3] = ADD_EPI16(in03, in28);
+ step1[28] = SUB_EPI16(in03, in28);
+ step1[29] = SUB_EPI16(in02, in29);
+ step1[30] = SUB_EPI16(in01, in30);
+ step1[31] = SUB_EPI16(in00, in31);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
+ &step1[3], &step1[28], &step1[29],
+ &step1[30], &step1[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
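+        // In the second pass the input is the transposed intermediate with a
+        // fixed pitch of 32 int16s and no << 2 scaling; in the
+        // DCT_HIGH_BIT_DEPTH build any saturation detected above falls back
+        // to the C row transform.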
+ {
+ __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
+ __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
+ __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
+ __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
+ __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
+ __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
+ __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
+ __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
+ step1[4] = ADD_EPI16(in04, in27);
+ step1[5] = ADD_EPI16(in05, in26);
+ step1[6] = ADD_EPI16(in06, in25);
+ step1[7] = ADD_EPI16(in07, in24);
+ step1[24] = SUB_EPI16(in07, in24);
+ step1[25] = SUB_EPI16(in06, in25);
+ step1[26] = SUB_EPI16(in05, in26);
+ step1[27] = SUB_EPI16(in04, in27);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
+ &step1[7], &step1[24], &step1[25],
+ &step1[26], &step1[27]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
+ __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
+ __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
+ __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
+ __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
+ __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
+ __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
+ __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
+ step1[8] = ADD_EPI16(in08, in23);
+ step1[9] = ADD_EPI16(in09, in22);
+ step1[10] = ADD_EPI16(in10, in21);
+ step1[11] = ADD_EPI16(in11, in20);
+ step1[20] = SUB_EPI16(in11, in20);
+ step1[21] = SUB_EPI16(in10, in21);
+ step1[22] = SUB_EPI16(in09, in22);
+ step1[23] = SUB_EPI16(in08, in23);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
+ &step1[11], &step1[20], &step1[21],
+ &step1[22], &step1[23]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
+ __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
+ __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
+ __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
+ __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
+ __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
+ __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
+ __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
+ step1[12] = ADD_EPI16(in12, in19);
+ step1[13] = ADD_EPI16(in13, in18);
+ step1[14] = ADD_EPI16(in14, in17);
+ step1[15] = ADD_EPI16(in15, in16);
+ step1[16] = SUB_EPI16(in15, in16);
+ step1[17] = SUB_EPI16(in14, in17);
+ step1[18] = SUB_EPI16(in13, in18);
+ step1[19] = SUB_EPI16(in12, in19);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
+ &step1[15], &step1[16], &step1[17],
+ &step1[18], &step1[19]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Stage 2
+ {
+ step2[0] = ADD_EPI16(step1[0], step1[15]);
+ step2[1] = ADD_EPI16(step1[1], step1[14]);
+ step2[2] = ADD_EPI16(step1[2], step1[13]);
+ step2[3] = ADD_EPI16(step1[3], step1[12]);
+ step2[4] = ADD_EPI16(step1[4], step1[11]);
+ step2[5] = ADD_EPI16(step1[5], step1[10]);
+ step2[6] = ADD_EPI16(step1[6], step1[9]);
+ step2[7] = ADD_EPI16(step1[7], step1[8]);
+ step2[8] = SUB_EPI16(step1[7], step1[8]);
+ step2[9] = SUB_EPI16(step1[6], step1[9]);
+ step2[10] = SUB_EPI16(step1[5], step1[10]);
+ step2[11] = SUB_EPI16(step1[4], step1[11]);
+ step2[12] = SUB_EPI16(step1[3], step1[12]);
+ step2[13] = SUB_EPI16(step1[2], step1[13]);
+ step2[14] = SUB_EPI16(step1[1], step1[14]);
+ step2[15] = SUB_EPI16(step1[0], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
+ &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
+ const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
+ const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
+ const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
+ const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
+ const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
+ const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
+ const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
+ const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
+ &step2[23], &step2[24], &step2[25],
+ &step2[26], &step2[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
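+      // The madd/add/srai/packs pattern above is the vector equivalent of
+      // the scalar dct_const_round_shift(), roughly:
+      //   t = u * c1 + v * c2;                               // 32-bit
+      //   out = (int16_t)((t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
+      // with _mm_packs_epi32 providing the final saturating narrow to 16
+      // bits.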
+
+#if !FDCT32x32_HIGH_PRECISION
+    // Scale the magnitude down so that the intermediate values stay within
+    // the range of 16 bits.
+ if (1 == pass) {
+ __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
+ __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
+ __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
+ __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
+ __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
+ __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
+ __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
+ __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
+ __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
+ __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
+ __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
+ __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
+ __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
+ __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
+ __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
+ __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
+ __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
+ __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
+ __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
+ __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
+ __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
+ __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
+ __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
+ __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
+ __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
+ __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
+ __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
+ __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
+ __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
+ __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
+ __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
+ __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
+
+ step2[0] = SUB_EPI16(step2[0], s3_00_0);
+ step2[1] = SUB_EPI16(step2[1], s3_01_0);
+ step2[2] = SUB_EPI16(step2[2], s3_02_0);
+ step2[3] = SUB_EPI16(step2[3], s3_03_0);
+ step2[4] = SUB_EPI16(step2[4], s3_04_0);
+ step2[5] = SUB_EPI16(step2[5], s3_05_0);
+ step2[6] = SUB_EPI16(step2[6], s3_06_0);
+ step2[7] = SUB_EPI16(step2[7], s3_07_0);
+ step2[8] = SUB_EPI16(step2[8], s2_08_0);
+ step2[9] = SUB_EPI16(step2[9], s2_09_0);
+ step2[10] = SUB_EPI16(step2[10], s3_10_0);
+ step2[11] = SUB_EPI16(step2[11], s3_11_0);
+ step2[12] = SUB_EPI16(step2[12], s3_12_0);
+ step2[13] = SUB_EPI16(step2[13], s3_13_0);
+ step2[14] = SUB_EPI16(step2[14], s2_14_0);
+ step2[15] = SUB_EPI16(step2[15], s2_15_0);
+ step1[16] = SUB_EPI16(step1[16], s3_16_0);
+ step1[17] = SUB_EPI16(step1[17], s3_17_0);
+ step1[18] = SUB_EPI16(step1[18], s3_18_0);
+ step1[19] = SUB_EPI16(step1[19], s3_19_0);
+ step2[20] = SUB_EPI16(step2[20], s3_20_0);
+ step2[21] = SUB_EPI16(step2[21], s3_21_0);
+ step2[22] = SUB_EPI16(step2[22], s3_22_0);
+ step2[23] = SUB_EPI16(step2[23], s3_23_0);
+ step2[24] = SUB_EPI16(step2[24], s3_24_0);
+ step2[25] = SUB_EPI16(step2[25], s3_25_0);
+ step2[26] = SUB_EPI16(step2[26], s3_26_0);
+ step2[27] = SUB_EPI16(step2[27], s3_27_0);
+ step1[28] = SUB_EPI16(step1[28], s3_28_0);
+ step1[29] = SUB_EPI16(step1[29], s3_29_0);
+ step1[30] = SUB_EPI16(step1[30], s3_30_0);
+ step1[31] = SUB_EPI16(step1[31], s3_31_0);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x32(
+ &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
+ &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
+ &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
+ &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
+ &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ step2[0] = _mm_add_epi16(step2[0], kOne);
+ step2[1] = _mm_add_epi16(step2[1], kOne);
+ step2[2] = _mm_add_epi16(step2[2], kOne);
+ step2[3] = _mm_add_epi16(step2[3], kOne);
+ step2[4] = _mm_add_epi16(step2[4], kOne);
+ step2[5] = _mm_add_epi16(step2[5], kOne);
+ step2[6] = _mm_add_epi16(step2[6], kOne);
+ step2[7] = _mm_add_epi16(step2[7], kOne);
+ step2[8] = _mm_add_epi16(step2[8], kOne);
+ step2[9] = _mm_add_epi16(step2[9], kOne);
+ step2[10] = _mm_add_epi16(step2[10], kOne);
+ step2[11] = _mm_add_epi16(step2[11], kOne);
+ step2[12] = _mm_add_epi16(step2[12], kOne);
+ step2[13] = _mm_add_epi16(step2[13], kOne);
+ step2[14] = _mm_add_epi16(step2[14], kOne);
+ step2[15] = _mm_add_epi16(step2[15], kOne);
+ step1[16] = _mm_add_epi16(step1[16], kOne);
+ step1[17] = _mm_add_epi16(step1[17], kOne);
+ step1[18] = _mm_add_epi16(step1[18], kOne);
+ step1[19] = _mm_add_epi16(step1[19], kOne);
+ step2[20] = _mm_add_epi16(step2[20], kOne);
+ step2[21] = _mm_add_epi16(step2[21], kOne);
+ step2[22] = _mm_add_epi16(step2[22], kOne);
+ step2[23] = _mm_add_epi16(step2[23], kOne);
+ step2[24] = _mm_add_epi16(step2[24], kOne);
+ step2[25] = _mm_add_epi16(step2[25], kOne);
+ step2[26] = _mm_add_epi16(step2[26], kOne);
+ step2[27] = _mm_add_epi16(step2[27], kOne);
+ step1[28] = _mm_add_epi16(step1[28], kOne);
+ step1[29] = _mm_add_epi16(step1[29], kOne);
+ step1[30] = _mm_add_epi16(step1[30], kOne);
+ step1[31] = _mm_add_epi16(step1[31], kOne);
+
+ step2[0] = _mm_srai_epi16(step2[0], 2);
+ step2[1] = _mm_srai_epi16(step2[1], 2);
+ step2[2] = _mm_srai_epi16(step2[2], 2);
+ step2[3] = _mm_srai_epi16(step2[3], 2);
+ step2[4] = _mm_srai_epi16(step2[4], 2);
+ step2[5] = _mm_srai_epi16(step2[5], 2);
+ step2[6] = _mm_srai_epi16(step2[6], 2);
+ step2[7] = _mm_srai_epi16(step2[7], 2);
+ step2[8] = _mm_srai_epi16(step2[8], 2);
+ step2[9] = _mm_srai_epi16(step2[9], 2);
+ step2[10] = _mm_srai_epi16(step2[10], 2);
+ step2[11] = _mm_srai_epi16(step2[11], 2);
+ step2[12] = _mm_srai_epi16(step2[12], 2);
+ step2[13] = _mm_srai_epi16(step2[13], 2);
+ step2[14] = _mm_srai_epi16(step2[14], 2);
+ step2[15] = _mm_srai_epi16(step2[15], 2);
+ step1[16] = _mm_srai_epi16(step1[16], 2);
+ step1[17] = _mm_srai_epi16(step1[17], 2);
+ step1[18] = _mm_srai_epi16(step1[18], 2);
+ step1[19] = _mm_srai_epi16(step1[19], 2);
+ step2[20] = _mm_srai_epi16(step2[20], 2);
+ step2[21] = _mm_srai_epi16(step2[21], 2);
+ step2[22] = _mm_srai_epi16(step2[22], 2);
+ step2[23] = _mm_srai_epi16(step2[23], 2);
+ step2[24] = _mm_srai_epi16(step2[24], 2);
+ step2[25] = _mm_srai_epi16(step2[25], 2);
+ step2[26] = _mm_srai_epi16(step2[26], 2);
+ step2[27] = _mm_srai_epi16(step2[27], 2);
+ step1[28] = _mm_srai_epi16(step1[28], 2);
+ step1[29] = _mm_srai_epi16(step1[29], 2);
+ step1[30] = _mm_srai_epi16(step1[30], 2);
+ step1[31] = _mm_srai_epi16(step1[31], 2);
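+      // The cmplt/sub/add/srai sequence above computes
+      // (x + 1 + (x < 0)) >> 2 per lane: the compare mask is -1 for negative
+      // lanes, so SUB_EPI16 adds the extra 1 before the +1 and the
+      // arithmetic >> 2.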
+ }
+#endif // !FDCT32x32_HIGH_PRECISION
+
+#if FDCT32x32_HIGH_PRECISION
+ if (pass == 0) {
+#endif
+ // Stage 3
+ {
+ step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
+ step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
+ step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
+ step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
+ step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
+ step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
+ step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
+ step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
+ &step3[3], &step3[4], &step3[5],
+ &step3[6], &step3[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
+ &step3[13]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step3[16] = ADD_EPI16(step2[23], step1[16]);
+ step3[17] = ADD_EPI16(step2[22], step1[17]);
+ step3[18] = ADD_EPI16(step2[21], step1[18]);
+ step3[19] = ADD_EPI16(step2[20], step1[19]);
+ step3[20] = SUB_EPI16(step1[19], step2[20]);
+ step3[21] = SUB_EPI16(step1[18], step2[21]);
+ step3[22] = SUB_EPI16(step1[17], step2[22]);
+ step3[23] = SUB_EPI16(step1[16], step2[23]);
+ step3[24] = SUB_EPI16(step1[31], step2[24]);
+ step3[25] = SUB_EPI16(step1[30], step2[25]);
+ step3[26] = SUB_EPI16(step1[29], step2[26]);
+ step3[27] = SUB_EPI16(step1[28], step2[27]);
+ step3[28] = ADD_EPI16(step2[27], step1[28]);
+ step3[29] = ADD_EPI16(step2[26], step1[29]);
+ step3[30] = ADD_EPI16(step2[25], step1[30]);
+ step3[31] = ADD_EPI16(step2[24], step1[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
+ &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
+ &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
+ &step3[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
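+      // Stage 3 recombines the rotated middle terms step2[20..27] with the
+      // untouched ends of the high half, step1[16..19] and step1[28..31].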
+
+ // Stage 4
+ {
+ step1[0] = ADD_EPI16(step3[3], step3[0]);
+ step1[1] = ADD_EPI16(step3[2], step3[1]);
+ step1[2] = SUB_EPI16(step3[1], step3[2]);
+ step1[3] = SUB_EPI16(step3[0], step3[3]);
+ step1[8] = ADD_EPI16(step3[11], step2[8]);
+ step1[9] = ADD_EPI16(step3[10], step2[9]);
+ step1[10] = SUB_EPI16(step2[9], step3[10]);
+ step1[11] = SUB_EPI16(step2[8], step3[11]);
+ step1[12] = SUB_EPI16(step2[15], step3[12]);
+ step1[13] = SUB_EPI16(step2[14], step3[13]);
+ step1[14] = ADD_EPI16(step3[13], step2[14]);
+ step1[15] = ADD_EPI16(step3[12], step2[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
+ &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
+ &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
+ const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
+ const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
+ const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
+ const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
+ const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
+ const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
+ const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
+ const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
+ const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
+ const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
+ &step1[21], &step1[26], &step1[27],
+ &step1[28], &step1[29]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Stage 5
+ {
+ step2[4] = ADD_EPI16(step1[5], step3[4]);
+ step2[5] = SUB_EPI16(step3[4], step1[5]);
+ step2[6] = SUB_EPI16(step3[7], step1[6]);
+ step2[7] = ADD_EPI16(step1[6], step3[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
+ &step2[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
+ const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
+ const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
+ const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
+ const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i out_00_4 =
+ _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_5 =
+ _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_4 =
+ _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_5 =
+ _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_4 =
+ _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_5 =
+ _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_4 =
+ _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_5 =
+ _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[0] = _mm_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm_packs_epi32(out_16_6, out_16_7);
+ out[8] = _mm_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
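+      // First four of the 32 outputs for these columns: out[0]/out[16] come
+      // from the +/-cospi_16_64 rotation of step1[0..1], out[8]/out[24] from
+      // the cospi_24_64/cospi_8_64 rotation of step1[2..3].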
+ {
+ const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
+ const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
+ const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
+ const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
+ &step2[14]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step2[16] = ADD_EPI16(step1[19], step3[16]);
+ step2[17] = ADD_EPI16(step1[18], step3[17]);
+ step2[18] = SUB_EPI16(step3[17], step1[18]);
+ step2[19] = SUB_EPI16(step3[16], step1[19]);
+ step2[20] = SUB_EPI16(step3[23], step1[20]);
+ step2[21] = SUB_EPI16(step3[22], step1[21]);
+ step2[22] = ADD_EPI16(step1[21], step3[22]);
+ step2[23] = ADD_EPI16(step1[20], step3[23]);
+ step2[24] = ADD_EPI16(step1[27], step3[24]);
+ step2[25] = ADD_EPI16(step1[26], step3[25]);
+ step2[26] = SUB_EPI16(step3[25], step1[26]);
+ step2[27] = SUB_EPI16(step3[24], step1[27]);
+ step2[28] = SUB_EPI16(step3[31], step1[28]);
+ step2[29] = SUB_EPI16(step3[30], step1[29]);
+ step2[30] = ADD_EPI16(step1[29], step3[30]);
+ step2[31] = ADD_EPI16(step1[28], step3[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
+ &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
+ &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
+ &step2[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Stage 6
+ {
+ const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m128i out_04_4 =
+ _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_5 =
+ _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_4 =
+ _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_5 =
+ _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_4 =
+ _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_5 =
+ _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_4 =
+ _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_5 =
+ _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[4] = _mm_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
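+      // out[4], out[12], out[20] and out[28] depend only on step2[4..7],
+      // using the cospi_28/cospi_4 and cospi_12/cospi_20 pairs.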
+ {
+ step3[8] = ADD_EPI16(step2[9], step1[8]);
+ step3[9] = SUB_EPI16(step1[8], step2[9]);
+ step3[10] = SUB_EPI16(step1[11], step2[10]);
+ step3[11] = ADD_EPI16(step2[10], step1[11]);
+ step3[12] = ADD_EPI16(step2[13], step1[12]);
+ step3[13] = SUB_EPI16(step1[12], step2[13]);
+ step3[14] = SUB_EPI16(step1[15], step2[14]);
+ step3[15] = ADD_EPI16(step2[14], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
+ &step3[11], &step3[12], &step3[13],
+ &step3[14], &step3[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
+ const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
+ const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
+ const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
+ const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
+ const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
+ const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
+ const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
+ const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
+ &step3[22], &step3[25], &step3[26],
+ &step3[29], &step3[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
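+      // Stage 6 also rotates the remaining high-half pairs (17/30, 18/29,
+      // 21/26, 22/25) by the cospi_4/cospi_28 and cospi_12/cospi_20
+      // constants, ready for the final output rotations in stage 7 below.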
+ // Stage 7
+ {
+ const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
+ const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
+ const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
+ const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
+ const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
+ const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
+ const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
+ const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
+ const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
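+        // Adding k__DCT_CONST_ROUNDING and shifting right by DCT_CONST_BITS
+        // is the vector form of dct_const_round_shift(): it rounds each
+        // 32-bit product back down to the coefficient scale before the
+        // results are packed to 16 bits.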
+ const __m128i out_02_4 =
+ _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_5 =
+ _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_4 =
+ _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_5 =
+ _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_4 =
+ _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_5 =
+ _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_4 =
+ _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_5 =
+ _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_4 =
+ _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_5 =
+ _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_4 =
+ _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_5 =
+ _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_4 =
+ _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_5 =
+ _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_4 =
+ _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_5 =
+ _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[2] = _mm_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm_packs_epi32(out_26_6, out_26_7);
+ out[6] = _mm_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
+ &out[6], &out[22], &out[14], &out[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ step1[16] = ADD_EPI16(step3[17], step2[16]);
+ step1[17] = SUB_EPI16(step2[16], step3[17]);
+ step1[18] = SUB_EPI16(step2[19], step3[18]);
+ step1[19] = ADD_EPI16(step3[18], step2[19]);
+ step1[20] = ADD_EPI16(step3[21], step2[20]);
+ step1[21] = SUB_EPI16(step2[20], step3[21]);
+ step1[22] = SUB_EPI16(step2[23], step3[22]);
+ step1[23] = ADD_EPI16(step3[22], step2[23]);
+ step1[24] = ADD_EPI16(step3[25], step2[24]);
+ step1[25] = SUB_EPI16(step2[24], step3[25]);
+ step1[26] = SUB_EPI16(step2[27], step3[26]);
+ step1[27] = ADD_EPI16(step3[26], step2[27]);
+ step1[28] = ADD_EPI16(step3[29], step2[28]);
+ step1[29] = SUB_EPI16(step2[28], step3[29]);
+ step1[30] = SUB_EPI16(step2[31], step3[30]);
+ step1[31] = ADD_EPI16(step3[30], step2[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
+ &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
+ &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
+ &step1[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+      // Final stage --- output indices are bit-reversed.
+ {
+ const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
+ const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
+ const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
+ const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
+ const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
+ const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
+ const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
+ const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
+ const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m128i out_01_4 =
+ _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_5 =
+ _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_4 =
+ _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_5 =
+ _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_4 =
+ _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_5 =
+ _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_4 =
+ _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_5 =
+ _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_4 =
+ _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_5 =
+ _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_4 =
+ _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_5 =
+ _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_4 =
+ _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_5 =
+ _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_4 =
+ _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_5 =
+ _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[1] = _mm_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm_packs_epi32(out_17_6, out_17_7);
+ out[9] = _mm_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm_packs_epi32(out_25_6, out_25_7);
+ out[7] = _mm_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
+ &out[7], &out[23], &out[15], &out[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
+ const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
+ const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
+ const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
+ const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
+ const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
+ const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
+ const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
+ const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m128i out_05_4 =
+ _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_5 =
+ _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_4 =
+ _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_5 =
+ _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_4 =
+ _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_5 =
+ _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_4 =
+ _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_5 =
+ _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_4 =
+ _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_5 =
+ _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_4 =
+ _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_5 =
+ _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_4 =
+ _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_5 =
+ _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_4 =
+ _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_5 =
+ _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[5] = _mm_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm_packs_epi32(out_29_6, out_29_7);
+ out[3] = _mm_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
+ &out[3], &out[19], &out[11], &out[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+#if FDCT32x32_HIGH_PRECISION
+ } else {
+ __m128i lstep1[64], lstep2[64], lstep3[64];
+ __m128i u[32], v[32], sign[16];
+ const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
+ const __m128i k__pOne_mOne = pair_set_epi16(1, -1);
+ // start using 32-bit operations
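+      // (each 8-lane 16-bit register is now carried as two 4-lane 32-bit
+      // registers, which is why lstep1/lstep2/lstep3 hold 64 entries for
+      // the 32 coefficients)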
+ // stage 3
+ {
+        // expand to 32-bit precision while adding and subtracting
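+        // _mm_madd_epi16() on the interleaved (a, b) pairs gives a widening
+        // add or subtract in one instruction: kOne (every lane 1) produces
+        // the 32-bit sums a + b, and k__pOne_mOne = (1, -1) produces the
+        // 32-bit differences a - b.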
+ lstep2[0] = _mm_unpacklo_epi16(step2[0], step2[7]);
+ lstep2[1] = _mm_unpackhi_epi16(step2[0], step2[7]);
+ lstep2[2] = _mm_unpacklo_epi16(step2[1], step2[6]);
+ lstep2[3] = _mm_unpackhi_epi16(step2[1], step2[6]);
+ lstep2[4] = _mm_unpacklo_epi16(step2[2], step2[5]);
+ lstep2[5] = _mm_unpackhi_epi16(step2[2], step2[5]);
+ lstep2[6] = _mm_unpacklo_epi16(step2[3], step2[4]);
+ lstep2[7] = _mm_unpackhi_epi16(step2[3], step2[4]);
+
+ lstep3[0] = _mm_madd_epi16(lstep2[0], kOne);
+ lstep3[1] = _mm_madd_epi16(lstep2[1], kOne);
+ lstep3[2] = _mm_madd_epi16(lstep2[2], kOne);
+ lstep3[3] = _mm_madd_epi16(lstep2[3], kOne);
+ lstep3[4] = _mm_madd_epi16(lstep2[4], kOne);
+ lstep3[5] = _mm_madd_epi16(lstep2[5], kOne);
+ lstep3[6] = _mm_madd_epi16(lstep2[6], kOne);
+ lstep3[7] = _mm_madd_epi16(lstep2[7], kOne);
+
+ lstep3[8] = _mm_madd_epi16(lstep2[6], k__pOne_mOne);
+ lstep3[9] = _mm_madd_epi16(lstep2[7], k__pOne_mOne);
+ lstep3[10] = _mm_madd_epi16(lstep2[4], k__pOne_mOne);
+ lstep3[11] = _mm_madd_epi16(lstep2[5], k__pOne_mOne);
+ lstep3[12] = _mm_madd_epi16(lstep2[2], k__pOne_mOne);
+ lstep3[13] = _mm_madd_epi16(lstep2[3], k__pOne_mOne);
+ lstep3[14] = _mm_madd_epi16(lstep2[0], k__pOne_mOne);
+ lstep3[15] = _mm_madd_epi16(lstep2[1], k__pOne_mOne);
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ }
+ {
+ lstep1[32] = _mm_unpacklo_epi16(step1[16], step2[23]);
+ lstep1[33] = _mm_unpackhi_epi16(step1[16], step2[23]);
+ lstep1[34] = _mm_unpacklo_epi16(step1[17], step2[22]);
+ lstep1[35] = _mm_unpackhi_epi16(step1[17], step2[22]);
+ lstep1[36] = _mm_unpacklo_epi16(step1[18], step2[21]);
+ lstep1[37] = _mm_unpackhi_epi16(step1[18], step2[21]);
+ lstep1[38] = _mm_unpacklo_epi16(step1[19], step2[20]);
+ lstep1[39] = _mm_unpackhi_epi16(step1[19], step2[20]);
+
+ lstep1[56] = _mm_unpacklo_epi16(step1[28], step2[27]);
+ lstep1[57] = _mm_unpackhi_epi16(step1[28], step2[27]);
+ lstep1[58] = _mm_unpacklo_epi16(step1[29], step2[26]);
+ lstep1[59] = _mm_unpackhi_epi16(step1[29], step2[26]);
+ lstep1[60] = _mm_unpacklo_epi16(step1[30], step2[25]);
+ lstep1[61] = _mm_unpackhi_epi16(step1[30], step2[25]);
+ lstep1[62] = _mm_unpacklo_epi16(step1[31], step2[24]);
+ lstep1[63] = _mm_unpackhi_epi16(step1[31], step2[24]);
+
+ lstep3[32] = _mm_madd_epi16(lstep1[32], kOne);
+ lstep3[33] = _mm_madd_epi16(lstep1[33], kOne);
+ lstep3[34] = _mm_madd_epi16(lstep1[34], kOne);
+ lstep3[35] = _mm_madd_epi16(lstep1[35], kOne);
+ lstep3[36] = _mm_madd_epi16(lstep1[36], kOne);
+ lstep3[37] = _mm_madd_epi16(lstep1[37], kOne);
+ lstep3[38] = _mm_madd_epi16(lstep1[38], kOne);
+ lstep3[39] = _mm_madd_epi16(lstep1[39], kOne);
+
+ lstep3[40] = _mm_madd_epi16(lstep1[38], k__pOne_mOne);
+ lstep3[41] = _mm_madd_epi16(lstep1[39], k__pOne_mOne);
+ lstep3[42] = _mm_madd_epi16(lstep1[36], k__pOne_mOne);
+ lstep3[43] = _mm_madd_epi16(lstep1[37], k__pOne_mOne);
+ lstep3[44] = _mm_madd_epi16(lstep1[34], k__pOne_mOne);
+ lstep3[45] = _mm_madd_epi16(lstep1[35], k__pOne_mOne);
+ lstep3[46] = _mm_madd_epi16(lstep1[32], k__pOne_mOne);
+ lstep3[47] = _mm_madd_epi16(lstep1[33], k__pOne_mOne);
+
+ lstep3[48] = _mm_madd_epi16(lstep1[62], k__pOne_mOne);
+ lstep3[49] = _mm_madd_epi16(lstep1[63], k__pOne_mOne);
+ lstep3[50] = _mm_madd_epi16(lstep1[60], k__pOne_mOne);
+ lstep3[51] = _mm_madd_epi16(lstep1[61], k__pOne_mOne);
+ lstep3[52] = _mm_madd_epi16(lstep1[58], k__pOne_mOne);
+ lstep3[53] = _mm_madd_epi16(lstep1[59], k__pOne_mOne);
+ lstep3[54] = _mm_madd_epi16(lstep1[56], k__pOne_mOne);
+ lstep3[55] = _mm_madd_epi16(lstep1[57], k__pOne_mOne);
+
+ lstep3[56] = _mm_madd_epi16(lstep1[56], kOne);
+ lstep3[57] = _mm_madd_epi16(lstep1[57], kOne);
+ lstep3[58] = _mm_madd_epi16(lstep1[58], kOne);
+ lstep3[59] = _mm_madd_epi16(lstep1[59], kOne);
+ lstep3[60] = _mm_madd_epi16(lstep1[60], kOne);
+ lstep3[61] = _mm_madd_epi16(lstep1[61], kOne);
+ lstep3[62] = _mm_madd_epi16(lstep1[62], kOne);
+ lstep3[63] = _mm_madd_epi16(lstep1[63], kOne);
+ }
+
+ // stage 4
+ {
+        // expand to 32-bit precision before the additions
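+        // _mm_cmpgt_epi16(kZero, x) is all ones exactly in the negative
+        // lanes, so interleaving x with that mask sign-extends each 16-bit
+        // value to 32 bits.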
+ sign[0] = _mm_cmpgt_epi16(kZero, step2[8]);
+ sign[1] = _mm_cmpgt_epi16(kZero, step2[9]);
+ sign[2] = _mm_cmpgt_epi16(kZero, step2[14]);
+ sign[3] = _mm_cmpgt_epi16(kZero, step2[15]);
+ lstep2[16] = _mm_unpacklo_epi16(step2[8], sign[0]);
+ lstep2[17] = _mm_unpackhi_epi16(step2[8], sign[0]);
+ lstep2[18] = _mm_unpacklo_epi16(step2[9], sign[1]);
+ lstep2[19] = _mm_unpackhi_epi16(step2[9], sign[1]);
+ lstep2[28] = _mm_unpacklo_epi16(step2[14], sign[2]);
+ lstep2[29] = _mm_unpackhi_epi16(step2[14], sign[2]);
+ lstep2[30] = _mm_unpacklo_epi16(step2[15], sign[3]);
+ lstep2[31] = _mm_unpackhi_epi16(step2[15], sign[3]);
+
+ lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
+ lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
+ lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
+ lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
+ lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
+ lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
+ lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
+ lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
+ lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
+ lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
+ lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
+ lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
+ lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
+ lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
+ lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
+ lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
+ lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
+ lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
+ lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
+ lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
+ lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
+ lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
+ lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
+ lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
+ }
+ {
+        // stage 4 continued: rotate lstep3[10..13] by cospi_16_64 using
+        // 32-bit operands.
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+ // TODO(jingning): manually inline k_madd_epi32_ to further hide
+ // instruction latency.
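+        // k_madd_epi32() is the 32-bit counterpart of _mm_madd_epi16(): it
+        // multiplies the interleaved 32-bit lanes by the paired cosine
+        // constants and accumulates each pair into a 64-bit sum, which
+        // k_packs_epi64() then narrows back to 32-bit lanes.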
+ v[0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[7] = k_madd_epi32(u[3], k32_p16_p16);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
+ &v[5], &v[6], &v[7], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ }
+ {
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
+ u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
+ u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
+ u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
+ u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
+ u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+ v[0] = k_madd_epi32(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32(u[4], k32_m08_p24);
+ v[5] = k_madd_epi32(u[5], k32_m08_p24);
+ v[6] = k_madd_epi32(u[6], k32_m08_p24);
+ v[7] = k_madd_epi32(u[7], k32_m08_p24);
+ v[8] = k_madd_epi32(u[8], k32_m24_m08);
+ v[9] = k_madd_epi32(u[9], k32_m24_m08);
+ v[10] = k_madd_epi32(u[10], k32_m24_m08);
+ v[11] = k_madd_epi32(u[11], k32_m24_m08);
+ v[12] = k_madd_epi32(u[12], k32_m24_m08);
+ v[13] = k_madd_epi32(u[13], k32_m24_m08);
+ v[14] = k_madd_epi32(u[14], k32_m24_m08);
+ v[15] = k_madd_epi32(u[15], k32_m24_m08);
+ v[16] = k_madd_epi32(u[12], k32_m08_p24);
+ v[17] = k_madd_epi32(u[13], k32_m08_p24);
+ v[18] = k_madd_epi32(u[14], k32_m08_p24);
+ v[19] = k_madd_epi32(u[15], k32_m08_p24);
+ v[20] = k_madd_epi32(u[8], k32_m08_p24);
+ v[21] = k_madd_epi32(u[9], k32_m08_p24);
+ v[22] = k_madd_epi32(u[10], k32_m08_p24);
+ v[23] = k_madd_epi32(u[11], k32_m08_p24);
+ v[24] = k_madd_epi32(u[4], k32_p24_p08);
+ v[25] = k_madd_epi32(u[5], k32_p24_p08);
+ v[26] = k_madd_epi32(u[6], k32_p24_p08);
+ v[27] = k_madd_epi32(u[7], k32_p24_p08);
+ v[28] = k_madd_epi32(u[0], k32_p24_p08);
+ v[29] = k_madd_epi32(u[1], k32_p24_p08);
+ v[30] = k_madd_epi32(u[2], k32_p24_p08);
+ v[31] = k_madd_epi32(u[3], k32_p24_p08);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 5
+ {
+ lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
+ lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
+ lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
+ lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
+ lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
+ lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
+ lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
+ lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
+ }
+ {
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
+ u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
+ u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
+ u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
+ u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
+ u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
+ u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
+ u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+ // TODO(jingning): manually inline k_madd_epi32_ to further hide
+ // instruction latency.
+ v[0] = k_madd_epi32(u[0], k32_p16_p16);
+ v[1] = k_madd_epi32(u[1], k32_p16_p16);
+ v[2] = k_madd_epi32(u[2], k32_p16_p16);
+ v[3] = k_madd_epi32(u[3], k32_p16_p16);
+ v[4] = k_madd_epi32(u[0], k32_p16_m16);
+ v[5] = k_madd_epi32(u[1], k32_p16_m16);
+ v[6] = k_madd_epi32(u[2], k32_p16_m16);
+ v[7] = k_madd_epi32(u[3], k32_p16_m16);
+ v[8] = k_madd_epi32(u[4], k32_p24_p08);
+ v[9] = k_madd_epi32(u[5], k32_p24_p08);
+ v[10] = k_madd_epi32(u[6], k32_p24_p08);
+ v[11] = k_madd_epi32(u[7], k32_p24_p08);
+ v[12] = k_madd_epi32(u[4], k32_m08_p24);
+ v[13] = k_madd_epi32(u[5], k32_m08_p24);
+ v[14] = k_madd_epi32(u[6], k32_m08_p24);
+ v[15] = k_madd_epi32(u[7], k32_m08_p24);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
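+        // Final descale: subtracting the cmplt mask adds 1 to the negative
+        // lanes, so the sequence below computes (u + 1 + (u < 0)) >> 2
+        // before the results are packed back to 16 bits.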
+ sign[0] = _mm_cmplt_epi32(u[0], kZero);
+ sign[1] = _mm_cmplt_epi32(u[1], kZero);
+ sign[2] = _mm_cmplt_epi32(u[2], kZero);
+ sign[3] = _mm_cmplt_epi32(u[3], kZero);
+ sign[4] = _mm_cmplt_epi32(u[4], kZero);
+ sign[5] = _mm_cmplt_epi32(u[5], kZero);
+ sign[6] = _mm_cmplt_epi32(u[6], kZero);
+ sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], sign[0]);
+ u[1] = _mm_sub_epi32(u[1], sign[1]);
+ u[2] = _mm_sub_epi32(u[2], sign[2]);
+ u[3] = _mm_sub_epi32(u[3], sign[3]);
+ u[4] = _mm_sub_epi32(u[4], sign[4]);
+ u[5] = _mm_sub_epi32(u[5], sign[5]);
+ u[6] = _mm_sub_epi32(u[6], sign[6]);
+ u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm_add_epi32(u[0], K32One);
+ u[1] = _mm_add_epi32(u[1], K32One);
+ u[2] = _mm_add_epi32(u[2], K32One);
+ u[3] = _mm_add_epi32(u[3], K32One);
+ u[4] = _mm_add_epi32(u[4], K32One);
+ u[5] = _mm_add_epi32(u[5], K32One);
+ u[6] = _mm_add_epi32(u[6], K32One);
+ u[7] = _mm_add_epi32(u[7], K32One);
+
+ u[0] = _mm_srai_epi32(u[0], 2);
+ u[1] = _mm_srai_epi32(u[1], 2);
+ u[2] = _mm_srai_epi32(u[2], 2);
+ u[3] = _mm_srai_epi32(u[3], 2);
+ u[4] = _mm_srai_epi32(u[4], 2);
+ u[5] = _mm_srai_epi32(u[5], 2);
+ u[6] = _mm_srai_epi32(u[6], 2);
+ u[7] = _mm_srai_epi32(u[7], 2);
+
+ // Combine
+ out[0] = _mm_packs_epi32(u[0], u[1]);
+ out[16] = _mm_packs_epi32(u[2], u[3]);
+ out[8] = _mm_packs_epi32(u[4], u[5]);
+ out[24] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
+ u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
+ u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
+ u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
+ u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
+ u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
+ u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
+ u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+ v[0] = k_madd_epi32(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32(u[4], k32_m24_m08);
+ v[5] = k_madd_epi32(u[5], k32_m24_m08);
+ v[6] = k_madd_epi32(u[6], k32_m24_m08);
+ v[7] = k_madd_epi32(u[7], k32_m24_m08);
+ v[8] = k_madd_epi32(u[4], k32_m08_p24);
+ v[9] = k_madd_epi32(u[5], k32_m08_p24);
+ v[10] = k_madd_epi32(u[6], k32_m08_p24);
+ v[11] = k_madd_epi32(u[7], k32_m08_p24);
+ v[12] = k_madd_epi32(u[0], k32_p24_p08);
+ v[13] = k_madd_epi32(u[1], k32_p24_p08);
+ v[14] = k_madd_epi32(u[2], k32_p24_p08);
+ v[15] = k_madd_epi32(u[3], k32_p24_p08);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ }
+ {
+ lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
+ lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
+ lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
+ lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
+ lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
+ lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
+ lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
+ lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
+ lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
+ lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
+ lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
+ lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
+ lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
+ lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
+ lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
+ lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
+ lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
+ lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
+ lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
+ lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
+ lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
+ lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
+ lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
+ lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
+ lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
+ lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
+ lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
+ lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
+ lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
+ lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
+ lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
+ lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
+ }
+ // stage 6
+ {
+ const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+ const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+ const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
+ u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
+ u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
+ u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
+ u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
+
+ v[0] = k_madd_epi32(u[0], k32_p28_p04);
+ v[1] = k_madd_epi32(u[1], k32_p28_p04);
+ v[2] = k_madd_epi32(u[2], k32_p28_p04);
+ v[3] = k_madd_epi32(u[3], k32_p28_p04);
+ v[4] = k_madd_epi32(u[4], k32_p12_p20);
+ v[5] = k_madd_epi32(u[5], k32_p12_p20);
+ v[6] = k_madd_epi32(u[6], k32_p12_p20);
+ v[7] = k_madd_epi32(u[7], k32_p12_p20);
+ v[8] = k_madd_epi32(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32(u[9], k32_m20_p12);
+ v[10] = k_madd_epi32(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32(u[12], k32_m04_p28);
+ v[13] = k_madd_epi32(u[13], k32_m04_p28);
+ v[14] = k_madd_epi32(u[14], k32_m04_p28);
+ v[15] = k_madd_epi32(u[15], k32_m04_p28);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+ sign[0] = _mm_cmplt_epi32(u[0], kZero);
+ sign[1] = _mm_cmplt_epi32(u[1], kZero);
+ sign[2] = _mm_cmplt_epi32(u[2], kZero);
+ sign[3] = _mm_cmplt_epi32(u[3], kZero);
+ sign[4] = _mm_cmplt_epi32(u[4], kZero);
+ sign[5] = _mm_cmplt_epi32(u[5], kZero);
+ sign[6] = _mm_cmplt_epi32(u[6], kZero);
+ sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], sign[0]);
+ u[1] = _mm_sub_epi32(u[1], sign[1]);
+ u[2] = _mm_sub_epi32(u[2], sign[2]);
+ u[3] = _mm_sub_epi32(u[3], sign[3]);
+ u[4] = _mm_sub_epi32(u[4], sign[4]);
+ u[5] = _mm_sub_epi32(u[5], sign[5]);
+ u[6] = _mm_sub_epi32(u[6], sign[6]);
+ u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm_add_epi32(u[0], K32One);
+ u[1] = _mm_add_epi32(u[1], K32One);
+ u[2] = _mm_add_epi32(u[2], K32One);
+ u[3] = _mm_add_epi32(u[3], K32One);
+ u[4] = _mm_add_epi32(u[4], K32One);
+ u[5] = _mm_add_epi32(u[5], K32One);
+ u[6] = _mm_add_epi32(u[6], K32One);
+ u[7] = _mm_add_epi32(u[7], K32One);
+
+ u[0] = _mm_srai_epi32(u[0], 2);
+ u[1] = _mm_srai_epi32(u[1], 2);
+ u[2] = _mm_srai_epi32(u[2], 2);
+ u[3] = _mm_srai_epi32(u[3], 2);
+ u[4] = _mm_srai_epi32(u[4], 2);
+ u[5] = _mm_srai_epi32(u[5], 2);
+ u[6] = _mm_srai_epi32(u[6], 2);
+ u[7] = _mm_srai_epi32(u[7], 2);
+
+ out[4] = _mm_packs_epi32(u[0], u[1]);
+ out[20] = _mm_packs_epi32(u[2], u[3]);
+ out[12] = _mm_packs_epi32(u[4], u[5]);
+ out[28] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
+ lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
+ lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
+ lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
+ lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
+ lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
+ lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
+ lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
+ lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
+ lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
+ lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
+ lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
+ lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
+ lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
+ lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
+ lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
+ }
+ {
+ const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+ const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
+ const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m128i k32_m12_m20 =
+ pair_set_epi32(-cospi_12_64, -cospi_20_64);
+ const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+ const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
+ u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
+ u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
+ u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
+ u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
+ u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
+ u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
+ u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
+ u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
+ u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
+ u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
+ u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
+ u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
+ u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
+ u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
+ u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+ v[0] = k_madd_epi32(u[0], k32_m04_p28);
+ v[1] = k_madd_epi32(u[1], k32_m04_p28);
+ v[2] = k_madd_epi32(u[2], k32_m04_p28);
+ v[3] = k_madd_epi32(u[3], k32_m04_p28);
+ v[4] = k_madd_epi32(u[4], k32_m28_m04);
+ v[5] = k_madd_epi32(u[5], k32_m28_m04);
+ v[6] = k_madd_epi32(u[6], k32_m28_m04);
+ v[7] = k_madd_epi32(u[7], k32_m28_m04);
+ v[8] = k_madd_epi32(u[8], k32_m20_p12);
+ v[9] = k_madd_epi32(u[9], k32_m20_p12);
+ v[10] = k_madd_epi32(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32(u[12], k32_m12_m20);
+ v[13] = k_madd_epi32(u[13], k32_m12_m20);
+ v[14] = k_madd_epi32(u[14], k32_m12_m20);
+ v[15] = k_madd_epi32(u[15], k32_m12_m20);
+ v[16] = k_madd_epi32(u[12], k32_m20_p12);
+ v[17] = k_madd_epi32(u[13], k32_m20_p12);
+ v[18] = k_madd_epi32(u[14], k32_m20_p12);
+ v[19] = k_madd_epi32(u[15], k32_m20_p12);
+ v[20] = k_madd_epi32(u[8], k32_p12_p20);
+ v[21] = k_madd_epi32(u[9], k32_p12_p20);
+ v[22] = k_madd_epi32(u[10], k32_p12_p20);
+ v[23] = k_madd_epi32(u[11], k32_p12_p20);
+ v[24] = k_madd_epi32(u[4], k32_m04_p28);
+ v[25] = k_madd_epi32(u[5], k32_m04_p28);
+ v[26] = k_madd_epi32(u[6], k32_m04_p28);
+ v[27] = k_madd_epi32(u[7], k32_m04_p28);
+ v[28] = k_madd_epi32(u[0], k32_p28_p04);
+ v[29] = k_madd_epi32(u[1], k32_p28_p04);
+ v[30] = k_madd_epi32(u[2], k32_p28_p04);
+ v[31] = k_madd_epi32(u[3], k32_p28_p04);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 7
+ {
+ const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
+ const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
+ const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
+ const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
+ const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
+ const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
+ const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
+ const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
+ u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
+ u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
+ u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
+ u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
+ u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
+ u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
+ u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
+ u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
+ u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
+ u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
+ u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
+ u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
+ u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
+ u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
+ u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+ v[0] = k_madd_epi32(u[0], k32_p30_p02);
+ v[1] = k_madd_epi32(u[1], k32_p30_p02);
+ v[2] = k_madd_epi32(u[2], k32_p30_p02);
+ v[3] = k_madd_epi32(u[3], k32_p30_p02);
+ v[4] = k_madd_epi32(u[4], k32_p14_p18);
+ v[5] = k_madd_epi32(u[5], k32_p14_p18);
+ v[6] = k_madd_epi32(u[6], k32_p14_p18);
+ v[7] = k_madd_epi32(u[7], k32_p14_p18);
+ v[8] = k_madd_epi32(u[8], k32_p22_p10);
+ v[9] = k_madd_epi32(u[9], k32_p22_p10);
+ v[10] = k_madd_epi32(u[10], k32_p22_p10);
+ v[11] = k_madd_epi32(u[11], k32_p22_p10);
+ v[12] = k_madd_epi32(u[12], k32_p06_p26);
+ v[13] = k_madd_epi32(u[13], k32_p06_p26);
+ v[14] = k_madd_epi32(u[14], k32_p06_p26);
+ v[15] = k_madd_epi32(u[15], k32_p06_p26);
+ v[16] = k_madd_epi32(u[12], k32_m26_p06);
+ v[17] = k_madd_epi32(u[13], k32_m26_p06);
+ v[18] = k_madd_epi32(u[14], k32_m26_p06);
+ v[19] = k_madd_epi32(u[15], k32_m26_p06);
+ v[20] = k_madd_epi32(u[8], k32_m10_p22);
+ v[21] = k_madd_epi32(u[9], k32_m10_p22);
+ v[22] = k_madd_epi32(u[10], k32_m10_p22);
+ v[23] = k_madd_epi32(u[11], k32_m10_p22);
+ v[24] = k_madd_epi32(u[4], k32_m18_p14);
+ v[25] = k_madd_epi32(u[5], k32_m18_p14);
+ v[26] = k_madd_epi32(u[6], k32_m18_p14);
+ v[27] = k_madd_epi32(u[7], k32_m18_p14);
+ v[28] = k_madd_epi32(u[0], k32_m02_p30);
+ v[29] = k_madd_epi32(u[1], k32_m02_p30);
+ v[30] = k_madd_epi32(u[2], k32_m02_p30);
+ v[31] = k_madd_epi32(u[3], k32_m02_p30);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm_cmplt_epi32(u[0], kZero);
+ v[1] = _mm_cmplt_epi32(u[1], kZero);
+ v[2] = _mm_cmplt_epi32(u[2], kZero);
+ v[3] = _mm_cmplt_epi32(u[3], kZero);
+ v[4] = _mm_cmplt_epi32(u[4], kZero);
+ v[5] = _mm_cmplt_epi32(u[5], kZero);
+ v[6] = _mm_cmplt_epi32(u[6], kZero);
+ v[7] = _mm_cmplt_epi32(u[7], kZero);
+ v[8] = _mm_cmplt_epi32(u[8], kZero);
+ v[9] = _mm_cmplt_epi32(u[9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], v[0]);
+ u[1] = _mm_sub_epi32(u[1], v[1]);
+ u[2] = _mm_sub_epi32(u[2], v[2]);
+ u[3] = _mm_sub_epi32(u[3], v[3]);
+ u[4] = _mm_sub_epi32(u[4], v[4]);
+ u[5] = _mm_sub_epi32(u[5], v[5]);
+ u[6] = _mm_sub_epi32(u[6], v[6]);
+ u[7] = _mm_sub_epi32(u[7], v[7]);
+ u[8] = _mm_sub_epi32(u[8], v[8]);
+ u[9] = _mm_sub_epi32(u[9], v[9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[2] = _mm_packs_epi32(u[0], u[1]);
+ out[18] = _mm_packs_epi32(u[2], u[3]);
+ out[10] = _mm_packs_epi32(u[4], u[5]);
+ out[26] = _mm_packs_epi32(u[6], u[7]);
+ out[6] = _mm_packs_epi32(u[8], u[9]);
+ out[22] = _mm_packs_epi32(u[10], u[11]);
+ out[14] = _mm_packs_epi32(u[12], u[13]);
+ out[30] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
+ &out[6], &out[22], &out[14], &out[30]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
+ lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
+ lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
+ lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
+ lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
+ lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
+ lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
+ lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
+ lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
+ lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
+ lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
+ lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
+ lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
+ lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
+ lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
+ lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
+ lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
+ lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
+ lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
+ lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
+ lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
+ lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
+ lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
+ lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
+ lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
+ lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
+ lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
+ lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
+ lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
+ lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
+ lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
+ lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
+ }
+ // stage 8
+ {
+ const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
+ const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
+ const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
+ const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
+ const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
+ const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
+ const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
+ const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
+ u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
+ u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
+ u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
+ u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
+ u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
+ u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
+ u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
+ u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
+ u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
+ u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
+ u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
+ u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
+ u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
+ u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
+ u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+ v[0] = k_madd_epi32(u[0], k32_p31_p01);
+ v[1] = k_madd_epi32(u[1], k32_p31_p01);
+ v[2] = k_madd_epi32(u[2], k32_p31_p01);
+ v[3] = k_madd_epi32(u[3], k32_p31_p01);
+ v[4] = k_madd_epi32(u[4], k32_p15_p17);
+ v[5] = k_madd_epi32(u[5], k32_p15_p17);
+ v[6] = k_madd_epi32(u[6], k32_p15_p17);
+ v[7] = k_madd_epi32(u[7], k32_p15_p17);
+ v[8] = k_madd_epi32(u[8], k32_p23_p09);
+ v[9] = k_madd_epi32(u[9], k32_p23_p09);
+ v[10] = k_madd_epi32(u[10], k32_p23_p09);
+ v[11] = k_madd_epi32(u[11], k32_p23_p09);
+ v[12] = k_madd_epi32(u[12], k32_p07_p25);
+ v[13] = k_madd_epi32(u[13], k32_p07_p25);
+ v[14] = k_madd_epi32(u[14], k32_p07_p25);
+ v[15] = k_madd_epi32(u[15], k32_p07_p25);
+ v[16] = k_madd_epi32(u[12], k32_m25_p07);
+ v[17] = k_madd_epi32(u[13], k32_m25_p07);
+ v[18] = k_madd_epi32(u[14], k32_m25_p07);
+ v[19] = k_madd_epi32(u[15], k32_m25_p07);
+ v[20] = k_madd_epi32(u[8], k32_m09_p23);
+ v[21] = k_madd_epi32(u[9], k32_m09_p23);
+ v[22] = k_madd_epi32(u[10], k32_m09_p23);
+ v[23] = k_madd_epi32(u[11], k32_m09_p23);
+ v[24] = k_madd_epi32(u[4], k32_m17_p15);
+ v[25] = k_madd_epi32(u[5], k32_m17_p15);
+ v[26] = k_madd_epi32(u[6], k32_m17_p15);
+ v[27] = k_madd_epi32(u[7], k32_m17_p15);
+ v[28] = k_madd_epi32(u[0], k32_m01_p31);
+ v[29] = k_madd_epi32(u[1], k32_m01_p31);
+ v[30] = k_madd_epi32(u[2], k32_m01_p31);
+ v[31] = k_madd_epi32(u[3], k32_m01_p31);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm_cmplt_epi32(u[0], kZero);
+ v[1] = _mm_cmplt_epi32(u[1], kZero);
+ v[2] = _mm_cmplt_epi32(u[2], kZero);
+ v[3] = _mm_cmplt_epi32(u[3], kZero);
+ v[4] = _mm_cmplt_epi32(u[4], kZero);
+ v[5] = _mm_cmplt_epi32(u[5], kZero);
+ v[6] = _mm_cmplt_epi32(u[6], kZero);
+ v[7] = _mm_cmplt_epi32(u[7], kZero);
+ v[8] = _mm_cmplt_epi32(u[8], kZero);
+ v[9] = _mm_cmplt_epi32(u[9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], v[0]);
+ u[1] = _mm_sub_epi32(u[1], v[1]);
+ u[2] = _mm_sub_epi32(u[2], v[2]);
+ u[3] = _mm_sub_epi32(u[3], v[3]);
+ u[4] = _mm_sub_epi32(u[4], v[4]);
+ u[5] = _mm_sub_epi32(u[5], v[5]);
+ u[6] = _mm_sub_epi32(u[6], v[6]);
+ u[7] = _mm_sub_epi32(u[7], v[7]);
+ u[8] = _mm_sub_epi32(u[8], v[8]);
+ u[9] = _mm_sub_epi32(u[9], v[9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[1] = _mm_packs_epi32(u[0], u[1]);
+ out[17] = _mm_packs_epi32(u[2], u[3]);
+ out[9] = _mm_packs_epi32(u[4], u[5]);
+ out[25] = _mm_packs_epi32(u[6], u[7]);
+ out[7] = _mm_packs_epi32(u[8], u[9]);
+ out[23] = _mm_packs_epi32(u[10], u[11]);
+ out[15] = _mm_packs_epi32(u[12], u[13]);
+ out[31] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
+ &out[7], &out[23], &out[15], &out[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
+ const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
+ const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
+ const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
+ const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
+ const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
+ const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
+ const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
+ u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
+ u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
+ u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
+ u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
+ u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
+ u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
+ u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
+ u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
+ u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
+ u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
+ u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
+ u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
+ u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
+ u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
+ u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+ v[0] = k_madd_epi32(u[0], k32_p27_p05);
+ v[1] = k_madd_epi32(u[1], k32_p27_p05);
+ v[2] = k_madd_epi32(u[2], k32_p27_p05);
+ v[3] = k_madd_epi32(u[3], k32_p27_p05);
+ v[4] = k_madd_epi32(u[4], k32_p11_p21);
+ v[5] = k_madd_epi32(u[5], k32_p11_p21);
+ v[6] = k_madd_epi32(u[6], k32_p11_p21);
+ v[7] = k_madd_epi32(u[7], k32_p11_p21);
+ v[8] = k_madd_epi32(u[8], k32_p19_p13);
+ v[9] = k_madd_epi32(u[9], k32_p19_p13);
+ v[10] = k_madd_epi32(u[10], k32_p19_p13);
+ v[11] = k_madd_epi32(u[11], k32_p19_p13);
+ v[12] = k_madd_epi32(u[12], k32_p03_p29);
+ v[13] = k_madd_epi32(u[13], k32_p03_p29);
+ v[14] = k_madd_epi32(u[14], k32_p03_p29);
+ v[15] = k_madd_epi32(u[15], k32_p03_p29);
+ v[16] = k_madd_epi32(u[12], k32_m29_p03);
+ v[17] = k_madd_epi32(u[13], k32_m29_p03);
+ v[18] = k_madd_epi32(u[14], k32_m29_p03);
+ v[19] = k_madd_epi32(u[15], k32_m29_p03);
+ v[20] = k_madd_epi32(u[8], k32_m13_p19);
+ v[21] = k_madd_epi32(u[9], k32_m13_p19);
+ v[22] = k_madd_epi32(u[10], k32_m13_p19);
+ v[23] = k_madd_epi32(u[11], k32_m13_p19);
+ v[24] = k_madd_epi32(u[4], k32_m21_p11);
+ v[25] = k_madd_epi32(u[5], k32_m21_p11);
+ v[26] = k_madd_epi32(u[6], k32_m21_p11);
+ v[27] = k_madd_epi32(u[7], k32_m21_p11);
+ v[28] = k_madd_epi32(u[0], k32_m05_p27);
+ v[29] = k_madd_epi32(u[1], k32_m05_p27);
+ v[30] = k_madd_epi32(u[2], k32_m05_p27);
+ v[31] = k_madd_epi32(u[3], k32_m05_p27);
+
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
+ &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
+ &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
+ &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+ u[8] = k_packs_epi64(v[16], v[17]);
+ u[9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[0] = _mm_cmplt_epi32(u[0], kZero);
+ v[1] = _mm_cmplt_epi32(u[1], kZero);
+ v[2] = _mm_cmplt_epi32(u[2], kZero);
+ v[3] = _mm_cmplt_epi32(u[3], kZero);
+ v[4] = _mm_cmplt_epi32(u[4], kZero);
+ v[5] = _mm_cmplt_epi32(u[5], kZero);
+ v[6] = _mm_cmplt_epi32(u[6], kZero);
+ v[7] = _mm_cmplt_epi32(u[7], kZero);
+ v[8] = _mm_cmplt_epi32(u[8], kZero);
+ v[9] = _mm_cmplt_epi32(u[9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], v[0]);
+ u[1] = _mm_sub_epi32(u[1], v[1]);
+ u[2] = _mm_sub_epi32(u[2], v[2]);
+ u[3] = _mm_sub_epi32(u[3], v[3]);
+ u[4] = _mm_sub_epi32(u[4], v[4]);
+ u[5] = _mm_sub_epi32(u[5], v[5]);
+ u[6] = _mm_sub_epi32(u[6], v[6]);
+ u[7] = _mm_sub_epi32(u[7], v[7]);
+ u[8] = _mm_sub_epi32(u[8], v[8]);
+ u[9] = _mm_sub_epi32(u[9], v[9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[5] = _mm_packs_epi32(u[0], u[1]);
+ out[21] = _mm_packs_epi32(u[2], u[3]);
+ out[13] = _mm_packs_epi32(u[4], u[5]);
+ out[29] = _mm_packs_epi32(u[6], u[7]);
+ out[3] = _mm_packs_epi32(u[8], u[9]);
+ out[19] = _mm_packs_epi32(u[10], u[11]);
+ out[11] = _mm_packs_epi32(u[12], u[13]);
+ out[27] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
+ &out[3], &out[19], &out[11], &out[27]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+#endif // FDCT32x32_HIGH_PRECISION
+ // Transpose the results, do it as four 8x8 transposes.
+ {
+ int transpose_block;
+ int16_t *output0 = &intermediate[column_start * 32];
+ tran_low_t *output1 = &output_org[column_start * 32];
+ for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+ __m128i *this_out = &out[8 * transpose_block];
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ if (0 == pass) {
+ // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+ // TODO(cd): see quality impact of only doing
+ // output[j] = (output[j] + 1) >> 2;
+ // which would remove the code between here ...
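+ // (As a worked check of the formula above: for x = 6 it gives
+ // (6 + 1 + 1) >> 2 = 2 and for x = -6 it gives (-6 + 1 + 0) >> 2 = -2,
+ // i.e. halves round away from zero.)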
+ __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
+ __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
+ __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
+ __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
+ __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
+ __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
+ __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
+ __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
+ tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
+ tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
+ tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
+ tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
+ tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
+ tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
+ tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
+ tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
+ // ... and here.
+ // PS: also change code in vp9/encoder/vp9_dct.c
+ tr2_0 = _mm_add_epi16(tr2_0, kOne);
+ tr2_1 = _mm_add_epi16(tr2_1, kOne);
+ tr2_2 = _mm_add_epi16(tr2_2, kOne);
+ tr2_3 = _mm_add_epi16(tr2_3, kOne);
+ tr2_4 = _mm_add_epi16(tr2_4, kOne);
+ tr2_5 = _mm_add_epi16(tr2_5, kOne);
+ tr2_6 = _mm_add_epi16(tr2_6, kOne);
+ tr2_7 = _mm_add_epi16(tr2_7, kOne);
+ tr2_0 = _mm_srai_epi16(tr2_0, 2);
+ tr2_1 = _mm_srai_epi16(tr2_1, 2);
+ tr2_2 = _mm_srai_epi16(tr2_2, 2);
+ tr2_3 = _mm_srai_epi16(tr2_3, 2);
+ tr2_4 = _mm_srai_epi16(tr2_4, 2);
+ tr2_5 = _mm_srai_epi16(tr2_5, 2);
+ tr2_6 = _mm_srai_epi16(tr2_6, 2);
+ tr2_7 = _mm_srai_epi16(tr2_7, 2);
+ }
+ // Note: even though all these stores are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ if (pass == 0) {
+ _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
+ _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
+ _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
+ _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
+ _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
+ _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
+ _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
+ _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
+ // Process next 8x8
+ output0 += 8;
+ } else {
+ storeu_output(&tr2_0, (output1 + 0 * 32));
+ storeu_output(&tr2_1, (output1 + 1 * 32));
+ storeu_output(&tr2_2, (output1 + 2 * 32));
+ storeu_output(&tr2_3, (output1 + 3 * 32));
+ storeu_output(&tr2_4, (output1 + 4 * 32));
+ storeu_output(&tr2_5, (output1 + 5 * 32));
+ storeu_output(&tr2_6, (output1 + 6 * 32));
+ storeu_output(&tr2_7, (output1 + 7 * 32));
+ // Process next 8x8
+ output1 += 8;
+ }
+ }
+ }
+ }
+ }
+} // NOLINT
+
+#undef ADD_EPI16
+#undef SUB_EPI16
+#undef HIGH_FDCT32x32_2D_C
+#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c
new file mode 100644
index 0000000000..c8f54a49cb
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/txfm_common.h"
+#define ADD256_EPI16 _mm256_add_epi16
+#define SUB256_EPI16 _mm256_sub_epi16
+
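+// Load out_size rows of 16 16-bit samples. On the first pass the raw input
+// is pre-scaled by 4 (<< 2) for extra precision; on the second pass the
+// intermediate buffer (stride 16) is reloaded and the inter-pass rounding
+// (x + 1) >> 2 is applied.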
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+ int stride, __m256i *out,
+ int out_size, int pass) {
+ int i;
+ const __m256i kOne = _mm256_set1_epi16(1);
+ if (pass == 0) {
+ for (i = 0; i < out_size; i++) {
+ out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
+ // x = x << 2
+ out[i] = _mm256_slli_epi16(out[i], 2);
+ }
+ } else {
+ for (i = 0; i < out_size; i++) {
+ out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16));
+ // x = (x + 1) >> 2
+ out[i] = _mm256_add_epi16(out[i], kOne);
+ out[i] = _mm256_srai_epi16(out[i], 2);
+ }
+ }
+}
+
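+// Transpose two 8x8 blocks of 16-bit values at once: the AVX2 unpack
+// instructions operate within 128-bit lanes, so the low and high lanes of
+// in[0..7] are transposed independently.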
+static INLINE void transpose2_8x8_avx2(const __m256i *const in,
+ __m256i *const out) {
+ int i;
+ __m256i t[16], u[16];
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 1) ==> (0, 1)
+ // (2, 3) ==> (2, 3)
+ // (4, 5) ==> (4, 5)
+ // (6, 7) ==> (6, 7)
+ for (i = 0; i < 4; i++) {
+ t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+ t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 2) ==> (0, 2)
+ // (1, 3) ==> (1, 3)
+ // (4, 6) ==> (4, 6)
+ // (5, 7) ==> (5, 7)
+ for (i = 0; i < 2; i++) {
+ u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+ u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+ u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+ u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 4) ==> (0, 1)
+ // (1, 5) ==> (4, 5)
+ // (2, 6) ==> (2, 3)
+ // (3, 7) ==> (6, 7)
+ for (i = 0; i < 2; i++) {
+ out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+ out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+ out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+ out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+ }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+ __m256i *const out) {
+ __m256i t[16];
+
+#define LOADL(idx) \
+ t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+ t[idx] = _mm256_inserti128_si256( \
+ t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
+
+#define LOADR(idx) \
+ t[8 + idx] = \
+ _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+ t[8 + idx] = _mm256_inserti128_si256( \
+ t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
+
+ // load left 8x16
+ LOADL(0)
+ LOADL(1)
+ LOADL(2)
+ LOADL(3)
+ LOADL(4)
+ LOADL(5)
+ LOADL(6)
+ LOADL(7)
+
+ // load right 8x16
+ LOADR(0)
+ LOADR(1)
+ LOADR(2)
+ LOADR(3)
+ LOADR(4)
+ LOADR(5)
+ LOADR(6)
+ LOADR(7)
+
+ // get the top 16x8 result
+ transpose2_8x8_avx2(t, out);
+ // get the bottom 16x8 result
+ transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
+// Store out_size rows of 16 16-bit values to the output buffer.
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
+ tran_low_t *out,
+ const int stride,
+ const int out_size) {
+ int i;
+ for (i = 0; i < out_size; ++i) {
+ _mm256_storeu_si256((__m256i *)(out), in[i]);
+ out += stride;
+ }
+}
+
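+// AVX2 counterpart of pair_set_epi16(): replicate the 16-bit pair (a, b)
+// into every 32-bit lane so it can be consumed by _mm256_madd_epi16.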
+#define PAIR256_SET_EPI16(a, b) \
+ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
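+// Per 32-bit lane this computes the usual dct_const_round_shift() of a
+// 2-point rotation and packs the result back to 16 bits. A scalar sketch
+// (illustrative names only):
+//   int32_t acc = (int32_t)x * c0 + (int32_t)y * c1;  // _mm256_madd_epi16
+//   int16_t res = (int16_t)((acc + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);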
+static INLINE __m256i mult256_round_shift(const __m256i *pin0,
+ const __m256i *pin1,
+ const __m256i *pmultiplier,
+ const __m256i *prounding,
+ const int shift) {
+ const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier);
+ const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier);
+ const __m256i v0 = _mm256_add_epi32(u0, *prounding);
+ const __m256i v1 = _mm256_add_epi32(u1, *prounding);
+ const __m256i w0 = _mm256_srai_epi32(v0, shift);
+ const __m256i w1 = _mm256_srai_epi32(v1, shift);
+ return _mm256_packs_epi32(w0, w1);
+}
+
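+// One 16-point forward DCT evaluated on 16 independent lanes: lane j of the
+// sixteen input registers forms one length-16 vector, and the resulting
+// coefficients are written to output[] at their natural frequency positions.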
+static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) {
+ int i;
+ __m256i step2[4];
+ __m256i in[8];
+ __m256i step1[8];
+ __m256i step3[8];
+
+ const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64);
+ const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64);
+ const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64);
+ const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64);
+ const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64);
+ const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64);
+ const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64);
+ const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64);
+ const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64);
+ const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64);
+ const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64);
+ const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64);
+ const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64);
+ const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64);
+ const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64);
+ const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64);
+ const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64);
+ const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+
+ // Calculate input for the first 8 results.
+ for (i = 0; i < 8; i++) {
+ in[i] = ADD256_EPI16(input[i], input[15 - i]);
+ }
+
+ // Calculate input for the next 8 results.
+ for (i = 0; i < 8; i++) {
+ step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]);
+ }
+
+ // Work on the first eight values; fdct8(input, even_results);
+ {
+ // Add/subtract
+ const __m256i q0 = ADD256_EPI16(in[0], in[7]);
+ const __m256i q1 = ADD256_EPI16(in[1], in[6]);
+ const __m256i q2 = ADD256_EPI16(in[2], in[5]);
+ const __m256i q3 = ADD256_EPI16(in[3], in[4]);
+ const __m256i q4 = SUB256_EPI16(in[3], in[4]);
+ const __m256i q5 = SUB256_EPI16(in[2], in[5]);
+ const __m256i q6 = SUB256_EPI16(in[1], in[6]);
+ const __m256i q7 = SUB256_EPI16(in[0], in[7]);
+
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m256i r0 = ADD256_EPI16(q0, q3);
+ const __m256i r1 = ADD256_EPI16(q1, q2);
+ const __m256i r2 = SUB256_EPI16(q1, q2);
+ const __m256i r3 = SUB256_EPI16(q0, q3);
+
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(r0, r1);
+ const __m256i t1 = _mm256_unpackhi_epi16(r0, r1);
+ const __m256i t2 = _mm256_unpacklo_epi16(r2, r3);
+ const __m256i t3 = _mm256_unpackhi_epi16(r2, r3);
+
+ output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[12] =
+ mult256_round_shift(&t2, &t3, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ }
+
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m256i d0 = _mm256_unpacklo_epi16(q6, q5);
+ const __m256i d1 = _mm256_unpackhi_epi16(q6, q5);
+ const __m256i r0 = mult256_round_shift(
+ &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ const __m256i r1 = mult256_round_shift(
+ &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+
+ {
+ // Add/subtract
+ const __m256i x0 = ADD256_EPI16(q4, r0);
+ const __m256i x1 = SUB256_EPI16(q4, r0);
+ const __m256i x2 = SUB256_EPI16(q7, r1);
+ const __m256i x3 = ADD256_EPI16(q7, r1);
+
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(x0, x3);
+ const __m256i t1 = _mm256_unpackhi_epi16(x0, x3);
+ const __m256i t2 = _mm256_unpacklo_epi16(x1, x2);
+ const __m256i t3 = _mm256_unpackhi_epi16(x1, x2);
+ output[2] =
+ mult256_round_shift(&t0, &t1, &k__cospi_p28_p04,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[14] =
+ mult256_round_shift(&t0, &t1, &k__cospi_m04_p28,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[10] =
+ mult256_round_shift(&t2, &t3, &k__cospi_p12_p20,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[6] =
+ mult256_round_shift(&t2, &t3, &k__cospi_m20_p12,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ }
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ { // step 2
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]);
+ const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]);
+ const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]);
+ const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]);
+ step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ // step 3
+ {
+ step3[0] = ADD256_EPI16(step1[0], step2[1]);
+ step3[1] = ADD256_EPI16(step1[1], step2[0]);
+ step3[2] = SUB256_EPI16(step1[1], step2[0]);
+ step3[3] = SUB256_EPI16(step1[0], step2[1]);
+ step3[4] = SUB256_EPI16(step1[7], step2[3]);
+ step3[5] = SUB256_EPI16(step1[6], step2[2]);
+ step3[6] = ADD256_EPI16(step1[6], step2[2]);
+ step3[7] = ADD256_EPI16(step1[7], step2[3]);
+ }
+ // step 4
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]);
+ const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]);
+ const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]);
+ const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]);
+ step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ // step 5
+ {
+ step1[0] = ADD256_EPI16(step3[0], step2[0]);
+ step1[1] = SUB256_EPI16(step3[0], step2[0]);
+ step1[2] = ADD256_EPI16(step3[3], step2[1]);
+ step1[3] = SUB256_EPI16(step3[3], step2[1]);
+ step1[4] = SUB256_EPI16(step3[4], step2[3]);
+ step1[5] = ADD256_EPI16(step3[4], step2[3]);
+ step1[6] = SUB256_EPI16(step3[7], step2[2]);
+ step1[7] = ADD256_EPI16(step3[7], step2[2]);
+ }
+ // step 6
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]);
+ const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]);
+ const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]);
+ const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]);
+ output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[7] = mult256_round_shift(&t2, &t3, &k__cospi_m18_p14,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]);
+ const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]);
+ const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]);
+ const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]);
+ output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ }
+}
+
+void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ DECLARE_ALIGNED(32, int16_t, intermediate[256]);
+ int16_t *out0 = intermediate;
+ tran_low_t *out1 = output;
+ const int width = 16;
+ const int height = 16;
+ __m256i buf0[16], buf1[16];
+
+ // Two transform and transpose passes
+ // Process 16 columns (transposed rows in second pass) at a time.
+ for (pass = 0; pass < 2; ++pass) {
+ // Load and pre-condition input.
+ load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass);
+
+ // Calculate dct for 16x16 values
+ fdct16x16_1D_avx2(buf1, buf0);
+
+ // Transpose the results.
+ transpose_16bit_16x16_avx2(buf0, buf1);
+
+ if (pass == 0) {
+ store_buffer_16bit_to_32bit_w16_avx2(buf1, out0, width, height);
+ } else {
+ store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height);
+ }
+ // Set up the input for the next pass.
+ input = intermediate;
+ }
+}
+
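+// The 32x32 forward transforms are generated by including the shared
+// implementation header twice: FDCT32x32_2D_AVX2 names the function being
+// defined and FDCT32x32_HIGH_PRECISION selects between the reduced-precision
+// path used by the _rd variant (0) and the full-precision path (1).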
+#if !CONFIG_VP9_HIGHBITDEPTH
+#define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h"
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D_AVX2 vpx_fdct32x32_avx2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
+#endif // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
new file mode 100644
index 0000000000..d546f02a14
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
@@ -0,0 +1,1015 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif
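+// With DCT_HIGH_BIT_DEPTH the saturating forms keep intermediates that leave
+// the int16_t range pinned at INT16_MIN/INT16_MAX, so the
+// check_epi16_overflow_*() calls below can detect them and fall back to the
+// C implementation.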
+
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // This 2D transform implements 4 vertical 1D transforms followed
+ // by 4 horizontal 1D transforms. The multiplies and adds are as given
+ // by Chen, Smith and Fralick ('77). The commands for moving the data
+ // around have been minimized by hand.
+ // For the purposes of the comments, the 16 inputs are referred to as i0
+ // through iF (in raster order); the intermediate variables a0, b0, c0
+ // through f correspond to the in-place computations mapped to the input
+ // locations. The outputs, o0 through oF, are labeled according to the
+ // output locations.
+
+ // Constants
+ // These are the coefficients used for the multiplies.
+ // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+ // where cospi_N_64 = cos(N pi /64)
+ const __m128i k__cospi_A =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_B =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_C =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_D =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_E =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_F =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_G =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_H =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
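+ // Each constant vector above is used with _mm_madd_epi16, which per 32-bit
+ // lane computes lo16 * c_lo + hi16 * c_hi on the interleaved inputs, i.e.
+ // one two-point rotation by the listed cosine pair per lane.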
+
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // This second rounding constant saves doing some extra adds at the end
+ const __m128i k__DCT_CONST_ROUNDING2 =
+ _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
+ const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
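+ // Since DCT_CONST_ROUNDING == 1 << (DCT_CONST_BITS - 1) and arithmetic
+ // shifts floor, (((x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS) + 1) >> 2 is
+ // identical to (x + 3 * DCT_CONST_ROUNDING) >> (DCT_CONST_BITS + 2), so the
+ // final (v + 1) >> 2 post-scale is folded into a single rounded shift.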
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ __m128i in0, in1;
+#if DCT_HIGH_BIT_DEPTH
+ __m128i cmp0, cmp1;
+ int test, overflow;
+#endif
+
+ // Load inputs.
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(
+ in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(
+ in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
+// in0 = [i0 i1 i2 i3 iC iD iE iF]
+// in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+#if DCT_HIGH_BIT_DEPTH
+ // Check that the inputs are small enough (each within [-1024, 1023]) to
+ // use the optimised code path; otherwise fall back to the C version.
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
+ _mm_cmplt_epi16(in0, _mm_set1_epi16((int16_t)0xfc00)));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
+ _mm_cmplt_epi16(in1, _mm_set1_epi16((int16_t)0xfc00)));
+ test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
+ if (test) {
+ vpx_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // multiply by 16 to give some extra precision
+ in0 = _mm_slli_epi16(in0, 4);
+ in1 = _mm_slli_epi16(in1, 4);
+ // if (i == 0 && input[0]) input[0] += 1;
+ // add 1 to the upper left pixel if it is non-zero, which helps reduce
+ // the round-trip error
+ {
+ // The mask will only record whether the first value is zero; all
+ // other comparisons will fail, as something shifted by 4 (above << 4)
+ // can never be equal to one. To increment in the non-zero case, we
+ // add the mask and one for the first element:
+ // - if zero, mask = -1, v = v - 1 + 1 = v
+ // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+ __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+ in0 = _mm_add_epi16(in0, mask);
+ in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+ }
+ // There are four stages in total, alternating between an add/subtract
+ // stage and a multiply-and-add stage.
+ {
+ // Stage 1: Add/subtract
+
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
+ const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
+ // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+ // r1 = [iC i8 iD i9 iE iA iF iB]
+ const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+ const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+ // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+ // r3 = [iC i8 iD i9 iF iB iE iA]
+
+ const __m128i t0 = _mm_add_epi16(r2, r3);
+ const __m128i t1 = _mm_sub_epi16(r2, r3);
+ // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+ // t1 = [aC a8 aD a9 aF aB aE aA]
+
+ // Stage 2: multiply by constants (which gets us into 32 bits).
+ // The constants needed here are:
+ // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+ // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+ // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+ // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+ // Then add and right-shift to get back to 16-bit range
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // w0 = [b0 b1 b7 b6]
+ // w1 = [b8 b9 bF bE]
+ // w2 = [b4 b5 b3 b2]
+ // w3 = [bC bD bB bA]
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&x0, &x1);
+ if (overflow) {
+ vpx_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+ // x1 = [b4 b5 b3 b2 bC bD bB bA]
+ in0 = _mm_shuffle_epi32(x0, 0xD8);
+ in1 = _mm_shuffle_epi32(x1, 0x8D);
+ // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+ // in1 = [b3 b2 bB bA b4 b5 bC bD]
+ }
+ {
+ // vertical DCTs finished. Now we do the horizontal DCTs.
+ // Stage 3: Add/subtract
+
+ const __m128i t0 = ADD_EPI16(in0, in1);
+ const __m128i t1 = SUB_EPI16(in0, in1);
+// t0 = [c0 c1 c8 c9 c4 c5 cC cD]
+// t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&t0, &t1);
+ if (overflow) {
+ vpx_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // Stage 4: multiply by constants (which gets us into 32 bits).
+ {
+ // The constants needed here are:
+ // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+ // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+ // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+ // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+ const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+ const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+ // Then add and right-shift to get back to 16-bit range
+ // but this also folds in the final right-shift to save operations.
+ // This unusual rounding maintains bit-accurate compatibility with the C
+ // version of this function, which has two rounding steps in a row.
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+ // w0 = [o0 o4 o8 oC]
+ // w1 = [o2 o6 oA oE]
+ // w2 = [o1 o5 o9 oD]
+ // w3 = [o3 o7 oB oF]
+ // remember the o's are numbered according to the correct output location
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&x0, &x1);
+ if (overflow) {
+ vpx_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // x0 = [o0 o4 o8 oC o2 o6 oA oE]
+ // x1 = [o1 o5 o9 oD o3 o7 oB oF]
+ const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
+ const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
+ // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
+ // y1 = [o2 o3 o6 o7 oA oB oE oF]
+ in0 = _mm_unpacklo_epi32(y0, y1);
+ // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
+ in1 = _mm_unpackhi_epi32(y0, y1);
+ // in1 = [o8 o9 oA oB oC oD oE oF]
+ }
+ }
+ }
+ // The post-condition (v + 1) >> 2 has already been folded into the previous
+ // add and right-shift. Only two store instructions are needed because rows
+ // 1 and 3 lie directly after rows 0 and 2 in the output.
+ storeu_output(&in0, output + 0 * 4);
+ storeu_output(&in1, output + 2 * 4);
+}
+
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(in0, in7);
+ const __m128i q1 = ADD_EPI16(in1, in6);
+ const __m128i q2 = ADD_EPI16(in2, in5);
+ const __m128i q3 = ADD_EPI16(in3, in4);
+ const __m128i q4 = SUB_EPI16(in3, in4);
+ const __m128i q5 = SUB_EPI16(in2, in5);
+ const __m128i q6 = SUB_EPI16(in1, in6);
+ const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+ if (pass == 1) {
+ overflow =
+ check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us into 32 bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us into 32 bits
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us into 32 bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+ if (overflow) {
+ vpx_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // Divide each 16-bit signed value by two, rounding toward zero, using
+ // shifts: n / 2 = (n - (n >> 15)) >> 1
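+ // For example (illustrative, not upstream code): n = -5 gives
+ // (-5 - (-5 >> 15)) >> 1 = (-5 + 1) >> 1 = -2, matching C's truncating
+ // division, whereas a bare arithmetic shift would give -5 >> 1 = -3.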
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ store_output(&in0, (output + 0 * 8));
+ store_output(&in1, (output + 1 * 8));
+ store_output(&in2, (output + 2 * 8));
+ store_output(&in3, (output + 3 * 8));
+ store_output(&in4, (output + 4 * 8));
+ store_output(&in5, (output + 5 * 8));
+ store_output(&in6, (output + 6 * 8));
+ store_output(&in7, (output + 7 * 8));
+ }
+}
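+// Editorial note: the << 2 pre-scale of the input and the final
+// divide-by-two above reproduce the scaling of the C reference transform;
+// in DCT_HIGH_BIT_DEPTH builds, any stage whose 16-bit lanes saturate makes
+// the function fall back to vpx_highbd_fdct8x8_c instead.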
+
+void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows: because the first
+ // pass left its results transposed, processing the columns of that
+ // intermediate is equivalent to processing the rows, and a final transpose
+ // puts everything back in normal/row positions.
+ int pass;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[256]);
+ const int16_t *in = input;
+ int16_t *out0 = intermediate;
+ tran_low_t *out1 = output;
+ // Constants
+ // In one case the same value is replicated across all lanes. In all other
+ // cases we need a pair of values repeated four times, which is done by
+ // constructing the 32-bit constant corresponding to that pair.
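+ // As an editorial sketch of the layout assumed here: pair_set_epi16(a, b)
+ // (defined elsewhere in libvpx) is expected to produce
+ //   { a, b, a, b, a, b, a, b }
+ // so that _mm_madd_epi16 against a register of interleaved { x, y } pairs
+ // yields a * x + b * y in each 32-bit lane.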
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ for (column_start = 0; column_start < 16; column_start += 8) {
+ __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+ __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+ __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+ __m128i step1_0, step1_1, step1_2, step1_3;
+ __m128i step1_4, step1_5, step1_6, step1_7;
+ __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ __m128i step3_0, step3_1, step3_2, step3_3;
+ __m128i step3_4, step3_5, step3_6, step3_7;
+ __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+ __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+ // Load and pre-condition input.
+ if (0 == pass) {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
+ // x = x << 2
+ in00 = _mm_slli_epi16(in00, 2);
+ in01 = _mm_slli_epi16(in01, 2);
+ in02 = _mm_slli_epi16(in02, 2);
+ in03 = _mm_slli_epi16(in03, 2);
+ in04 = _mm_slli_epi16(in04, 2);
+ in05 = _mm_slli_epi16(in05, 2);
+ in06 = _mm_slli_epi16(in06, 2);
+ in07 = _mm_slli_epi16(in07, 2);
+ in08 = _mm_slli_epi16(in08, 2);
+ in09 = _mm_slli_epi16(in09, 2);
+ in10 = _mm_slli_epi16(in10, 2);
+ in11 = _mm_slli_epi16(in11, 2);
+ in12 = _mm_slli_epi16(in12, 2);
+ in13 = _mm_slli_epi16(in13, 2);
+ in14 = _mm_slli_epi16(in14, 2);
+ in15 = _mm_slli_epi16(in15, 2);
+ } else {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
+ // x = (x + 1) >> 2
+ in00 = _mm_add_epi16(in00, kOne);
+ in01 = _mm_add_epi16(in01, kOne);
+ in02 = _mm_add_epi16(in02, kOne);
+ in03 = _mm_add_epi16(in03, kOne);
+ in04 = _mm_add_epi16(in04, kOne);
+ in05 = _mm_add_epi16(in05, kOne);
+ in06 = _mm_add_epi16(in06, kOne);
+ in07 = _mm_add_epi16(in07, kOne);
+ in08 = _mm_add_epi16(in08, kOne);
+ in09 = _mm_add_epi16(in09, kOne);
+ in10 = _mm_add_epi16(in10, kOne);
+ in11 = _mm_add_epi16(in11, kOne);
+ in12 = _mm_add_epi16(in12, kOne);
+ in13 = _mm_add_epi16(in13, kOne);
+ in14 = _mm_add_epi16(in14, kOne);
+ in15 = _mm_add_epi16(in15, kOne);
+ in00 = _mm_srai_epi16(in00, 2);
+ in01 = _mm_srai_epi16(in01, 2);
+ in02 = _mm_srai_epi16(in02, 2);
+ in03 = _mm_srai_epi16(in03, 2);
+ in04 = _mm_srai_epi16(in04, 2);
+ in05 = _mm_srai_epi16(in05, 2);
+ in06 = _mm_srai_epi16(in06, 2);
+ in07 = _mm_srai_epi16(in07, 2);
+ in08 = _mm_srai_epi16(in08, 2);
+ in09 = _mm_srai_epi16(in09, 2);
+ in10 = _mm_srai_epi16(in10, 2);
+ in11 = _mm_srai_epi16(in11, 2);
+ in12 = _mm_srai_epi16(in12, 2);
+ in13 = _mm_srai_epi16(in13, 2);
+ in14 = _mm_srai_epi16(in14, 2);
+ in15 = _mm_srai_epi16(in15, 2);
+ }
+ in += 8;
+ // Calculate input for the first 8 results.
+ {
+ input0 = ADD_EPI16(in00, in15);
+ input1 = ADD_EPI16(in01, in14);
+ input2 = ADD_EPI16(in02, in13);
+ input3 = ADD_EPI16(in03, in12);
+ input4 = ADD_EPI16(in04, in11);
+ input5 = ADD_EPI16(in05, in10);
+ input6 = ADD_EPI16(in06, in09);
+ input7 = ADD_EPI16(in07, in08);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
+ &input4, &input5, &input6, &input7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Calculate input for the next 8 results.
+ {
+ step1_0 = SUB_EPI16(in07, in08);
+ step1_1 = SUB_EPI16(in06, in09);
+ step1_2 = SUB_EPI16(in05, in10);
+ step1_3 = SUB_EPI16(in04, in11);
+ step1_4 = SUB_EPI16(in03, in12);
+ step1_5 = SUB_EPI16(in02, in13);
+ step1_6 = SUB_EPI16(in01, in14);
+ step1_7 = SUB_EPI16(in00, in15);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
+ &step1_4, &step1_5, &step1_6, &step1_7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Work on the first eight values; fdct8(input, even_results);
+ {
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(input0, input7);
+ const __m128i q1 = ADD_EPI16(input1, input6);
+ const __m128i q2 = ADD_EPI16(input2, input5);
+ const __m128i q3 = ADD_EPI16(input3, input4);
+ const __m128i q4 = SUB_EPI16(input3, input4);
+ const __m128i q5 = SUB_EPI16(input2, input5);
+ const __m128i q6 = SUB_EPI16(input1, input6);
+ const __m128i q7 = SUB_EPI16(input0, input7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i r0 =
+ mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ const __m128i r1 =
+ mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 3
+ {
+ step3_0 = ADD_EPI16(step1_0, step2_3);
+ step3_1 = ADD_EPI16(step1_1, step2_2);
+ step3_2 = SUB_EPI16(step1_1, step2_2);
+ step3_3 = SUB_EPI16(step1_0, step2_3);
+ step3_4 = SUB_EPI16(step1_7, step2_4);
+ step3_5 = SUB_EPI16(step1_6, step2_5);
+ step3_6 = ADD_EPI16(step1_6, step2_5);
+ step3_7 = ADD_EPI16(step1_7, step2_4);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
+ &step3_4, &step3_5, &step3_6, &step3_7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 4
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 5
+ {
+ step1_0 = ADD_EPI16(step3_0, step2_1);
+ step1_1 = SUB_EPI16(step3_0, step2_1);
+ step1_2 = ADD_EPI16(step3_3, step2_2);
+ step1_3 = SUB_EPI16(step3_3, step2_2);
+ step1_4 = SUB_EPI16(step3_4, step2_5);
+ step1_5 = ADD_EPI16(step3_4, step2_5);
+ step1_6 = SUB_EPI16(step3_7, step2_6);
+ step1_7 = ADD_EPI16(step3_7, step2_6);
+#if DCT_HIGH_BIT_DEPTH
+ overflow =
+ check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
+ &step1_4, &step1_5, &step1_6, &step1_7);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 6
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
+ if (overflow) {
+ vpx_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Transpose the results, do it as two 8x8 transposes.
+ transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
+ &res06, &res07, pass, out0, out1);
+ transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
+ &res14, &res15, pass, out0 + 8, out1 + 8);
+ if (pass == 0) {
+ out0 += 8 * 16;
+ } else {
+ out1 += 8 * 16;
+ }
+ }
+ // Setup in/out for next pass.
+ in = intermediate;
+ }
+}
+
+#undef ADD_EPI16
+#undef SUB_EPI16
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
new file mode 100644
index 0000000000..e14b99197f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+
+void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in0, in1;
+ __m128i tmp;
+ const __m128i zero = _mm_setzero_si128();
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(
+ in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(
+ in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
+
+ tmp = _mm_add_epi16(in0, in1);
+ in0 = _mm_unpacklo_epi16(zero, tmp);
+ in1 = _mm_unpackhi_epi16(zero, tmp);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(tmp, zero);
+ in1 = _mm_unpackhi_epi32(tmp, zero);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(tmp, 8);
+
+ in1 = _mm_add_epi32(tmp, in0);
+ in0 = _mm_slli_epi32(in1, 1);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
+}
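+// A scalar model of the DC-only 4x4 kernel above (editorial sketch, not
+// upstream code):
+//   int r, c, sum = 0;
+//   for (r = 0; r < 4; ++r)
+//     for (c = 0; c < 4; ++c) sum += input[r * stride + c];
+//   output[0] = (tran_low_t)(sum * 2);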
+
+void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i u0, u1, sum;
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ sum = _mm_add_epi16(u0, u1);
+
+ in0 = _mm_add_epi16(in0, in1);
+ in2 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, in0);
+
+ u0 = _mm_setzero_si128();
+ sum = _mm_add_epi16(sum, in2);
+
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
+}
+
+void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ input += 8 * stride;
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 1);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
+}
+
+void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 8; ++i) {
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 3);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
+}
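+// Editorial note: each DC-only kernel above reduces the whole block to a
+// single sum and stores 2 * sum (4x4), the raw sum (8x8), sum >> 1 (16x16)
+// or sum >> 3 (32x32) as the lone output coefficient.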
+
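+// The same transform templates are included repeatedly below; each
+// inclusion is parameterized by the FDCT*_2D, FDCT32x32_HIGH_PRECISION and
+// DCT_HIGH_BIT_DEPTH macros to stamp out the standard and (under
+// CONFIG_VP9_HIGHBITDEPTH) high-bit-depth variants under different names.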
+#define DCT_HIGH_BIT_DEPTH 0
+#define FDCT4x4_2D vpx_fdct4x4_sse2
+#define FDCT8x8_2D vpx_fdct8x8_sse2
+#define FDCT16x16_2D vpx_fdct16x16_sse2
+#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
+
+#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vpx_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+#undef DCT_HIGH_BIT_DEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define DCT_HIGH_BIT_DEPTH 1
+#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
+#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
+#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
+#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
+
+#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+#undef DCT_HIGH_BIT_DEPTH
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
new file mode 100644
index 0000000000..5aa2779706
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
+#define VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define pair_set_epi32(a, b) \
+ _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
+
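+// k_madd_epi32 sums products of adjacent 32-bit lanes into 64-bit results,
+// i.e. { a0 * b0 + a1 * b1, a2 * b2 + a3 * b3 }, using unsigned
+// 32x32 -> 64-bit multiplies (descriptive comment added editorially).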
+static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
+ __m128i buf0, buf1;
+ buf0 = _mm_mul_epu32(a, b);
+ a = _mm_srli_epi64(a, 32);
+ b = _mm_srli_epi64(b, 32);
+ buf1 = _mm_mul_epu32(a, b);
+ return _mm_add_epi64(buf0, buf1);
+}
+
+static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
+ __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm_unpacklo_epi64(buf0, buf1);
+}
+
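+// The check_epi16_overflow_* helpers return nonzero if any 16-bit lane of
+// their arguments equals INT16_MAX or INT16_MIN, i.e. if an earlier
+// saturating operation (such as _mm_packs_epi32 or a saturating
+// add/subtract) may have clipped a value.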
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
+ const __m128i *preg1) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ cmp0 = _mm_or_si128(cmp0, cmp1);
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+ _mm_cmpeq_epi16(*preg2, min_overflow));
+ __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+ _mm_cmpeq_epi16(*preg3, min_overflow));
+ cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x8(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x12(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x16(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ }
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x32(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
+ const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
+ const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
+ const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
+ const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
+ const __m128i *preg30, const __m128i *preg31) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
+ if (!res1)
+ res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
+ }
+ }
+ }
+ }
+ }
+ return res0 + res1;
+}
+
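+// k_check_epi32_overflow_4 flags any signed 64-bit lane that no longer fits
+// in 32 bits: after a 1-bit left shift, the upper dword of each qword must
+// be all zeros (non-negative) or all ones (negative).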
+static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *zero) {
+ __m128i minus_one = _mm_set1_epi32(-1);
+ // Check for overflows
+ __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
+ __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
+ __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
+ __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
+ __m128i reg0_top_dwords =
+ _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg1_top_dwords =
+ _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg2_top_dwords =
+ _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg3_top_dwords =
+ _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
+ __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
+ __m128i valid_positive_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
+ __m128i valid_positive_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
+ __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
+ __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
+ int overflow_01 =
+ _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
+ int overflow_23 =
+ _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
+ return (overflow_01 + overflow_23);
+}
+
+static INLINE int k_check_epi32_overflow_8(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ }
+ return overflow;
+}
+
+static INLINE int k_check_epi32_overflow_16(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
+ if (!overflow) {
+ overflow =
+ k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
+ }
+ }
+ }
+ return overflow;
+}
+
+static INLINE int k_check_epi32_overflow_32(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
+ const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
+ const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
+ const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
+ const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
+ const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
+ if (!overflow) {
+ overflow =
+ k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
+ if (!overflow) {
+ overflow =
+ k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
+ if (!overflow) {
+ overflow =
+ k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg24, preg25, preg26,
+ preg27, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg28, preg29, preg30,
+ preg31, zero);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return overflow;
+}
+
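+// store_output / storeu_output write eight 16-bit coefficients to a
+// tran_low_t buffer (aligned and unaligned variants); with
+// CONFIG_VP9_HIGHBITDEPTH the values are sign-extended to 32 bits first.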
+static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_store_si128((__m128i *)(dst_ptr), out0);
+ _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+ _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+ _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+ _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
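+// mult_round_shift applies the usual dct_const_round_shift to eight madd
+// results. A rough scalar model (editorial sketch, not upstream code):
+//   int32_t v = (int32_t)a * x + (int32_t)b * y;      // _mm_madd_epi16
+//   int16_t out = (int16_t)((v + rounding) >> shift); // plus int16 saturation
+// with rounding = DCT_CONST_ROUNDING and shift = DCT_CONST_BITS.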
+static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
+ const __m128i *pmultiplier,
+ const __m128i *prounding,
+ const int shift) {
+ const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
+ const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
+ const __m128i v0 = _mm_add_epi32(u0, *prounding);
+ const __m128i v1 = _mm_add_epi32(u1, *prounding);
+ const __m128i w0 = _mm_srai_epi32(v0, shift);
+ const __m128i w1 = _mm_srai_epi32(v1, shift);
+ return _mm_packs_epi32(w0, w1);
+}
+
+static INLINE void transpose_and_output8x8(
+ const __m128i *pin00, const __m128i *pin01, const __m128i *pin02,
+ const __m128i *pin03, const __m128i *pin04, const __m128i *pin05,
+ const __m128i *pin06, const __m128i *pin07, const int pass,
+ int16_t *out0_ptr, tran_low_t *out1_ptr) {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ if (pass == 0) {
+ _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
+ _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
+ } else {
+ storeu_output(&tr2_0, (out1_ptr + 0 * 16));
+ storeu_output(&tr2_1, (out1_ptr + 1 * 16));
+ storeu_output(&tr2_2, (out1_ptr + 2 * 16));
+ storeu_output(&tr2_3, (out1_ptr + 3 * 16));
+ storeu_output(&tr2_4, (out1_ptr + 4 * 16));
+ storeu_output(&tr2_5, (out1_ptr + 5 * 16));
+ storeu_output(&tr2_6, (out1_ptr + 6 * 16));
+ storeu_output(&tr2_7, (out1_ptr + 7 * 16));
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
new file mode 100644
index 0000000000..2c338fb5dd
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -0,0 +1,361 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192: times 4 dd 8192
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
+pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1
+%endmacro
+
+TRANSFORM_COEFFS 11585, 11585
+TRANSFORM_COEFFS 15137, 6270
+TRANSFORM_COEFFS 16069, 3196
+TRANSFORM_COEFFS 9102, 13623
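+; Editorial note: these appear to be the usual libvpx trig constants,
+; cospi_N_64 = round(16384 * cos(N * pi / 64)): 11585 = cospi_16_64,
+; 15137/6270 = cospi_8_64/cospi_24_64, 16069/3196 = cospi_4_64/cospi_28_64,
+; 9102/13623 = cospi_20_64/cospi_12_64. pw_11585x2 holds 2 * cospi_16_64 so
+; pmulhrsw can apply the rounded multiply, and pd_8192 is the
+; dct_const_round_shift rounding term (1 << 13).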
+
+SECTION .text
+
+%if VPX_ARCH_X86_64
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride
+
+ mova m8, [GLOBAL(pd_8192)]
+ mova m12, [GLOBAL(pw_11585x2)]
+
+ lea r3, [2 * strideq]
+ lea r4, [4 * strideq]
+ mova m0, [inputq]
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ ; left shift by 2 to increase forward transformation precision
+ psllw m0, 2
+ psllw m1, 2
+ psllw m2, 2
+ psllw m3, 2
+ psllw m4, 2
+ psllw m5, 2
+ psllw m6, 2
+ psllw m7, 2
+
+ ; column transform
+ ; stage 1
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ paddw m9, m1, m6
+ psubw m1, m6
+
+ paddw m7, m2, m5
+ psubw m2, m5
+
+ paddw m6, m3, m4
+ psubw m3, m4
+
+ ; stage 2
+ paddw m5, m9, m7
+ psubw m9, m7
+
+ paddw m4, m10, m6
+ psubw m10, m6
+
+ paddw m7, m1, m2
+ psubw m1, m2
+
+ ; stage 3
+ paddw m6, m4, m5
+ psubw m4, m5
+
+ pmulhrsw m1, m12
+ pmulhrsw m7, m12
+
+ ; sin(pi / 8), cos(pi / 8)
+ punpcklwd m2, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
+ pmaddwd m2, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
+ pmaddwd m10, [GLOBAL(pw_6270_m15137)]
+ paddd m5, m8
+ paddd m2, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m2, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m5, m9
+ packssdw m2, m10
+
+ pmulhrsw m6, m12
+ pmulhrsw m4, m12
+
+ paddw m9, m3, m1
+ psubw m3, m1
+
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ ; stage 4
+ ; sin(pi / 16), cos(pi / 16)
+ punpcklwd m1, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
+ pmaddwd m1, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
+ pmaddwd m10, [GLOBAL(pw_3196_m16069)]
+ paddd m7, m8
+ paddd m1, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m7, 14
+ psrad m1, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m7, m9
+ packssdw m1, m10
+
+ ; sin(3 * pi / 16), cos(3 * pi / 16)
+ punpcklwd m11, m0, m3
+ punpckhwd m0, m3
+ pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
+ pmaddwd m11, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
+ pmaddwd m0, [GLOBAL(pw_13623_m9102)]
+ paddd m9, m8
+ paddd m11, m8
+ paddd m3, m8
+ paddd m0, m8
+ psrad m9, 14
+ psrad m11, 14
+ psrad m3, 14
+ psrad m0, 14
+ packssdw m9, m3
+ packssdw m11, m0
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m6, m7
+ punpcklwd m3, m5, m11
+ punpckhwd m6, m7
+ punpckhwd m5, m11
+ punpcklwd m7, m4, m9
+ punpcklwd m10, m2, m1
+ punpckhwd m4, m9
+ punpckhwd m2, m1
+
+ ; stage 2
+ punpckldq m9, m0, m3
+ punpckldq m1, m6, m5
+ punpckhdq m0, m3
+ punpckhdq m6, m5
+ punpckldq m3, m7, m10
+ punpckldq m5, m4, m2
+ punpckhdq m7, m10
+ punpckhdq m4, m2
+
+ ; stage 3
+ punpcklqdq m10, m9, m3
+ punpckhqdq m9, m3
+ punpcklqdq m2, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m3, m1, m5
+ punpckhqdq m1, m5
+ punpcklqdq m7, m6, m4
+ punpckhqdq m6, m4
+
+ ; row transform
+ ; stage 1
+ paddw m5, m10, m6
+ psubw m10, m6
+
+ paddw m4, m9, m7
+ psubw m9, m7
+
+ paddw m6, m2, m1
+ psubw m2, m1
+
+ paddw m7, m0, m3
+ psubw m0, m3
+
+ ; stage 2
+ paddw m1, m5, m7
+ psubw m5, m7
+
+ paddw m3, m4, m6
+ psubw m4, m6
+
+ paddw m7, m9, m2
+ psubw m9, m2
+
+ ; stage 3
+ punpcklwd m6, m1, m3
+ punpckhwd m1, m3
+ pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
+ pmaddwd m6, [GLOBAL(pw_11585_m11585)]
+ pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
+ pmaddwd m1, [GLOBAL(pw_11585_m11585)]
+ paddd m2, m8
+ paddd m6, m8
+ paddd m3, m8
+ paddd m1, m8
+ psrad m2, 14
+ psrad m6, 14
+ psrad m3, 14
+ psrad m1, 14
+ packssdw m2, m3
+ packssdw m6, m1
+
+ pmulhrsw m7, m12
+ pmulhrsw m9, m12
+
+ punpcklwd m3, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
+ pmaddwd m3, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
+ pmaddwd m5, [GLOBAL(pw_6270_m15137)]
+ paddd m1, m8
+ paddd m3, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m1, 14
+ psrad m3, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m1, m4
+ packssdw m3, m5
+
+ paddw m4, m0, m9
+ psubw m0, m9
+
+ paddw m5, m10, m7
+ psubw m10, m7
+
+ ; stage 4
+ punpcklwd m9, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
+ pmaddwd m9, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
+ pmaddwd m5, [GLOBAL(pw_3196_m16069)]
+ paddd m7, m8
+ paddd m9, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m7, 14
+ psrad m9, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m7, m4
+ packssdw m9, m5
+
+ punpcklwd m4, m10, m0
+ punpckhwd m10, m0
+ pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
+ pmaddwd m4, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
+ pmaddwd m10, [GLOBAL(pw_13623_m9102)]
+ paddd m5, m8
+ paddd m4, m8
+ paddd m0, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m4, 14
+ psrad m0, 14
+ psrad m10, 14
+ packssdw m5, m0
+ packssdw m4, m10
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m2, m7
+ punpcklwd m10, m1, m4
+ punpckhwd m2, m7
+ punpckhwd m1, m4
+ punpcklwd m7, m6, m5
+ punpcklwd m4, m3, m9
+ punpckhwd m6, m5
+ punpckhwd m3, m9
+
+ ; stage 2
+ punpckldq m5, m0, m10
+ punpckldq m9, m2, m1
+ punpckhdq m0, m10
+ punpckhdq m2, m1
+ punpckldq m10, m7, m4
+ punpckldq m1, m6, m3
+ punpckhdq m7, m4
+ punpckhdq m6, m3
+
+ ; stage 3
+ punpcklqdq m4, m5, m10
+ punpckhqdq m5, m10
+ punpcklqdq m3, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m10, m9, m1
+ punpckhqdq m9, m1
+ punpcklqdq m7, m2, m6
+ punpckhqdq m2, m6
+
+ psraw m1, m4, 15
+ psraw m6, m5, 15
+ psraw m8, m3, 15
+ psraw m11, m0, 15
+
+ psubw m4, m1
+ psubw m5, m6
+ psubw m3, m8
+ psubw m0, m11
+
+ psraw m4, 1
+ psraw m5, 1
+ psraw m3, 1
+ psraw m0, 1
+
+ psraw m1, m10, 15
+ psraw m6, m9, 15
+ psraw m8, m7, 15
+ psraw m11, m2, 15
+
+ psubw m10, m1
+ psubw m9, m6
+ psubw m7, m8
+ psubw m2, m11
+
+ psraw m10, 1
+ psraw m9, 1
+ psraw m7, 1
+ psraw m2, 1
+
+ mova [outputq + 0], m4
+ mova [outputq + 16], m5
+ mova [outputq + 32], m3
+ mova [outputq + 48], m0
+ mova [outputq + 64], m10
+ mova [outputq + 80], m9
+ mova [outputq + 96], m7
+ mova [outputq + 112], m2
+
+ RET
+%endif
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
new file mode 100644
index 0000000000..01a52ec8bf
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -0,0 +1,1495 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_avx2.h"
+
+// -----------------------------------------------------------------------------
+// Copy and average
+
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+ (void)bd;
+
+ assert(w % 4 == 0);
+ if (w > 32) { // w = 64
+ do {
+ const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+ const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+ _mm256_storeu_si256((__m256i *)(dst + 32), p2);
+ _mm256_storeu_si256((__m256i *)(dst + 48), p3);
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 16) { // w = 32
+ do {
+ const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 8) { // w = 16
+ __m256i p0, p1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ src += src_stride;
+ p1 = _mm256_loadu_si256((const __m256i *)src);
+ src += src_stride;
+
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w > 4) { // w = 8
+ __m128i p0, p1;
+ do {
+ p0 = _mm_loadu_si128((const __m128i *)src);
+ src += src_stride;
+ p1 = _mm_loadu_si128((const __m128i *)src);
+ src += src_stride;
+
+ _mm_storeu_si128((__m128i *)dst, p0);
+ dst += dst_stride;
+ _mm_storeu_si128((__m128i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else { // w = 4
+ __m128i p0, p1;
+ do {
+ p0 = _mm_loadl_epi64((const __m128i *)src);
+ src += src_stride;
+ p1 = _mm_loadl_epi64((const __m128i *)src);
+ src += src_stride;
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ }
+}
+
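+// Note: _mm256_avg_epu16()/_mm_avg_epu16() compute the per-lane rounding
+// average (a + b + 1) >> 1, so each store below writes the rounded average of
+// the source pixels and the pixels already in dst.
+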
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+ (void)bd;
+
+ assert(w % 4 == 0);
+ if (w > 32) { // w = 64
+ __m256i p0, p1, p2, p3, u0, u1, u2, u3;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+ p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+ src += src_stride;
+ u0 = _mm256_loadu_si256((const __m256i *)dst);
+ u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
+ u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
+ u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
+ _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+ _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
+ _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
+ _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 16) { // w = 32
+ __m256i p0, p1, u0, u1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ src += src_stride;
+ u0 = _mm256_loadu_si256((const __m256i *)dst);
+ u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
+ _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+ _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 8) { // w = 16
+ __m256i p0, p1, u0, u1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
+ src += src_stride << 1;
+ u0 = _mm256_loadu_si256((const __m256i *)dst);
+ u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
+
+ _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+ _mm256_storeu_si256((__m256i *)(dst + dst_stride),
+ _mm256_avg_epu16(p1, u1));
+ dst += dst_stride << 1;
+ h -= 2;
+ } while (h > 0);
+ } else if (w > 4) { // w = 8
+ __m128i p0, p1, u0, u1;
+ do {
+ p0 = _mm_loadu_si128((const __m128i *)src);
+ p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ src += src_stride << 1;
+ u0 = _mm_loadu_si128((const __m128i *)dst);
+ u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
+
+ _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
+ _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
+ dst += dst_stride << 1;
+ h -= 2;
+ } while (h > 0);
+ } else { // w = 4
+ __m128i p0, p1, u0, u1;
+ do {
+ p0 = _mm_loadl_epi64((const __m128i *)src);
+ p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
+ src += src_stride << 1;
+ u0 = _mm_loadl_epi64((const __m128i *)dst);
+ u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
+
+ _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
+ dst += dst_stride << 1;
+ h -= 2;
+ } while (h > 0);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Horizontal and vertical filtering
+
+static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+
+static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13,
+ 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13 };
+
+static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15,
+ 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15 };
+
+static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
+
+#define CONV8_ROUNDING_BITS (7)
+#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
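+
+// For reference, every filtered sum in this file is narrowed with the same
+// rounding shift; in scalar form:
+//   rounded = (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS;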
+
+// -----------------------------------------------------------------------------
+// Horizontal Filtering
+
+static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
+ const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
+ const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
+
+ p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6
+ p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7
+ p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4
+ p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5
+}
+
+// Note:
+// Shared by the 8x2 and 16x1 blocks
+static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *x /*x[8]*/) {
+ __m256i pp[8];
+ pack_pixels(s0, pp);
+ pack_pixels(s1, &pp[4]);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
+ x[4] = x[2];
+ x[5] = x[3];
+ x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
+ x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
+}
+
+static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
+ __m256i pp[8];
+ __m256i s0;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ pack_pixels(&s0, pp);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
+}
+
+static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
+ __m256i *x) {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
+ pack_16_pixels(&s0, &s1, x);
+}
+
+static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_pixels(&s0, &s1, x);
+}
+
+// Note:
+// Shared by horizontal and vertical filtering
+static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p0 = _mm256_set1_epi32(0x03020100);
+ const __m256i p1 = _mm256_set1_epi32(0x07060504);
+ const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
+ const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
+ f[0] = _mm256_shuffle_epi8(hh, p0);
+ f[1] = _mm256_shuffle_epi8(hh, p1);
+ f[2] = _mm256_shuffle_epi8(hh, p2);
+ f[3] = _mm256_shuffle_epi8(hh, p3);
+}
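+
+// After pack_filters(), f[i] holds the tap pair (k[2i], k[2i + 1]) replicated
+// across every 32-bit lane, so a single _mm256_madd_epi16() per pair
+// accumulates s[j]*k[2i] + s[j + 1]*k[2i + 1] for eight outputs at once.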
+
+static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
+ const __m256i *fil /*fil[4]*/,
+ __m256i *y) {
+ __m256i a, a0, a1;
+
+ a0 = _mm256_madd_epi16(fil[0], sig[0]);
+ a1 = _mm256_madd_epi16(fil[3], sig[3]);
+ a = _mm256_add_epi32(a0, a1);
+
+ a0 = _mm256_madd_epi16(fil[1], sig[1]);
+ a1 = _mm256_madd_epi16(fil[2], sig[2]);
+
+ {
+ const __m256i min = _mm256_min_epi32(a0, a1);
+ a = _mm256_add_epi32(a, min);
+ }
+ {
+ const __m256i max = _mm256_max_epi32(a0, a1);
+ a = _mm256_add_epi32(a, max);
+ }
+ {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ a = _mm256_add_epi32(a, rounding);
+ *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
+ }
+}
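+
+// Scalar sketch of filter_8x1_pixels() for one output lane (note that
+// min(a0, a1) + max(a0, a1) is simply a0 + a1):
+//   sum = s[0]*k[0] + s[1]*k[1] + ... + s[7]*k[7];
+//   y = (sum + (1 << (CONV8_ROUNDING_BITS - 1))) >> CONV8_ROUNDING_BITS;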
+
+static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
+ uint16_t *dst) {
+ const __m128i a0 = _mm256_castsi256_si128(*y);
+ const __m128i a1 = _mm256_extractf128_si256(*y, 1);
+ __m128i res = _mm_packus_epi32(a0, a1);
+ res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+ _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static void vpx_highbd_filter_block1d8_h8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap horizontal filtering
+
+static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p = _mm256_set1_epi32(0x09080706);
+ f[0] = _mm256_shuffle_epi8(hh, p);
+}
+
+// Can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels(); the
+// difference is whether s0/s1 hold the first and second rows, or the first 16
+// samples and the same row shifted by 8 samples.
+static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *sig) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
+ __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
+ __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
+ __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ r1 = _mm256_shuffle_epi8(r1, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
+ sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
+}
+
+static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
+ const ptrdiff_t pitch, __m256i *sig) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
+ r0 = _mm256_permutevar8x32_epi32(r0, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
+}
+
+// Shared by the 8x2 and 16x1 2-tap horizontal paths, and by
+// filter_16x2_2t_pixels() for the 2-tap vertical path.
+static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ __m256i x1 = _mm256_madd_epi16(sig[1], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ x1 = _mm256_add_epi32(x1, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+}
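+
+// Scalar sketch of the 2-tap (bilinear) filtering: only the middle taps k[3]
+// and k[4] of the 8-tap kernel are used, so per output pixel x, effectively
+//   dst[x] = clip((s[x]*k[3] + s[x + 1]*k[4] + CONV8_ROUNDING_NUM)
+//                     >> CONV8_ROUNDING_BITS,
+//                 0, (1 << bd) - 1);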
+
+static void vpx_highbd_filter_block1d8_h2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_2t_pixels(src_ptr, signal);
+ filter_8x1_2t_pixels(signal, &ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_2t_pixels(src_ptr, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// Vertical Filtering
+
+static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
+ __m256i s1 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
+ __m256i s2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
+ __m256i s3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
+ __m256i s4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
+ __m256i s5 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
+ __m256i s6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
+
+ s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
+ s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
+ s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
+ s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
+ s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
+
+ sig[0] = _mm256_unpacklo_epi16(s0, s1);
+ sig[4] = _mm256_unpackhi_epi16(s0, s1);
+ sig[1] = _mm256_unpacklo_epi16(s2, s3);
+ sig[5] = _mm256_unpackhi_epi16(s2, s3);
+ sig[2] = _mm256_unpacklo_epi16(s4, s5);
+ sig[6] = _mm256_unpackhi_epi16(s4, s5);
+ sig[8] = s6;
+}
+
+static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ __m256i s0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
+ // base + 8th row
+ __m256i s1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
+ __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
+ __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ sig[3] = _mm256_unpacklo_epi16(s2, s3);
+ sig[7] = _mm256_unpackhi_epi16(s2, s3);
+ sig[8] = s1;
+}
+
+static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_8x1_pixels(sig, f, y0);
+ filter_8x1_pixels(&sig[4], f, y1);
+}
+
+static INLINE void update_pixels(__m256i *sig) {
+ int i;
+ for (i = 0; i < 3; ++i) {
+ sig[i] = sig[i + 1];
+ sig[i + 4] = sig[i + 5];
+ }
+}
+
+static void vpx_highbd_filter_block1d8_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[9], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_8x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_8x9_pixels(src_ptr, src_pitch, signal);
+
+ filter_8x9_pixels(signal, ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
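+
+// The 8-tap vertical pass computes, per column x and output row r (relative
+// to the src_ptr passed in, which the caller has already moved up by
+// num_taps / 2 - 1 rows),
+//   sum = src[(r + 0)*pitch + x]*k[0] + ... + src[(r + 7)*pitch + x]*k[7]
+// rounded and clamped exactly like the horizontal pass; sig[] caches the
+// interleaved rows so that each iteration only loads two new rows.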
+
+static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i u0, u1, u2, u3;
+ // load 0-6 rows
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
+ const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
+ const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
+ const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
+ const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
+
+ u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low
+ u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high
+
+ u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low
+ u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high
+
+ sig[0] = _mm256_unpacklo_epi16(u0, u2);
+ sig[4] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[8] = _mm256_unpacklo_epi16(u1, u3);
+ sig[12] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
+ u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
+ u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
+
+ sig[1] = _mm256_unpacklo_epi16(u0, u2);
+ sig[5] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[9] = _mm256_unpacklo_epi16(u1, u3);
+ sig[13] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
+ u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
+ u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
+
+ sig[2] = _mm256_unpacklo_epi16(u0, u2);
+ sig[6] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[10] = _mm256_unpacklo_epi16(u1, u3);
+ sig[14] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s6;
+}
+
+static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
+ // base + 8th row
+ const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
+
+ __m256i u0, u1, u2, u3;
+ u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
+ u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
+ u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
+
+ sig[3] = _mm256_unpacklo_epi16(u0, u2);
+ sig[7] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[11] = _mm256_unpacklo_epi16(u1, u3);
+ sig[15] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s8;
+}
+
+static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ __m256i res[4];
+ int i;
+ for (i = 0; i < 4; ++i) {
+ filter_8x1_pixels(&sig[i << 2], f, &res[i]);
+ }
+
+ {
+ const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
+ const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
+ *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
+ *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
+ }
+}
+
+static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i p = _mm256_min_epi16(*y0, *mask);
+ _mm256_storeu_si256((__m256i *)dst, p);
+ p = _mm256_min_epi16(*y1, *mask);
+ _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static void update_16x9_pixels(__m256i *sig) {
+ update_pixels(&sig[0]);
+ update_pixels(&sig[8]);
+}
+
+static void vpx_highbd_filter_block1d16_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[17], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_16x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_16x9_pixels(src_ptr, src_pitch, signal);
+ filter_16x9_pixels(signal, ff, &res0, &res1);
+ store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_16x9_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap vertical filtering
+
+static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
+ sig[2] = _mm256_loadu_si256((const __m256i *)src);
+}
+
+static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // load the next row
+ const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ sig[0] = _mm256_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm256_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_16_2t_pixels(sig, f, y0, y1);
+}
+
+static void vpx_highbd_filter_block1d16_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[3], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+ __m256i ff;
+
+ pack_2t_filter(filter, &ff);
+ pack_16x2_init(src_ptr, signal);
+
+ do {
+ pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m128i p = _mm_set1_epi32(0x09080706);
+ f[0] = _mm_shuffle_epi8(h, p);
+}
+
+static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
+ sig[2] = _mm_loadu_si128((const __m128i *)src);
+}
+
+static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
+ __m128i *sig) {
+ // load the next row
+ const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
+ sig[0] = _mm_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
+ __m128i *y0, __m128i *y1) {
+ const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m128i x0 = _mm_madd_epi16(sig[0], *f);
+ __m128i x1 = _mm_madd_epi16(sig[1], *f);
+ x0 = _mm_add_epi32(x0, rounding);
+ x1 = _mm_add_epi32(x1, rounding);
+ *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
+ const __m128i *mask, uint16_t *dst) {
+ __m128i res = _mm_packus_epi32(*y0, *y1);
+ res = _mm_min_epi16(res, *mask);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void vpx_highbd_filter_block1d8_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m128i signal[3], res0, res1;
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ __m128i ff;
+
+ pack_8x1_2t_filter(filter, &ff);
+ pack_8x2_init(src_ptr, signal);
+
+ do {
+ pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+ filter_8_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+// Store helpers that average the filtered result with the pixels already in
+// the destination (used by the _avg variants below).
+
+static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
+ uint16_t *dst) {
+ const __m128i a0 = _mm256_castsi256_si128(*y0);
+ const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
+ __m128i res = _mm_packus_epi32(a0, a1);
+ const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
+ res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+ res = _mm_avg_epu16(res, pix);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
+ const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
+ const __m256i pix =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
+ a = _mm256_min_epi16(a, *mask);
+ a = _mm256_avg_epu16(a, pix);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+ _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
+ a = _mm256_min_epi16(a, *mask);
+ a = _mm256_avg_epu16(a, pix);
+ _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
+ const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
+ __m256i p = _mm256_min_epi16(*y0, *mask);
+ p = _mm256_avg_epu16(p, pix0);
+ _mm256_storeu_si256((__m256i *)dst, p);
+
+ p = _mm256_min_epi16(*y1, *mask);
+ p = _mm256_avg_epu16(p, pix1);
+ _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
+ const __m128i *y1,
+ const __m128i *mask,
+ uint16_t *dst) {
+ __m128i res = _mm_packus_epi32(*y0, *y1);
+ const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
+ res = _mm_min_epi16(res, *mask);
+ res = _mm_avg_epu16(res, pix);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void vpx_highbd_filter_block1d8_h8_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ store_8x1_avg_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h8_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d4_h4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We extract the middle four elements of the kernel into two registers in
+ // the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+ // Calling multiply and add gives us half of the sum. Adding the two halves
+ // gives us the output. Since AVX2 gives us 256-bit registers, we can process
+ // two rows at a time. (A scalar sketch of this path follows the function.)
+
+ __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+ __m256i res_reg;
+ __m256i idx_shift_0 =
+ _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
+ 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
+ __m256i idx_shift_2 =
+ _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
+ 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+ const __m256i reg_round =
+ _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+ int h;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+ kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+ for (h = height; h >= 2; h -= 2) {
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Get the output
+ res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Round the result
+ res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+ // Finally combine to get the final dst
+ res_reg = _mm256_packus_epi32(res_reg, res_reg);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ // Repeat for the last row if needed
+ if (h > 0) {
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Get the output
+ res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Round the result
+ res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+ // Finally combine to get the final dst
+ res_reg = _mm256_packus_epi32(res_reg, res_reg);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
+ _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
+ }
+}
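+
+// Scalar sketch of the 4-tap horizontal path above (only the middle taps
+// k[2]..k[5] are used; src_ptr was moved back by one pixel):
+//   sum = s[x - 1]*k[2] + s[x]*k[3] + s[x + 1]*k[4] + s[x + 2]*k[5];
+//   dst[x] = clip((sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS,
+//                 0, (1 << bd) - 1);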
+
+static void vpx_highbd_filter_block1d8_h4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will extract the middle four elements of the kernel into two registers
+ // in the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+ // Calling multiply and add gives us half of the sum of the first half.
+ // Adding the two halves gives us the first half of the output. Repeating for
+ // the second half gives the whole output. Since AVX2 gives us 256-bit
+ // registers, we can process two rows at a time.
+
+ __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+ __m256i res_reg, res_first, res_last;
+ __m256i idx_shift_0 =
+ _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
+ 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
+ __m256i idx_shift_2 =
+ _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
+ 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+ const __m256i reg_round =
+ _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+ int h;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+ kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+ for (h = height; h >= 2; h -= 2) {
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Result for first half
+ res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Do again to get the second half of dst
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Result for second half
+ res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Round each result
+ res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
+ res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);
+
+ // Finally combine to get the final dst
+ res_reg = _mm256_packus_epi32(res_first, res_last);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
+ mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ // Repeat for the last row if needed
+ if (h > 0) {
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+ &kernel_reg_23, &kernel_reg_45);
+
+ res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+ res_reg = _mm256_packus_epi32(res_reg, res_reg);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, kernel, bd);
+ vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
+ dst_stride, height, kernel, bd);
+}
+
+static void vpx_highbd_filter_block1d8_v8_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[9], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_8x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_8x9_pixels(src_ptr, src_pitch, signal);
+
+ filter_8x9_pixels(signal, ff, &res0, &res1);
+ store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d16_v8_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[17], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_16x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_16x9_pixels(src_ptr, src_pitch, signal);
+ filter_16x9_pixels(signal, ff, &res0, &res1);
+ store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_16x9_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d8_h2_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_2t_pixels(src_ptr, signal);
+ filter_8x1_2t_pixels(signal, &ff, &res0);
+ store_8x1_avg_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h2_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_2t_pixels(src_ptr, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d16_v2_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[3], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+ __m256i ff;
+
+ pack_2t_filter(filter, &ff);
+ pack_16x2_init(src_ptr, signal);
+
+ do {
+ pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d8_v2_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m128i signal[3], res0, res1;
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ __m128i ff;
+
+ pack_8x1_2t_filter(filter, &ff);
+ pack_8x2_init(src_ptr, signal);
+
+ do {
+ pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+ filter_8_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d4_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will load two rows of pixels and rearrange them into the form
+ // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel to get a partial
+ // output. Adding the partial output from the next pair of rows then gives
+ // the final output (see the note following this function).
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source (only the low halves are needed for the
+ // 4-wide rows)
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001, src_reg_1223;
+
+ // Result after multiply and add
+ __m256i res_reg;
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel used
+
+ const __m256i reg_round =
+ _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+ kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
+
+ // Output
+ res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Round the words
+ res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi32(res_reg, res_reg);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+ // Save the result
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001 = src_reg_1223;
+ src_reg_1 = src_reg_3;
+ }
+}
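+
+// The 4-tap vertical path mirrors the horizontal sketch above, applied down a
+// column: each output row is the rounded, clamped sum of the row above it
+// through two rows below it, weighted by k[2]..k[5].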
+
+static void vpx_highbd_filter_block1d8_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will load two rows of pixels and rearrange them into the form
+ // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel to get a partial
+ // output. Adding the partial output from the next pair of rows then gives
+ // the final output.
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel
+
+ // Result after multiply and add
+ __m256i res_reg, res_reg_lo, res_reg_hi;
+
+ const __m256i reg_round =
+ _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+ kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
+ src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
+ src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);
+
+ // Output from first half
+ res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Output from second half
+ res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Round the words
+ res_reg_lo =
+ mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg_hi =
+ mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+ // Save the result
+ mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001_lo = src_reg_1223_lo;
+ src_reg_m1001_hi = src_reg_1223_hi;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_highbd_filter_block1d16_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, kernel, bd);
+ vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
+ dst_stride, height, kernel, bd);
+}
+
+// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+
+// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+
+#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
+#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
+#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
+#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
+
+// Use the [vh]8 version because there is no [vh]4 implementation.
+#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
+ vpx_highbd_filter_block1d16_v8_avg_avx2
+#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
+ vpx_highbd_filter_block1d16_h8_avg_avx2
+#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
+ vpx_highbd_filter_block1d8_v8_avg_avx2
+#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
+ vpx_highbd_filter_block1d8_h8_avg_avx2
+#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
+ vpx_highbd_filter_block1d4_v8_avg_avx2
+#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
+ vpx_highbd_filter_block1d4_h8_avg_avx2
+
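+// The HIGH_FUN_CONV_* macros (from vpx_dsp/x86/convolve.h) expand to the
+// exported vpx_highbd_convolve8_*_avx2 entry points, which dispatch to the
+// block1d kernels above.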
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), , avx2, 0)
+HIGH_FUN_CONV_2D(, avx2, 0)
+
+// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+
+// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+
+#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
+ vpx_highbd_filter_block1d4_h8_avg_sse2
+#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
+ vpx_highbd_filter_block1d4_h2_avg_sse2
+#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
+ vpx_highbd_filter_block1d4_v8_avg_sse2
+#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
+ vpx_highbd_filter_block1d4_v2_avg_sse2
+
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
+HIGH_FUN_CONV_2D(avg_, avx2, 1)
+
+#undef HIGHBD_FUNC
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
new file mode 100644
index 0000000000..f4f7235d13
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
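+// Each "butterfly" below is the fixed-point rotation used by the VP9 inverse
+// DCT: for inputs (a, b) and cosine constants (c0, c1) it computes, roughly,
+//   out0 = ROUND_POWER_OF_TWO(a * c0 - b * c1, DCT_CONST_BITS)
+//   out1 = ROUND_POWER_OF_TWO(a * c1 + b * c0, DCT_CONST_BITS)
+// where cospi_N_64 = round(cos(N * pi / 64) * (1 << DCT_CONST_BITS)) and
+// DCT_CONST_BITS is 14.
+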
+static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
+ __m128i *const out) {
+ // stage 5
+ out[0] = _mm_add_epi32(in[0], in[3]);
+ out[1] = _mm_add_epi32(in[1], in[2]);
+ out[2] = _mm_sub_epi32(in[1], in[2]);
+ out[3] = _mm_sub_epi32(in[0], in[3]);
+ highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
+ out[8] = _mm_add_epi32(in[8], in[11]);
+ out[9] = _mm_add_epi32(in[9], in[10]);
+ out[10] = _mm_sub_epi32(in[9], in[10]);
+ out[11] = _mm_sub_epi32(in[8], in[11]);
+ out[12] = _mm_sub_epi32(in[15], in[12]);
+ out[13] = _mm_sub_epi32(in[14], in[13]);
+ out[14] = _mm_add_epi32(in[14], in[13]);
+ out[15] = _mm_add_epi32(in[15], in[12]);
+}
+
+static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[7]);
+ out[1] = _mm_add_epi32(in[1], in[6]);
+ out[2] = _mm_add_epi32(in[2], in[5]);
+ out[3] = _mm_add_epi32(in[3], in[4]);
+ out[4] = _mm_sub_epi32(in[3], in[4]);
+ out[5] = _mm_sub_epi32(in[2], in[5]);
+ out[6] = _mm_sub_epi32(in[1], in[6]);
+ out[7] = _mm_sub_epi32(in[0], in[7]);
+ out[8] = in[8];
+ out[9] = in[9];
+ highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
+ highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
+ out[14] = in[14];
+ out[15] = in[15];
+}
+
+static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
+ step1[11] = _mm_add_epi32(step2[10], step2[11]);
+ step1[12] = _mm_add_epi32(step2[13], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+ // stage 4
+ highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
+ highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step1[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step1[7] = _mm_add_epi32(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+ __m128i temp1[2], sign[2];
+
+ // stage 2
+ highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
+ step1[11] = _mm_add_epi32(step2[10], step2[11]);
+ step1[12] = _mm_add_epi32(step2[13], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+ // stage 4
+ abs_extend_64bit_sse2(io[0], temp1, sign);
+ step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+ step2[1] = step2[0];
+ highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step1[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step1[7] = _mm_add_epi32(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+ __m128i temp[2], sign[2];
+
+ // stage 2
+ highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10]
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13]
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ // stage 4
+ abs_extend_64bit_sse2(io[0], temp, sign);
+ step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64);
+ step2[1] = step2[0];
+ step2[2] = _mm_setzero_si128();
+ step2[3] = _mm_setzero_si128();
+ highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16], *in;
+
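+  // bd == 8: coefficients are packed to 16 bits and run through the 16-bit
+  // idct16_8col(), 8 columns at a time. Higher bit depths keep full 32-bit
+  // precision and use the 4-column highbd_idct16_4col() path instead.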
+ if (bd == 8) {
+ __m128i l[16], r[16];
+
+ in = l;
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+ idct16_8col(in, in);
+ in = r;
+ input += 128;
+ }
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ idct16_8col(out, out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[4][16];
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+ highbd_idct16_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct16_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16];
+
+ if (bd == 8) {
+ __m128i in[16], temp[16];
+
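+    // The _38 variant assumes nonzero coefficients only in the upper-left
+    // 8x8 block, so just that block is loaded and in[8..15] is zeroed.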
+ highbd_load_pack_transpose_32bit_8x8(input, 16, in);
+ for (i = 8; i < 16; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+ idct16_8col(in, temp);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(temp + i, in);
+ idct16_8col(in, out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[2][16], *in;
+
+ for (i = 0; i < 2; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(input, 16, in);
+ highbd_idct16x16_38_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ highbd_idct16x16_38_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16];
+
+ if (bd == 8) {
+ __m128i in[16], l[16];
+
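+    // The _10 variant assumes nonzero coefficients only in the upper-left
+    // 4x4 block; the 8-wide loads below simply grab a full vector whose
+    // upper lanes are expected to be zero.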
+ in[0] = load_pack_8_32bit(input + 0 * 16);
+ in[1] = load_pack_8_32bit(input + 1 * 16);
+ in[2] = load_pack_8_32bit(input + 2 * 16);
+ in[3] = load_pack_8_32bit(input + 3 * 16);
+
+ idct16x16_10_pass1(in, l);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ idct16x16_10_pass2(l + i, in);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, in[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[2][16], *in;
+
+ for (i = 0; i < 2; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_4x4(input, 16, in);
+ highbd_idct16x16_10_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(&all[0][i], out);
+ highbd_idct16x16_10_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ highbd_idct_1_add_kernel(input, dest, stride, bd, 16);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
new file mode 100644
index 0000000000..7898ee12c8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
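+// Unlike the SSE2 version of this transform, the SSE4.1 butterfly helpers
+// take signed cosine constants directly (e.g. -cospi_8_64 below), which is
+// presumably why none of the negated-intermediate bookkeeping from
+// highbd_idct16x16_add_sse2.c appears here.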
+static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
+ __m128i *const out) {
+ // stage 5
+ out[0] = _mm_add_epi32(in[0], in[3]);
+ out[1] = _mm_add_epi32(in[1], in[2]);
+ out[2] = _mm_sub_epi32(in[1], in[2]);
+ out[3] = _mm_sub_epi32(in[0], in[3]);
+ highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]);
+ out[8] = _mm_add_epi32(in[8], in[11]);
+ out[9] = _mm_add_epi32(in[9], in[10]);
+ out[10] = _mm_sub_epi32(in[9], in[10]);
+ out[11] = _mm_sub_epi32(in[8], in[11]);
+ out[12] = _mm_sub_epi32(in[15], in[12]);
+ out[13] = _mm_sub_epi32(in[14], in[13]);
+ out[14] = _mm_add_epi32(in[14], in[13]);
+ out[15] = _mm_add_epi32(in[15], in[12]);
+}
+
+static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[7]);
+ out[1] = _mm_add_epi32(in[1], in[6]);
+ out[2] = _mm_add_epi32(in[2], in[5]);
+ out[3] = _mm_add_epi32(in[3], in[4]);
+ out[4] = _mm_sub_epi32(in[3], in[4]);
+ out[5] = _mm_sub_epi32(in[2], in[5]);
+ out[6] = _mm_sub_epi32(in[1], in[6]);
+ out[7] = _mm_sub_epi32(in[0], in[7]);
+ out[8] = in[8];
+ out[9] = in[9];
+ highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]);
+ highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]);
+ out[14] = in[14];
+ out[15] = in[15];
+}
+
+void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+ // stage 4
+ highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]);
+ highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+ &step2[9], &step2[14]);
+ highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step1[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step1[7] = _mm_add_epi32(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+ __m128i temp1[2];
+
+ // stage 2
+ highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9],
+ &step2[14]);
+ highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+ // stage 4
+ extend_64bit(io[0], temp1);
+ step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+ step2[1] = step2[0];
+ highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+ &step2[9], &step2[14]);
+ highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step1[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step1[7] = _mm_add_epi32(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
+ __m128i step1[16], step2[16];
+ __m128i temp[2];
+
+ // stage 2
+ highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ // stage 4
+ extend_64bit(io[0], temp);
+ step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+ step2[1] = step2[0];
+ step2[2] = _mm_setzero_si128();
+ step2[3] = _mm_setzero_si128();
+ highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+ &step2[9], &step2[14]);
+ highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ highbd_idct16_4col_stage5(step2, step1);
+ highbd_idct16_4col_stage6(step1, step2);
+ highbd_idct16_4col_stage7(step2, io);
+}
+
+void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd) {
+ int i;
+ __m128i out[16], *in;
+
+ if (bd == 8) {
+ __m128i l[16], r[16];
+
+ in = l;
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+ idct16_8col(in, in);
+ in = r;
+ input += 128;
+ }
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ idct16_8col(out, out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[4][16];
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+ vpx_highbd_idct16_4col_sse4_1(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ vpx_highbd_idct16_4col_sse4_1(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16];
+
+ if (bd == 8) {
+ __m128i in[16], temp[16];
+
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ for (i = 8; i < 16; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+ idct16_8col(in, temp);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(temp + i, in);
+ idct16_8col(in, out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[2][16], *in;
+
+ for (i = 0; i < 2; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(input, 16, in);
+ highbd_idct16x16_38_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ highbd_idct16x16_38_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+ __m128i out[16];
+
+ if (bd == 8) {
+ __m128i in[16], l[16];
+
+ in[0] = load_pack_8_32bit(input + 0 * 16);
+ in[1] = load_pack_8_32bit(input + 1 * 16);
+ in[2] = load_pack_8_32bit(input + 2 * 16);
+ in[3] = load_pack_8_32bit(input + 3 * 16);
+
+ idct16x16_10_pass1(in, l);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ idct16x16_10_pass2(l + i, in);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, in[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[2][16], *in;
+
+ for (i = 0; i < 2; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_4x4(input, 16, in);
+ highbd_idct16x16_10_4col(in);
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(&all[0][i], out);
+ highbd_idct16x16_10_4col(out);
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
new file mode 100644
index 0000000000..c710e89954
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
@@ -0,0 +1,782 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
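+// The two stage helpers below are shared by the 1024-, 135- and
+// 34-coefficient variants later in this file.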
+static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6(
+ __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+ &step2[13], &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[8] = _mm_add_epi32(step2[8], step2[11]);
+ step1[9] = _mm_add_epi32(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi32(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi32(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi32(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[14], step2[13]);
+ step1[14] = _mm_add_epi32(step2[14], step2[13]);
+ step1[15] = _mm_add_epi32(step2[15], step2[12]);
+
+ // stage 6
+ out[8] = step1[8];
+ out[9] = step1[9];
+ highbd_butterfly_sse2(step1[13], step1[10], cospi_16_64, cospi_16_64,
+ &out[10], &out[13]);
+ highbd_butterfly_sse2(step1[12], step1[11], cospi_16_64, cospi_16_64,
+ &out[11], &out[12]);
+ out[14] = step1[14];
+ out[15] = step1[15];
+}
+
+static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7(
+ __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[16] = _mm_add_epi32(step1[16], step1[19]);
+ step2[17] = _mm_add_epi32(step1[17], step1[18]);
+ step2[18] = _mm_sub_epi32(step1[17], step1[18]);
+ step2[19] = _mm_sub_epi32(step1[16], step1[19]);
+ step2[20] = _mm_sub_epi32(step1[20], step1[23]); // step2[20] = -step2[20]
+ step2[21] = _mm_sub_epi32(step1[21], step1[22]); // step2[21] = -step2[21]
+ step2[22] = _mm_add_epi32(step1[21], step1[22]);
+ step2[23] = _mm_add_epi32(step1[20], step1[23]);
+
+ step2[24] = _mm_add_epi32(step1[27], step1[24]);
+ step2[25] = _mm_add_epi32(step1[26], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[26], step1[25]); // step2[26] = -step2[26]
+ step2[27] = _mm_sub_epi32(step1[27], step1[24]); // step2[27] = -step2[27]
+ step2[28] = _mm_sub_epi32(step1[31], step1[28]);
+ step2[29] = _mm_sub_epi32(step1[30], step1[29]);
+ step2[30] = _mm_add_epi32(step1[29], step1[30]);
+ step2[31] = _mm_add_epi32(step1[28], step1[31]);
+
+ // stage 5
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ highbd_butterfly_sse2(step2[29], step2[18], cospi_24_64, cospi_8_64,
+ &step1[18], &step1[29]);
+ highbd_butterfly_sse2(step2[28], step2[19], cospi_24_64, cospi_8_64,
+ &step1[19], &step1[28]);
+ highbd_butterfly_sse2(step2[20], step2[27], cospi_8_64, cospi_24_64,
+ &step1[27], &step1[20]);
+ highbd_butterfly_sse2(step2[21], step2[26], cospi_8_64, cospi_24_64,
+ &step1[26], &step1[21]);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[16] = _mm_add_epi32(step1[16], step1[23]);
+ step2[17] = _mm_add_epi32(step1[17], step1[22]);
+ step2[18] = _mm_add_epi32(step1[18], step1[21]);
+ step2[19] = _mm_add_epi32(step1[19], step1[20]);
+ step2[20] = _mm_sub_epi32(step1[19], step1[20]);
+ step2[21] = _mm_sub_epi32(step1[18], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[17], step1[22]);
+ step2[23] = _mm_sub_epi32(step1[16], step1[23]);
+
+ step2[24] = _mm_sub_epi32(step1[31], step1[24]);
+ step2[25] = _mm_sub_epi32(step1[30], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[29], step1[26]);
+ step2[27] = _mm_sub_epi32(step1[28], step1[27]);
+ step2[28] = _mm_add_epi32(step1[27], step1[28]);
+ step2[29] = _mm_add_epi32(step1[26], step1[29]);
+ step2[30] = _mm_add_epi32(step1[25], step1[30]);
+ step2[31] = _mm_add_epi32(step1[24], step1[31]);
+
+ // stage 7
+ out[16] = step2[16];
+ out[17] = step2[17];
+ out[18] = step2[18];
+ out[19] = step2[19];
+ highbd_butterfly_sse2(step2[27], step2[20], cospi_16_64, cospi_16_64,
+ &out[20], &out[27]);
+ highbd_butterfly_sse2(step2[26], step2[21], cospi_16_64, cospi_16_64,
+ &out[21], &out[26]);
+ highbd_butterfly_sse2(step2[25], step2[22], cospi_16_64, cospi_16_64,
+ &out[22], &out[25]);
+ highbd_butterfly_sse2(step2[24], step2[23], cospi_16_64, cospi_16_64,
+ &out[23], &out[24]);
+ out[28] = step2[28];
+ out[29] = step2[29];
+ out[30] = step2[30];
+ out[31] = step2[31];
+}
+
+// Group the coefficient calculation into smaller functions to prevent stack
+// spillover in 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
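+// (Each helper keeps only a modest number of live __m128i values, which is
+// presumably what keeps xmm register pressure, and with it stack spilling,
+// manageable.)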
+
+// For each 4x32 block __m128i in[32]:
+// inputs: indexes 0, 4, 8, 12, 16, 20, 24, 28
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_butterfly_sse2(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse2(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ highbd_butterfly_sse2(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ highbd_butterfly_sse2(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32]:
+// inputs: indexes 2, 6, 10, 14, 18, 22, 26, 30
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_2(
+ const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_butterfly_sse2(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_butterfly_sse2(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse2(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_butterfly_sse2(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+ step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
+ step1[11] = _mm_add_epi32(step2[10], step2[11]);
+ step1[12] = _mm_add_epi32(step2[13], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_1024_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_1024_4x32_quarter_1(in, temp);
+ highbd_idct32_1024_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32]:
+// inputs: odd indexes
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_butterfly_sse2(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_butterfly_sse2(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17],
+ &step1[30]);
+ highbd_butterfly_sse2(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ highbd_butterfly_sse2(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19],
+ &step1[28]);
+
+ highbd_butterfly_sse2(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_butterfly_sse2(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21],
+ &step1[26]);
+
+ highbd_butterfly_sse2(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ highbd_butterfly_sse2(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi32(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18]
+ step2[19] = _mm_add_epi32(step1[18], step1[19]);
+ step2[20] = _mm_add_epi32(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22]
+ step2[23] = _mm_add_epi32(step1[22], step1[23]);
+
+ step2[24] = _mm_add_epi32(step1[25], step1[24]);
+ step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25]
+ step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+ step2[27] = _mm_add_epi32(step1[27], step1[26]);
+ step2[28] = _mm_add_epi32(step1[29], step1[28]);
+ step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29]
+ step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+ step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64,
+ &step1[29], &step1[18]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64,
+ &step1[25], &step1[22]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_1024_4x32_quarter_1_2(io, temp);
+ highbd_idct32_1024_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+
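+  // Two-pass transform: the row pass runs the 1-D idct over four 8x32 strips
+  // (bd == 8, 16-bit packed) or eight 4x32 strips (higher bit depths, 32-bit
+  // precision); the column pass then transposes and repeats the idct before
+  // writing to dest.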
+ if (bd == 8) {
+ __m128i col[4][32], io[32];
+
+ // rows
+ for (i = 0; i < 4; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]);
+ highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]);
+ highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]);
+ idct32_1024_8x32(io, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ // Transpose 32x8 block to 8x32 block
+ transpose_16bit_8x8(col[0] + i, io);
+ transpose_16bit_8x8(col[1] + i, io + 8);
+ transpose_16bit_8x8(col[2] + i, io + 16);
+ transpose_16bit_8x8(col[3] + i, io + 24);
+ idct32_1024_8x32(io, io);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, io[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 8; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]);
+ highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]);
+ highbd_idct32_1024_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ transpose_32bit_4x4(all[4] + i, out + 16);
+ transpose_32bit_4x4(all[5] + i, out + 20);
+ transpose_32bit_4x4(all[6] + i, out + 24);
+ transpose_32bit_4x4(all[7] + i, out + 28);
+ highbd_idct32_1024_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32]:
+// inputs: indexes 0, 4, 8, 12
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_partial_butterfly_neg_sse2(in[12], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ highbd_partial_butterfly_sse2(in[8], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32]:
+// inputs: indexes 2, 6, 10, 14
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_2(
+ const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_neg_sse2(in[14], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_partial_butterfly_sse2(in[10], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+ step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
+ step1[11] = _mm_add_epi32(step2[10], step2[11]);
+ step1[12] = _mm_add_epi32(step2[13], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_135_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_135_4x32_quarter_1(in, temp);
+ highbd_idct32_135_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32]:
+// inputs: odd indexes
+// 1, 3, 5, 7, 9, 11, 13, 15
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_partial_butterfly_neg_sse2(in[15], cospi_15_64, cospi_17_64,
+ &step1[17], &step1[30]);
+ highbd_partial_butterfly_sse2(in[9], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19],
+ &step1[28]);
+
+ highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_partial_butterfly_neg_sse2(in[11], cospi_11_64, cospi_21_64,
+ &step1[21], &step1[26]);
+
+ highbd_partial_butterfly_sse2(in[13], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi32(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18]
+ step2[19] = _mm_add_epi32(step1[18], step1[19]);
+ step2[20] = _mm_add_epi32(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22]
+ step2[23] = _mm_add_epi32(step1[22], step1[23]);
+
+ step2[24] = _mm_add_epi32(step1[25], step1[24]);
+ step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25]
+ step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+ step2[27] = _mm_add_epi32(step1[27], step1[26]);
+ step2[28] = _mm_add_epi32(step1[29], step1[28]);
+ step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29]
+ step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+ step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64,
+ &step1[29], &step1[18]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64,
+ &step1[25], &step1[22]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_135_4x32_quarter_1_2(io, temp);
+ highbd_idct32_135_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+
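+  // The _135 variant assumes nonzero coefficients only in the upper-left
+  // 16x16 block, so only the first 16 rows are transformed in the row pass
+  // and the remaining inputs are zero (or never read).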
+ if (bd == 8) {
+ __m128i col[2][32], in[32], out[32];
+
+ for (i = 16; i < 32; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+
+ // rows
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]);
+ idct32_1024_8x32(in, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col[0] + i, in);
+ transpose_16bit_8x8(col[1] + i, in + 8);
+ idct32_1024_8x32(in, out);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_idct32_135_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct32_135_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32]:
+// inputs: indexes 0, 4
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+
+ // stage 4
+ highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32]:
+// inputs: indexes 2, 6
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+
+ step1[10] =
+ _mm_sub_epi32(_mm_setzero_si128(), step1[10]); // step1[10] = -step1[10]
+ step1[13] =
+ _mm_sub_epi32(_mm_setzero_si128(), step1[13]); // step1[13] = -step1[13]
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_34_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_34_4x32_quarter_1(in, temp);
+ highbd_idct32_34_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32]:
+// inputs: odd indexes
+// 1, 3, 5, 7
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19],
+ &step1[28]);
+
+ highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = step1[16];
+ step2[17] = step1[16];
+ step2[18] = step1[19];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[21] = step1[20];
+ step2[22] = step1[23];
+ step2[23] = step1[23];
+
+ step2[24] = step1[24];
+ step2[25] = step1[24];
+ step2[26] = step1[27];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[29] = step1[28];
+ step2[30] = step1[31];
+ step2[31] = step1[31];
+
+ // stage 3
+ step2[18] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[18]); // step2[18] = -step2[18]
+ step2[22] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[22]); // step2[22] = -step2[22]
+ step2[25] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[25]); // step2[25] = -step2[25]
+ step2[29] =
+ _mm_sub_epi32(_mm_setzero_si128(), step2[29]); // step2[29] = -step2[29]
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64,
+ &step1[29], &step1[18]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64,
+ &step1[25], &step1[22]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_34_4x32_quarter_1_2(io, temp);
+ highbd_idct32_34_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+
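+  // The _34 variant assumes nonzero coefficients only in the upper-left 8x8
+  // block; for bd == 8 the row pass is therefore a single 8x32 transform.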
+ if (bd == 8) {
+ __m128i col[32], in[32], out[32];
+
+ // rows
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+ idct32_34_8x32_sse2(in, col);
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col + i, in);
+ idct32_34_8x32_sse2(in, out);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_idct32_34_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct32_34_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ highbd_idct_1_add_kernel(input, dest, stride, bd, 32);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c
new file mode 100644
index 0000000000..2d0a53ac0a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c
@@ -0,0 +1,765 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6(
+ __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+ &step2[9], &step2[14]);
+ highbd_butterfly_sse4_1(step1[13], step1[10], -cospi_8_64, cospi_24_64,
+ &step2[10], &step2[13]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[8] = _mm_add_epi32(step2[8], step2[11]);
+ step1[9] = _mm_add_epi32(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi32(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi32(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi32(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi32(step2[14], step2[13]);
+ step1[14] = _mm_add_epi32(step2[14], step2[13]);
+ step1[15] = _mm_add_epi32(step2[15], step2[12]);
+
+ // stage 6
+ out[8] = step1[8];
+ out[9] = step1[9];
+ highbd_butterfly_sse4_1(step1[13], step1[10], cospi_16_64, cospi_16_64,
+ &out[10], &out[13]);
+ highbd_butterfly_sse4_1(step1[12], step1[11], cospi_16_64, cospi_16_64,
+ &out[11], &out[12]);
+ out[14] = step1[14];
+ out[15] = step1[15];
+}
+
+static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7(
+ __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[16] = _mm_add_epi32(step1[16], step1[19]);
+ step2[17] = _mm_add_epi32(step1[17], step1[18]);
+ step2[18] = _mm_sub_epi32(step1[17], step1[18]);
+ step2[19] = _mm_sub_epi32(step1[16], step1[19]);
+ step2[20] = _mm_sub_epi32(step1[23], step1[20]);
+ step2[21] = _mm_sub_epi32(step1[22], step1[21]);
+ step2[22] = _mm_add_epi32(step1[22], step1[21]);
+ step2[23] = _mm_add_epi32(step1[23], step1[20]);
+
+ step2[24] = _mm_add_epi32(step1[24], step1[27]);
+ step2[25] = _mm_add_epi32(step1[25], step1[26]);
+ step2[26] = _mm_sub_epi32(step1[25], step1[26]);
+ step2[27] = _mm_sub_epi32(step1[24], step1[27]);
+ step2[28] = _mm_sub_epi32(step1[31], step1[28]);
+ step2[29] = _mm_sub_epi32(step1[30], step1[29]);
+ step2[30] = _mm_add_epi32(step1[29], step1[30]);
+ step2[31] = _mm_add_epi32(step1[28], step1[31]);
+
+ // stage 5
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ highbd_butterfly_sse4_1(step2[29], step2[18], cospi_24_64, cospi_8_64,
+ &step1[18], &step1[29]);
+ highbd_butterfly_sse4_1(step2[28], step2[19], cospi_24_64, cospi_8_64,
+ &step1[19], &step1[28]);
+ highbd_butterfly_sse4_1(step2[27], step2[20], -cospi_8_64, cospi_24_64,
+ &step1[20], &step1[27]);
+ highbd_butterfly_sse4_1(step2[26], step2[21], -cospi_8_64, cospi_24_64,
+ &step1[21], &step1[26]);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[16] = _mm_add_epi32(step1[16], step1[23]);
+ step2[17] = _mm_add_epi32(step1[17], step1[22]);
+ step2[18] = _mm_add_epi32(step1[18], step1[21]);
+ step2[19] = _mm_add_epi32(step1[19], step1[20]);
+ step2[20] = _mm_sub_epi32(step1[19], step1[20]);
+ step2[21] = _mm_sub_epi32(step1[18], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[17], step1[22]);
+ step2[23] = _mm_sub_epi32(step1[16], step1[23]);
+
+ step2[24] = _mm_sub_epi32(step1[31], step1[24]);
+ step2[25] = _mm_sub_epi32(step1[30], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[29], step1[26]);
+ step2[27] = _mm_sub_epi32(step1[28], step1[27]);
+ step2[28] = _mm_add_epi32(step1[27], step1[28]);
+ step2[29] = _mm_add_epi32(step1[26], step1[29]);
+ step2[30] = _mm_add_epi32(step1[25], step1[30]);
+ step2[31] = _mm_add_epi32(step1[24], step1[31]);
+
+ // stage 7
+ out[16] = step2[16];
+ out[17] = step2[17];
+ out[18] = step2[18];
+ out[19] = step2[19];
+ highbd_butterfly_sse4_1(step2[27], step2[20], cospi_16_64, cospi_16_64,
+ &out[20], &out[27]);
+ highbd_butterfly_sse4_1(step2[26], step2[21], cospi_16_64, cospi_16_64,
+ &out[21], &out[26]);
+ highbd_butterfly_sse4_1(step2[25], step2[22], cospi_16_64, cospi_16_64,
+ &out[22], &out[25]);
+ highbd_butterfly_sse4_1(step2[24], step2[23], cospi_16_64, cospi_16_64,
+ &out[23], &out[24]);
+ out[28] = step2[28];
+ out[29] = step2[29];
+ out[30] = step2[30];
+ out[31] = step2[31];
+}
+
+// Group the coefficient calculation into smaller functions to prevent stack
+// spillover in 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+
+// For each 4x32 block __m128i in[32]:
+// inputs: indexes 0, 4, 8, 12, 16, 20, 24, 28
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_butterfly_sse4_1(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse4_1(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ highbd_butterfly_sse4_1(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ highbd_butterfly_sse4_1(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
+ &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32]:
+// inputs: indexes 2, 6, 10, 14, 18, 22, 26, 30
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_2(
+ const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_butterfly_sse4_1(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_butterfly_sse4_1(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9],
+ &step2[14]);
+ highbd_butterfly_sse4_1(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_butterfly_sse4_1(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_1024_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_1024_4x32_quarter_1(in, temp);
+ highbd_idct32_1024_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32]:
+// inputs: odd indexes
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_butterfly_sse4_1(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_butterfly_sse4_1(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17],
+ &step1[30]);
+ highbd_butterfly_sse4_1(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ highbd_butterfly_sse4_1(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19],
+ &step1[28]);
+
+ highbd_butterfly_sse4_1(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_butterfly_sse4_1(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21],
+ &step1[26]);
+
+ highbd_butterfly_sse4_1(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ highbd_butterfly_sse4_1(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi32(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi32(step1[19], step1[18]);
+ step2[19] = _mm_add_epi32(step1[19], step1[18]);
+ step2[20] = _mm_add_epi32(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[23], step1[22]);
+ step2[23] = _mm_add_epi32(step1[23], step1[22]);
+
+ step2[24] = _mm_add_epi32(step1[24], step1[25]);
+ step2[25] = _mm_sub_epi32(step1[24], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+ step2[27] = _mm_add_epi32(step1[27], step1[26]);
+ step2[28] = _mm_add_epi32(step1[28], step1[29]);
+ step2[29] = _mm_sub_epi32(step1[28], step1[29]);
+ step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+ step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
+ &step1[18], &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
+ &step1[22], &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_1024_4x32_quarter_1_2(io, temp);
+ highbd_idct32_1024_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_1024_add_sse4_1(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[4][32], io[32];
+
+ // rows
+ for (i = 0; i < 4; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]);
+ highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]);
+ highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]);
+ idct32_1024_8x32(io, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ // Transpose 32x8 block to 8x32 block
+ transpose_16bit_8x8(col[0] + i, io);
+ transpose_16bit_8x8(col[1] + i, io + 8);
+ transpose_16bit_8x8(col[2] + i, io + 16);
+ transpose_16bit_8x8(col[3] + i, io + 24);
+ idct32_1024_8x32(io, io);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, io[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 8; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]);
+ highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]);
+ highbd_idct32_1024_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ transpose_32bit_4x4(all[4] + i, out + 16);
+ transpose_32bit_4x4(all[5] + i, out + 20);
+ transpose_32bit_4x4(all[6] + i, out + 24);
+ transpose_32bit_4x4(all[7] + i, out + 28);
+ highbd_idct32_1024_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32]:
+// inputs: indexes 0, 4, 8, 12
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_partial_butterfly_sse4_1(in[12], -cospi_20_64, cospi_12_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ highbd_partial_butterfly_sse4_1(in[8], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
+ &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index 2, 6, 10, 14
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_2(
+ const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_sse4_1(in[14], -cospi_18_64, cospi_14_64, &step2[9],
+ &step2[14]);
+ highbd_partial_butterfly_sse4_1(in[10], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi32(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+ step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+ step1[15] = _mm_add_epi32(step2[15], step2[14]);
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_135_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_135_4x32_quarter_1(in, temp);
+ highbd_idct32_135_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index:
+// 1, 3, 5, 7, 9, 11, 13, 15
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_partial_butterfly_sse4_1(in[15], -cospi_17_64, cospi_15_64, &step1[17],
+ &step1[30]);
+ highbd_partial_butterfly_sse4_1(in[9], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+ &step1[28]);
+
+ highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_partial_butterfly_sse4_1(in[11], -cospi_21_64, cospi_11_64, &step1[21],
+ &step1[26]);
+
+ highbd_partial_butterfly_sse4_1(in[13], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi32(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi32(step1[19], step1[18]);
+ step2[19] = _mm_add_epi32(step1[19], step1[18]);
+ step2[20] = _mm_add_epi32(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi32(step1[23], step1[22]);
+ step2[23] = _mm_add_epi32(step1[23], step1[22]);
+
+ step2[24] = _mm_add_epi32(step1[24], step1[25]);
+ step2[25] = _mm_sub_epi32(step1[24], step1[25]);
+ step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+ step2[27] = _mm_add_epi32(step1[27], step1[26]);
+ step2[28] = _mm_add_epi32(step1[28], step1[29]);
+ step2[29] = _mm_sub_epi32(step1[28], step1[29]);
+ step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+ step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
+ &step1[18], &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
+ &step1[22], &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_135_4x32_quarter_1_2(io, temp);
+ highbd_idct32_135_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_135_add_sse4_1(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[2][32], in[32], out[32];
+
+ // rows
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]);
+ idct32_135_8x32_ssse3(in, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col[0] + i, in);
+ transpose_16bit_8x8(col[1] + i, in + 8);
+ idct32_135_8x32_ssse3(in, out);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_idct32_135_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct32_135_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32],
+// Input with index 0, 4
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+
+ // stage 4
+ highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1],
+ &step2[0]);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
+ &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi32(step1[0], step1[7]);
+ out[1] = _mm_add_epi32(step1[1], step1[6]);
+ out[2] = _mm_add_epi32(step1[2], step1[5]);
+ out[3] = _mm_add_epi32(step1[3], step1[4]);
+ out[4] = _mm_sub_epi32(step1[3], step1[4]);
+ out[5] = _mm_sub_epi32(step1[2], step1[5]);
+ out[6] = _mm_sub_epi32(step1[1], step1[6]);
+ out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index 2, 6
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[16]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 2
+ highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+
+ highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_34_4x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ highbd_idct32_34_4x32_quarter_1(in, temp);
+ highbd_idct32_34_4x32_quarter_2(in, temp);
+ // stage 7
+ highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index:
+// 1, 3, 5, 7
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+ &step1[28]);
+
+ highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = step1[16];
+ step2[17] = step1[16];
+ step2[18] = step1[19];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[21] = step1[20];
+ step2[22] = step1[23];
+ step2[23] = step1[23];
+
+ step2[24] = step1[24];
+ step2[25] = step1[24];
+ step2[26] = step1[27];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[29] = step1[28];
+ step2[30] = step1[31];
+ step2[31] = step1[31];
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
+ &step1[17], &step1[30]);
+ highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
+ &step1[18], &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
+ &step1[21], &step1[26]);
+ highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
+ &step1[22], &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) {
+ __m128i temp[32];
+
+ highbd_idct32_34_4x32_quarter_1_2(io, temp);
+ highbd_idct32_34_4x32_quarter_3_4(io, temp);
+ // final stage
+ highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_34_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i, j;
+
+ if (bd == 8) {
+ __m128i col[32], in[32], out[32];
+
+ // rows
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+ idct32_34_8x32_ssse3(in, col);
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col + i, in);
+ idct32_34_8x32_ssse3(in, out);
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[8][32], out[32], *in;
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+ highbd_idct32_34_4x32(in);
+ input += 4 * 32;
+ }
+
+ for (i = 0; i < 32; i += 4) {
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ highbd_idct32_34_4x32(out);
+
+ for (j = 0; j < 32; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
new file mode 100644
index 0000000000..b9c8884f99
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0,
+ const __m128i in1) {
+ const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 1
+ const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 2, 3
+ const __m128i t2 = _mm_unpacklo_epi64(t0, t1); // 0, 1, 2, 3
+ return dct_const_round_shift_sse2(t2);
+}
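+
+// dct_const_round_shift_sse2() applies the usual libvpx rounding shift: the
+// cospi_*_64 constants carry DCT_CONST_BITS == 14 fractional bits, and each
+// product is rounded back down with ROUND_POWER_OF_TWO. A scalar sketch of
+// the rounding step (dct_const_round_shift_ref is an illustrative name only):
+static INLINE int32_t dct_const_round_shift_ref(int64_t input) {
+  return (int32_t)((input + (1 << 13)) >> 14);  // round to nearest, drop 14 bits
+}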
+
+static INLINE void highbd_idct4_small_sse2(__m128i *const io) {
+ const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0);
+ const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0);
+ const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0);
+ __m128i temp1[4], temp2[4], step[4];
+
+ transpose_32bit_4x4(io, io);
+
+  // Note: SSE2 has no 32-bit signed multiply SIMD instruction, so
+  // _mm_mul_epu32() is used instead; it only guarantees that the lower 32
+  // bits of each (signed) product are meaningful, which is enough in this
+  // function (see the scalar sketch following it).
+
+ // stage 1
+ temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
+ temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
+ temp1[1] = _mm_srli_si128(temp1[0], 4); // 1, 3
+ temp2[1] = _mm_srli_si128(temp2[0], 4); // 1, 3
+ temp1[0] = _mm_mul_epu32(temp1[0], cospi_p16_p16); // ([0] + [2])*cospi_16_64
+ temp1[1] = _mm_mul_epu32(temp1[1], cospi_p16_p16); // ([0] + [2])*cospi_16_64
+ temp2[0] = _mm_mul_epu32(temp2[0], cospi_p16_p16); // ([0] - [2])*cospi_16_64
+ temp2[1] = _mm_mul_epu32(temp2[1], cospi_p16_p16); // ([0] - [2])*cospi_16_64
+ step[0] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]);
+ step[1] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]);
+
+ temp1[3] = _mm_srli_si128(io[1], 4);
+ temp2[3] = _mm_srli_si128(io[3], 4);
+ temp1[0] = _mm_mul_epu32(io[1], cospi_p24_p24); // input[1] * cospi_24_64
+ temp1[1] = _mm_mul_epu32(temp1[3], cospi_p24_p24); // input[1] * cospi_24_64
+ temp2[0] = _mm_mul_epu32(io[1], cospi_p08_p08); // input[1] * cospi_8_64
+ temp2[1] = _mm_mul_epu32(temp1[3], cospi_p08_p08); // input[1] * cospi_8_64
+ temp1[2] = _mm_mul_epu32(io[3], cospi_p08_p08); // input[3] * cospi_8_64
+ temp1[3] = _mm_mul_epu32(temp2[3], cospi_p08_p08); // input[3] * cospi_8_64
+ temp2[2] = _mm_mul_epu32(io[3], cospi_p24_p24); // input[3] * cospi_24_64
+ temp2[3] = _mm_mul_epu32(temp2[3], cospi_p24_p24); // input[3] * cospi_24_64
+ temp1[0] = _mm_sub_epi64(temp1[0], temp1[2]); // [1]*cospi_24 - [3]*cospi_8
+ temp1[1] = _mm_sub_epi64(temp1[1], temp1[3]); // [1]*cospi_24 - [3]*cospi_8
+ temp2[0] = _mm_add_epi64(temp2[0], temp2[2]); // [1]*cospi_8 + [3]*cospi_24
+ temp2[1] = _mm_add_epi64(temp2[1], temp2[3]); // [1]*cospi_8 + [3]*cospi_24
+ step[2] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]);
+ step[3] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]);
+
+ // stage 2
+ io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
+ io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
+ io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2]
+ io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
+}
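+
+// Why _mm_mul_epu32() is acceptable above: in two's complement the low 32
+// bits of a 32x32-bit product are the same whether the operands are treated
+// as signed or unsigned, and (as the note in the function says) only those
+// low bits are consumed on this path. A scalar illustration (mul_low32_ref is
+// an illustrative name only):
+static INLINE int32_t mul_low32_ref(int32_t a, int32_t b) {
+  const uint64_t unsigned_prod = (uint64_t)(uint32_t)a * (uint32_t)b;
+  // Matches the low 32 bits of the signed product (int64_t)a * b.
+  return (int32_t)(uint32_t)unsigned_prod;
+}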
+
+static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
+ __m128i step[4];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]);
+ highbd_butterfly_sse2(io[1], io[3], cospi_24_64, cospi_8_64, &step[2],
+ &step[3]);
+
+ // stage 2
+ io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
+ io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
+ io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2]
+ io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
+}
+
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int16_t max = 0, min = 0;
+ __m128i io[4], io_short[2];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 8));
+ io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+ io_short[0] = _mm_packs_epi32(io[0], io[1]);
+ io_short[1] = _mm_packs_epi32(io[2], io[3]);
+
+ if (bd != 8) {
+ __m128i max_input, min_input;
+
+ max_input = _mm_max_epi16(io_short[0], io_short[1]);
+ min_input = _mm_min_epi16(io_short[0], io_short[1]);
+ max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 8));
+ min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 8));
+ max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 4));
+ min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 4));
+ max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 2));
+ min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 2));
+ max = (int16_t)_mm_extract_epi16(max_input, 0);
+ min = (int16_t)_mm_extract_epi16(min_input, 0);
+ }
+
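+  // Use the 16-bit idct4_sse2() path (the same kernel as for 8-bit content)
+  // when bd == 8 or when every coefficient fits in 13 signed bits
+  // ([-4096, 4095]), so that its intermediate values stay in int16 range;
+  // otherwise fall back to the 32-bit versions, picking the cheaper one while
+  // the coefficients still fit in int16.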
+ if (bd == 8 || (max < 4096 && min >= -4096)) {
+ idct4_sse2(io_short);
+ idct4_sse2(io_short);
+ io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+ io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+ io[0] = _mm_srai_epi16(io_short[0], 4);
+ io[1] = _mm_srai_epi16(io_short[1], 4);
+ } else {
+ if (max < 32767 && min > -32768) {
+ highbd_idct4_small_sse2(io);
+ highbd_idct4_small_sse2(io);
+ } else {
+ highbd_idct4_large_sse2(io);
+ highbd_idct4_large_sse2(io);
+ }
+ io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+ io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
+ }
+
+ recon_and_store_4x4(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int a1, i;
+ tran_low_t out;
+ __m128i dc, d;
+
+ out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+ dc = _mm_set1_epi16(a1);
+
+ for (i = 0; i < 4; ++i) {
+ d = _mm_loadl_epi64((const __m128i *)dest);
+ d = add_clamp(d, dc, bd);
+ _mm_storel_epi64((__m128i *)dest, d);
+ dest += stride;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
new file mode 100644
index 0000000000..fe74d272ad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ __m128i io[4];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 8));
+ io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+ if (bd == 8) {
+ __m128i io_short[2];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[1]);
+ io_short[1] = _mm_packs_epi32(io[2], io[3]);
+ idct4_sse2(io_short);
+ idct4_sse2(io_short);
+ io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+ io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+ io[0] = _mm_srai_epi16(io_short[0], 4);
+ io[1] = _mm_srai_epi16(io_short[1], 4);
+ } else {
+ highbd_idct4_sse4_1(io);
+ highbd_idct4_sse4_1(io);
+ io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+ io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
+ }
+
+ recon_and_store_4x4(io, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
new file mode 100644
index 0000000000..bb7a510e15
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+static void highbd_idct8x8_half1d(__m128i *const io) {
+ __m128i step1[8], step2[8];
+
+ transpose_32bit_4x4x2(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[2] = io[4];
+ step1[1] = io[2];
+ step1[3] = io[6];
+ highbd_butterfly_sse2(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse2(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 2
+ highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]);
+ highbd_butterfly_sse2(step1[1], step1[3], cospi_24_64, cospi_8_64, &step2[2],
+ &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
+ step1[7] = step2[7];
+
+ // stage 4
+ highbd_idct8_stage4(step1, io);
+}
+
+static void highbd_idct8x8_12_half1d(__m128i *const io) {
+ __m128i temp1[4], sign[2], step1[8], step2[8];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[1] = io[2];
+ abs_extend_64bit_sse2(io[1], temp1, sign);
+ step1[4] = multiplication_round_shift_sse2(temp1, sign, cospi_28_64);
+ step1[7] = multiplication_round_shift_sse2(temp1, sign, cospi_4_64);
+ abs_extend_64bit_sse2(io[3], temp1, sign);
+ step1[5] = multiplication_neg_round_shift_sse2(temp1, sign, cospi_20_64);
+ step1[6] = multiplication_round_shift_sse2(temp1, sign, cospi_12_64);
+
+ // stage 2
+ abs_extend_64bit_sse2(step1[0], temp1, sign);
+ step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+ abs_extend_64bit_sse2(step1[1], temp1, sign);
+ step2[2] = multiplication_round_shift_sse2(temp1, sign, cospi_24_64);
+ step2[3] = multiplication_round_shift_sse2(temp1, sign, cospi_8_64);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
+ step1[7] = step2[7];
+
+ // stage 4
+ highbd_idct8_stage4(step1, io);
+}
+
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+ io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[4]);
+ io_short[1] = _mm_packs_epi32(io[1], io[5]);
+ io_short[2] = _mm_packs_epi32(io[2], io[6]);
+ io_short[3] = _mm_packs_epi32(io[3], io[7]);
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ io_short[4] = _mm_packs_epi32(io[8], io[12]);
+ io_short[5] = _mm_packs_epi32(io[9], io[13]);
+ io_short[6] = _mm_packs_epi32(io[10], io[14]);
+ io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+ vpx_idct8_sse2(io_short);
+ vpx_idct8_sse2(io_short);
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ highbd_idct8x8_half1d(io);
+
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ highbd_idct8x8_half1d(&io[8]);
+
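+    // Swap the off-diagonal 4x4 quadrants (io[4..7] <-> io[8..11]) between
+    // the two 1-D passes; combined with the 4x4 transposes done at the start
+    // of highbd_idct8x8_half1d(), this amounts to the 8x8 transpose needed
+    // before the column pass.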
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ io[4] = io[8];
+ io[5] = io[9];
+ io[6] = io[10];
+ io[7] = io[11];
+ highbd_idct8x8_half1d(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_idct8x8_half1d(&io[8]);
+
+ highbd_idct8x8_final_round(io);
+ }
+
+ recon_and_store_8x8(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], zero);
+ io_short[1] = _mm_packs_epi32(io[1], zero);
+ io_short[2] = _mm_packs_epi32(io[2], zero);
+ io_short[3] = _mm_packs_epi32(io[3], zero);
+
+ idct8x8_12_add_kernel_sse2(io_short);
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ highbd_idct8x8_12_half1d(io);
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ highbd_idct8x8_12_half1d(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_idct8x8_12_half1d(&io[8]);
+
+ highbd_idct8x8_final_round(io);
+ }
+
+ recon_and_store_8x8(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ highbd_idct_1_add_kernel(input, dest, stride, bd, 8);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
new file mode 100644
index 0000000000..8b2e3d2415
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io) {
+ __m128i step1[8], step2[8];
+
+ transpose_32bit_4x4x2(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[2] = io[4];
+ step1[1] = io[2];
+ step1[3] = io[6];
+ highbd_butterfly_sse4_1(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4],
+ &step1[7]);
+ highbd_butterfly_sse4_1(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5],
+ &step1[6]);
+
+ // stage 2
+ highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]);
+ highbd_butterfly_sse4_1(step1[1], step1[3], cospi_24_64, cospi_8_64,
+ &step2[2], &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
+ step1[7] = step2[7];
+
+ // stage 4
+ highbd_idct8_stage4(step1, io);
+}
+
+static void highbd_idct8x8_12_half1d(__m128i *const io) {
+ __m128i temp1[2], step1[8], step2[8];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[1] = io[2];
+ extend_64bit(io[1], temp1);
+ step1[4] = multiplication_round_shift_sse4_1(temp1, cospi_28_64);
+ step1[7] = multiplication_round_shift_sse4_1(temp1, cospi_4_64);
+ extend_64bit(io[3], temp1);
+ step1[5] = multiplication_round_shift_sse4_1(temp1, -cospi_20_64);
+ step1[6] = multiplication_round_shift_sse4_1(temp1, cospi_12_64);
+
+ // stage 2
+ extend_64bit(step1[0], temp1);
+ step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+ extend_64bit(step1[1], temp1);
+ step2[2] = multiplication_round_shift_sse4_1(temp1, cospi_24_64);
+ step2[3] = multiplication_round_shift_sse4_1(temp1, cospi_8_64);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
+ step1[7] = step2[7];
+
+ // stage 4
+ highbd_idct8_stage4(step1, io);
+}
+
+void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+ io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[4]);
+ io_short[1] = _mm_packs_epi32(io[1], io[5]);
+ io_short[2] = _mm_packs_epi32(io[2], io[6]);
+ io_short[3] = _mm_packs_epi32(io[3], io[7]);
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ io_short[4] = _mm_packs_epi32(io[8], io[12]);
+ io_short[5] = _mm_packs_epi32(io[9], io[13]);
+ io_short[6] = _mm_packs_epi32(io[10], io[14]);
+ io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+ vpx_idct8_sse2(io_short);
+ vpx_idct8_sse2(io_short);
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
+
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ io[4] = io[8];
+ io[5] = io[9];
+ io[6] = io[10];
+ io[7] = io[11];
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+
+ highbd_idct8x8_final_round(io);
+ }
+
+ recon_and_store_8x8(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], zero);
+ io_short[1] = _mm_packs_epi32(io[1], zero);
+ io_short[2] = _mm_packs_epi32(io[2], zero);
+ io_short[3] = _mm_packs_epi32(io[3], zero);
+
+ idct8x8_12_add_kernel_ssse3(io_short);
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ highbd_idct8x8_12_half1d(io);
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ highbd_idct8x8_12_half1d(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_idct8x8_12_half1d(&io[8]);
+
+ highbd_idct8x8_final_round(io);
+ }
+
+ recon_and_store_8x8(io, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
new file mode 100644
index 0000000000..43634aea3a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
@@ -0,0 +1,534 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
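+
+// In scalar terms the H predictor just copies left[r] across row r; above and
+// bd are unused. A generic reference for an N x N block (h_predictor_ref is
+// an illustrative name, not part of this file):
+static INLINE void h_predictor_ref(uint16_t *dst, ptrdiff_t stride, int bs,
+                                   const uint16_t *left) {
+  int r, c;
+  for (r = 0; r < bs; ++r, dst += stride) {
+    for (c = 0; c < bs; ++c) dst[c] = left[r];
+  }
+}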
+
+void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
+}
+
+static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpacklo_epi64(*row, *row);
+ _mm_store_si128((__m128i *)*dst, val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ *dst += stride;
+}
+
+static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpackhi_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ *dst += stride;
+}
+
+void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ h_store_16_unpacklo(&dst, stride, &row0);
+ h_store_16_unpacklo(&dst, stride, &row1);
+ h_store_16_unpacklo(&dst, stride, &row2);
+ h_store_16_unpacklo(&dst, stride, &row3);
+ h_store_16_unpackhi(&dst, stride, &row4);
+ h_store_16_unpackhi(&dst, stride, &row5);
+ h_store_16_unpackhi(&dst, stride, &row6);
+ h_store_16_unpackhi(&dst, stride, &row7);
+ }
+}
+
+static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpacklo_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ _mm_store_si128((__m128i *)(*dst + 16), val);
+ _mm_store_si128((__m128i *)(*dst + 24), val);
+ *dst += stride;
+}
+
+static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpackhi_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ _mm_store_si128((__m128i *)(*dst + 16), val);
+ _mm_store_si128((__m128i *)(*dst + 24), val);
+ *dst += stride;
+}
+
+void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ h_store_32_unpacklo(&dst, stride, &row0);
+ h_store_32_unpacklo(&dst, stride, &row1);
+ h_store_32_unpacklo(&dst, stride, &row2);
+ h_store_32_unpacklo(&dst, stride, &row3);
+ h_store_32_unpackhi(&dst, stride, &row4);
+ h_store_32_unpackhi(&dst, stride, &row5);
+ h_store_32_unpackhi(&dst, stride, &row6);
+ h_store_32_unpackhi(&dst, stride, &row7);
+ }
+}
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+static INLINE __m128i dc_sum_4(const uint16_t *ref) {
+ const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
+ const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+ const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+ return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
+ int i;
+ for (i = 0; i < 4; ++i, dst += stride) {
+ _mm_storel_epi64((__m128i *)dst, dc_dup);
+ }
+}
+
+void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)above;
+ (void)bd;
+ dc_store_4x4(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)left;
+ (void)bd;
+ dc_store_4x4(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_4x4(dst, stride, &dc_dup);
+}
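+
+// In scalar terms: dc_left averages the four left pixels, dc_top the four
+// above pixels, and dc_128 uses the mid-level 1 << (bd - 1); the averages
+// round via "+ 2" then ">> 2". A scalar reference for the dc_left case
+// (dc_left_4x4_ref is an illustrative name only):
+static INLINE void dc_left_4x4_ref(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *left) {
+  const uint16_t dc =
+      (uint16_t)((left[0] + left[1] + left[2] + left[3] + 2) >> 2);
+  int r, c;
+  for (r = 0; r < 4; ++r, dst += stride) {
+    for (c = 0; c < 4; ++c) dst[c] = dc;
+  }
+}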
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+static INLINE __m128i dc_sum_8(const uint16_t *ref) {
+ const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
+ const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
+ const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+ const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+
+ return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < 8; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ }
+}
+
+void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)above;
+ (void)bd;
+ dc_store_8x8(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)left;
+ (void)bd;
+ dc_store_8x8(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_8x8(dst, stride, &dc_dup);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+static INLINE __m128i dc_sum_16(const uint16_t *ref) {
+ const __m128i sum_lo = dc_sum_8(ref);
+ const __m128i sum_hi = dc_sum_8(ref + 8);
+ return _mm_add_epi16(sum_lo, sum_hi);
+}
+
+static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < 16; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+ }
+}
+
+void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)above;
+ (void)bd;
+ dc_store_16x16(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16x16(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_16x16(dst, stride, &dc_dup);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+static INLINE __m128i dc_sum_32(const uint16_t *ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sum_a = dc_sum_16(ref);
+ const __m128i sum_b = dc_sum_16(ref + 16);
+  // At 12-bit bd the 16-bit partial sums can overflow (32 * 4095 = 131040 >
+  // 65535), so widen to 32 bits before adding the final total.
+ return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
+ _mm_unpacklo_epi16(sum_b, zero));
+}
+
+static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < 32; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+ _mm_store_si128((__m128i *)(dst + 16), dc_dup);
+ _mm_store_si128((__m128i *)(dst + 24), dc_dup);
+ }
+}
+
+void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(left);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)above;
+ (void)bd;
+ dc_store_32x32(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(above);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)left;
+ (void)bd;
+ dc_store_32x32(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_32x32(dst, stride, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+/*
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+*/
+static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
+ const __m128i *z) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a = _mm_avg_epu16(*x, *z);
+ const __m128i b =
+ _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
+ return _mm_avg_epu16(b, *y);
+}
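+
+// The identity used by avg3_epu16(), spelled out: _mm_avg_epu16(x, z) is
+// (x + z + 1) >> 1, and subtracting (x ^ z) & 1 turns that into the truncated
+// (x + z) >> 1; averaging the result with y then gives exactly
+// (x + 2*y + z + 2) >> 2. A scalar form (avg3_u16_ref is illustrative only):
+static INLINE uint16_t avg3_u16_ref(uint16_t x, uint16_t y, uint16_t z) {
+  const uint32_t halved = ((uint32_t)x + z) >> 1;  // avg(x, z) - ((x ^ z) & 1)
+  const uint32_t result = (halved + y + 1) >> 1;   // avg(halved, y)
+  // Equals ((uint32_t)x + 2 * y + z + 2) >> 2 for all 16-bit inputs.
+  return (uint16_t)result;
+}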
+
+void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
+ const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
+ const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
+ const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
+ const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
+ const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4);
+ const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0);
+ const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00);
+ const __m128i row0 = _mm_srli_si128(avg2, 6);
+ const __m128i row1 = _mm_srli_si128(avg3, 4);
+ const __m128i row2 = _mm_srli_si128(avg2, 4);
+ const __m128i row3 = _mm_srli_si128(avg3, 2);
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+
+ dst -= stride;
+ dst[0] = _mm_extract_epi16(avg3, 1);
+ dst[stride] = _mm_extract_epi16(avg3, 0);
+}
+
+void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
+ const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
+ const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
+ const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
+ const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
+ const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0);
+ const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC);
+ const __m128i row0 = _mm_srli_si128(avg3, 6);
+ const __m128i row1 = _mm_srli_si128(avg3, 4);
+ const __m128i row2 = _mm_srli_si128(avg3, 2);
+ const __m128i row3 = avg3;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const __m128i XXXXXABC = _mm_castps_si128(
+ _mm_loadh_pi(_mm_setzero_ps(), (const __m64 *)(above - 1)));
+ const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0);
+ const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1);
+ const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2);
+ const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3);
+ const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2);
+ const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4);
+ const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00);
+ const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0);
+ const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3);
+ const __m128i row2 = _mm_srli_si128(row3, 4);
+ const __m128i row1 = _mm_srli_si128(row3, 8);
+ const __m128i row0 = _mm_srli_si128(avg3, 4);
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst[0] = _mm_extract_epi16(avg2, 3);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i IJKL0000 = _mm_load_si128((const __m128i *)left);
+ const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff);
+ const __m128i IJKLLLLL = _mm_unpacklo_epi64(IJKL0000, LLLL0000);
+ const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2);
+ const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4);
+ const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00);
+ const __m128i avg2 = _mm_avg_epu16(IJKLLLLL, JKLLLLL0);
+ const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3);
+ const __m128i row1 = _mm_srli_si128(row0, 4);
+ const __m128i row2 = _mm_srli_si128(row0, 8);
+ const __m128i row3 = LLLL0000;
+ (void)above;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
+ const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGH0);
+ const __m128i row0 = avg2;
+ const __m128i row1 = avg3;
+ const __m128i row2 = _mm_srli_si128(avg2, 2);
+ const __m128i row3 = _mm_srli_si128(avg3, 2);
+ (void)left;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c
new file mode 100644
index 0000000000..d673fac493
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c
@@ -0,0 +1,930 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+// -----------------------------------------------------------------------------
+/*
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+*/
+static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
+ const __m128i *z) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a = _mm_avg_epu16(*x, *z);
+ const __m128i b =
+ _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
+ return _mm_avg_epu16(b, *y);
+}
+
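For reference, the identity in the comment above is easier to verify in scalar
form. A minimal sketch under the same assumptions (the function name is
illustrative only, not part of the patch):

/* Scalar sketch of avg3_epu16(): (x + 2 * y + z + 2) >> 2 computed from two
 * rounded averages plus a parity correction, so every intermediate value
 * stays within 16 bits. */
static unsigned scalar_avg3(unsigned x, unsigned y, unsigned z) {
  const unsigned a = (x + z + 1) >> 1;   /* rounded average of x and z */
  const unsigned b = a - ((x ^ z) & 1);  /* floor((x + z) / 2) */
  return (b + y + 1) >> 1;               /* equals (x + 2*y + z + 2) >> 2 */
}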
+void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
+ (void)left;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, avg3);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
+ dst[3] = above[7]; // aka H
+}
+
+static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
+ __m128i *row, const __m128i *ar) {
+ *row = _mm_alignr_epi8(*ar, *row, 2);
+ _mm_store_si128((__m128i *)*dst, *row);
+ *dst += stride;
+}
+
+void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+ const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+ const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+ const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+ const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+ __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+ (void)left;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, avg3);
+ dst += stride;
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+ d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+}
+
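The three shifted copies of `above` (with H replicated) mean each d45 row is
the previous row advanced by one sample. A scalar sketch of the same fill
pattern, assuming the border replication written out here matches what the
intrinsics do (the name is illustrative):

/* Illustrative scalar sketch of the 8x8 d45 fill. */
static void d45_8x8_sketch(uint16_t *dst, ptrdiff_t stride,
                           const uint16_t *above) {
  uint16_t ext[18];  /* above[0..7] followed by replicated above[7] */
  int r, c, i;
  for (i = 0; i < 8; ++i) ext[i] = above[i];
  for (i = 8; i < 18; ++i) ext[i] = above[7];
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      dst[r * stride + c] = (uint16_t)(
          (ext[r + c] + 2 * ext[r + c + 1] + ext[r + c + 2] + 2) >> 2);
    }
  }
}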
+static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
+ __m128i *row_0, __m128i *row_1,
+ const __m128i *ar) {
+ *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2);
+ *row_1 = _mm_alignr_epi8(*ar, *row_1, 2);
+ _mm_store_si128((__m128i *)*dst, *row_0);
+ _mm_store_si128((__m128i *)(*dst + 8), *row_1);
+ *dst += stride;
+}
+
+void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
+ const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
+ __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ (void)left;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ dst += stride;
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+ d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+}
+
+void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
+ const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
+ __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ int i;
+ (void)left;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+ dst += stride;
+ for (i = 1; i < 32; ++i) {
+ avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+ avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
+ avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
+ avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+ dst += stride;
+ }
+}
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ rotate_right_epu16[16]) = { 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 0, 1 };
+
+static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
+ *a = _mm_shuffle_epi8(*a, *rotrw);
+ return *a;
+}
+
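The byte shuffle above rotates the eight 16-bit lanes by one position, so each
call exposes the next left-column value in the top lane, where
_mm_alignr_epi8() can shift it into the row registers. A scalar sketch of the
rotation (illustrative name):

/* Scalar equivalent of rotr_epu16(): lane i takes lane i + 1 and lane 7
 * wraps around to the old lane 0. */
static void rotr8_u16_sketch(uint16_t v[8]) {
  const uint16_t first = v[0];
  int i;
  for (i = 0; i < 7; ++i) v[i] = v[i + 1];
  v[7] = first;
}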
+void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+ const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
+ const __m128i IXABCDEF =
+ _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
+ const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
+ const __m128i XIJKLMNO =
+ _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
+ const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
+ __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
+ __m128i rowa = avg2;
+ __m128i rowb = avg3;
+ int i;
+ (void)bd;
+ for (i = 0; i < 8; i += 2) {
+ _mm_store_si128((__m128i *)dst, rowa);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, rowb);
+ dst += stride;
+ rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
+ rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
+ }
+}
+
+void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
+ const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
+ const __m128i L1_ = _mm_srli_si128(L1, 2);
+ __m128i rowa_0 = avg2_0;
+ __m128i rowa_1 = avg2_1;
+ __m128i rowb_0 = avg3_0;
+ __m128i rowb_1 = avg3_1;
+ __m128i avg3_left[2];
+ int i, j;
+ (void)bd;
+ avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
+ avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
+ for (i = 0; i < 2; ++i) {
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; j += 2) {
+ _mm_store_si128((__m128i *)dst, rowa_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, rowb_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowb_1);
+ dst += stride;
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+ rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
+ rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
+ }
+ }
+}
+
+void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
+ const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+ const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
+ const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
+ const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
+ const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+ const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+ const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
+ const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
+ const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
+ const __m128i L3_ = _mm_srli_si128(L3, 2);
+ __m128i rowa_0 = avg2_0;
+ __m128i rowa_1 = avg2_1;
+ __m128i rowa_2 = avg2_2;
+ __m128i rowa_3 = avg2_3;
+ __m128i rowb_0 = avg3_0;
+ __m128i rowb_1 = avg3_1;
+ __m128i rowb_2 = avg3_2;
+ __m128i rowb_3 = avg3_3;
+ __m128i avg3_left[4];
+ int i, j;
+ (void)bd;
+ avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
+ avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
+ avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
+ avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
+ for (i = 0; i < 4; ++i) {
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; j += 2) {
+ _mm_store_si128((__m128i *)dst, rowa_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ _mm_store_si128((__m128i *)(dst + 16), rowa_2);
+ _mm_store_si128((__m128i *)(dst + 24), rowa_3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, rowb_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowb_1);
+ _mm_store_si128((__m128i *)(dst + 16), rowb_2);
+ _mm_store_si128((__m128i *)(dst + 24), rowb_3);
+ dst += stride;
+ rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
+ rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+ rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
+ rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
+ rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
+ rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
+ }
+ }
+}
+
+void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
+ const __m128i XIJKLMNO =
+ _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
+ const __m128i AXIJKLMN =
+ _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);
+ const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
+ __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
+ __m128i rowa = avg3;
+ int i;
+ (void)bd;
+ for (i = 0; i < 8; ++i) {
+ rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
+ _mm_store_si128((__m128i *)dst, rowa);
+ dst += stride;
+ }
+}
+
+void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i B0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
+ const __m128i C1 = _mm_srli_si128(B1, 2);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
+ const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
+ __m128i rowa_0 = avg3_0;
+ __m128i rowa_1 = avg3_1;
+ __m128i avg3_left[2];
+ int i, j;
+ (void)bd;
+ avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
+ avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
+ for (i = 0; i < 2; ++i) {
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; ++j) {
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+ _mm_store_si128((__m128i *)dst, rowa_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ dst += stride;
+ }
+ }
+}
+
+void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
+ const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
+ const __m128i B0 = _mm_load_si128((const __m128i *)above);
+ const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
+ const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
+ const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
+ const __m128i C3 = _mm_srli_si128(B3, 2);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+ const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+ const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
+ const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
+ const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
+ const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
+ __m128i rowa_0 = avg3_0;
+ __m128i rowa_1 = avg3_1;
+ __m128i rowa_2 = avg3_2;
+ __m128i rowa_3 = avg3_3;
+ __m128i avg3_left[4];
+ int i, j;
+ (void)bd;
+ avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
+ avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
+ avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
+ avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
+ for (i = 0; i < 4; ++i) {
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; ++j) {
+ rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
+ rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+ _mm_store_si128((__m128i *)dst, rowa_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ _mm_store_si128((__m128i *)(dst + 16), rowa_2);
+ _mm_store_si128((__m128i *)(dst + 24), rowa_3);
+ dst += stride;
+ }
+ }
+}
+
+void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);
+ const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);
+ const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
+ const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
+ const __m128i XIJKLMNO =
+ _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
+ const __m128i AXIJKLMN =
+ _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);
+ const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
+ const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
+ const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);
+ const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);
+ const __m128i row0 =
+ _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);
+ const __m128i row1 =
+ _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);
+ const __m128i row2 =
+ _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
+ const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
+ const __m128i row4 =
+ _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
+ const __m128i row5 =
+ _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
+ const __m128i row6 =
+ _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
+ const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, row0);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row2);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row4);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row5);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row6);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row7);
+}
+
+void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_srli_si128(A1, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_srli_si128(A1, 4);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+ const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
+ const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
+ const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
+ const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
+ const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
+ __m128i row_0 = avg3_0;
+ __m128i row_1 = avg3_1;
+ __m128i avg2_avg3_left[2][2];
+ int i, j;
+ (void)bd;
+
+ avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
+ avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
+
+ for (j = 0; j < 2; ++j) {
+ for (i = 0; i < 2; ++i) {
+ const __m128i avg2_avg3 = avg2_avg3_left[j][i];
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ }
+ }
+}
+
+void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
+ const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_srli_si128(A3, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_srli_si128(A3, 4);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+ const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+ const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+ const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
+ const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
+ const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
+ const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
+ const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
+ const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
+ const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
+ const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
+ const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
+ const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
+ const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
+ const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
+ __m128i row_0 = avg3_0;
+ __m128i row_1 = avg3_1;
+ __m128i row_2 = avg3_2;
+ __m128i row_3 = avg3_3;
+ __m128i avg2_avg3_left[4][2];
+ int i, j;
+ (void)bd;
+
+ avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
+ avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
+ avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
+ avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
+ avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
+ avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);
+
+ for (j = 0; j < 4; ++j) {
+ for (i = 0; i < 2; ++i) {
+ const __m128i avg2_avg3 = avg2_avg3_left[j][i];
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ }
+ }
+}
+
+static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *a, const __m128i *b) {
+ _mm_store_si128((__m128i *)*dst, *a);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
+ *dst += stride;
+}
+
+void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left);
+ const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+ const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+ const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+ const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+ const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
+ const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3);
+ const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3);
+ (void)above;
+ (void)bd;
+ d207_store_4x8(&dst, stride, &out_a, &out_b);
+ d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH);
+}
+
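Interleaving avg2 and avg3 produces one long sequence in which each d207 row
starts two elements (one avg2/avg3 pair) further along, with the last left
sample repeated once the sequence runs out. A scalar sketch of that layout,
assuming the same border replication (names are illustrative):

/* Illustrative scalar sketch of the 8x8 d207 fill. */
static void d207_8x8_sketch(uint16_t *dst, ptrdiff_t stride,
                            const uint16_t *left) {
  uint16_t seq[32];
  int i, r, c;
  for (i = 0; i < 8; ++i) {
    const uint16_t l0 = left[i];
    const uint16_t l1 = left[i + 1 < 8 ? i + 1 : 7];
    const uint16_t l2 = left[i + 2 < 8 ? i + 2 : 7];
    seq[2 * i + 0] = (uint16_t)((l0 + l1 + 1) >> 1);          /* avg2 */
    seq[2 * i + 1] = (uint16_t)((l0 + 2 * l1 + l2 + 2) >> 2); /* avg3 */
  }
  for (i = 16; i < 32; ++i) seq[i] = left[7];
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) dst[r * stride + c] = seq[2 * r + c];
}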
+static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *a, const __m128i *b,
+ const __m128i *c) {
+ _mm_store_si128((__m128i *)*dst, *a);
+ _mm_store_si128((__m128i *)(*dst + 8), *b);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
+ *dst += stride;
+}
+
+void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)left);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff);
+ const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(LR, A1, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(LR, A1, 4);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
+ const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
+ const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
+ const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
+ (void)above;
+ (void)bd;
+ d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c);
+ d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d);
+ d207_store_4x16(&dst, stride, &out_c, &out_d, &LR);
+ d207_store_4x16(&dst, stride, &out_d, &LR, &LR);
+}
+
+static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *a, const __m128i *b,
+ const __m128i *c, const __m128i *d,
+ const __m128i *e) {
+ _mm_store_si128((__m128i *)*dst, *a);
+ _mm_store_si128((__m128i *)(*dst + 8), *b);
+ _mm_store_si128((__m128i *)(*dst + 16), *c);
+ _mm_store_si128((__m128i *)(*dst + 24), *d);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
+ _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4));
+ _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
+ _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8));
+ _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8));
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
+ _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
+ _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12));
+ _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12));
+ *dst += stride;
+}
+
+void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)left);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff);
+ const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_alignr_epi8(LR, A3, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_alignr_epi8(LR, A3, 4);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+ const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+ const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
+ const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
+ const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
+ const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
+ const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2);
+ const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2);
+ const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3);
+ const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3);
+ (void)above;
+ (void)bd;
+ d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e);
+ d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f);
+ d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g);
+ d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h);
+ d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR);
+ d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR);
+ d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR);
+ d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR);
+}
+
+static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride,
+ __m128i *a, __m128i *b, const __m128i *ar) {
+ _mm_store_si128((__m128i *)*dst, *a);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, *b);
+ *dst += stride;
+ *a = _mm_alignr_epi8(*ar, *a, 2);
+ *b = _mm_alignr_epi8(*ar, *b, 2);
+ _mm_store_si128((__m128i *)*dst, *a);
+ *dst += stride;
+ _mm_store_si128((__m128i *)*dst, *b);
+ *dst += stride;
+ *a = _mm_alignr_epi8(*ar, *a, 2);
+ *b = _mm_alignr_epi8(*ar, *b, 2);
+}
+
+void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+ const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+ const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+ const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+ const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+ __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+ __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
+ (void)left;
+ (void)bd;
+ d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
+ d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
+}
+
+void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
+ const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
+ __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ int i;
+ (void)left;
+ (void)bd;
+ for (i = 0; i < 14; i += 2) {
+ _mm_store_si128((__m128i *)dst, avg2_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ dst += stride;
+ avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
+ avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2);
+ avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+ avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2);
+ }
+ _mm_store_si128((__m128i *)dst, avg2_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+}
+
+void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
+ const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
+ __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+ __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+ __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+ __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+ int i;
+ (void)left;
+ (void)bd;
+ for (i = 0; i < 30; i += 2) {
+ _mm_store_si128((__m128i *)dst, avg2_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg2_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg2_3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+ dst += stride;
+ avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
+ avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2);
+ avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 2);
+ avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2);
+ avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+ avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
+ avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
+ avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
+ }
+ _mm_store_si128((__m128i *)dst, avg2_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg2_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg2_3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, avg3_0);
+ _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+ _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+ _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm
new file mode 100644
index 0000000000..caf506ac07
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -0,0 +1,453 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 4 dd 16
+pw_32: times 4 dd 32
+
+SECTION .text
+INIT_XMM sse2
+cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ paddw m0, m2
+ pshuflw m1, m0, 0xe
+ paddw m0, m1
+ pshuflw m1, m0, 0x1
+ paddw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+
+ RESTORE_GOT
+ RET
+
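Restated in C for reference, the routine above takes the rounded mean of the
four above and four left samples and broadcasts it to the whole block. A
minimal sketch (the name is illustrative only):

/* Scalar sketch of highbd_dc_predictor_4x4. */
static void dc_4x4_sketch(uint16_t *dst, ptrdiff_t stride,
                          const uint16_t *above, const uint16_t *left) {
  int i, j, sum = 4;  /* +4 is the rounding term, matching pw_4 */
  uint16_t dc;
  for (i = 0; i < 4; ++i) sum += above[i] + left[i];
  dc = (uint16_t)(sum >> 3);
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) dst[j] = dc;
    dst += stride;
  }
}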
+INIT_XMM sse2
+cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, one
+ mov oned, 0x00010001
+ lea stride3q, [strideq*3]
+ movd m3, oned
+ pshufd m3, m3, 0x0
+ paddw m0, m2
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ paddw m0, [GLOBAL(pw_8)]
+ psrlw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m3, [aboveq+16]
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movhlps m2, m0
+ paddw m0, m2
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ paddd m0, [GLOBAL(pw_16)]
+ psrad m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [aboveq+32]
+ mova m4, [aboveq+48]
+ paddw m0, m2
+ paddw m3, m4
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ mova m5, [leftq+32]
+ mova m6, [leftq+48]
+ paddw m2, m4
+ paddw m5, m6
+ paddw m0, m3
+ paddw m2, m5
+ pxor m1, m1
+ paddw m0, m2
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ movhlps m2, m0
+ paddw m0, m2
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ paddd m0, [GLOBAL(pw_32)]
+ psrad m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16 ], m0
+ mova [dstq +32 ], m0
+ mova [dstq +48 ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16 ], m0
+ mova [dstq+strideq*2+32 ], m0
+ mova [dstq+strideq*2+48 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4+16 ], m0
+ mova [dstq+strideq*4+32 ], m0
+ mova [dstq+strideq*4+48 ], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m0
+ mova [dstq+stride3q*2 +32], m0
+ mova [dstq+stride3q*2 +48], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m1
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ mova m2, [aboveq+32]
+ mova m3, [aboveq+48]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq +32], m2
+ mova [dstq +48], m3
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*2 +32], m2
+ mova [dstq+strideq*2 +48], m3
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+strideq*4 +32], m2
+ mova [dstq+strideq*4 +48], m3
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m1
+ mova [dstq+stride3q*2 +32], m2
+ mova [dstq+stride3q*2 +48], m3
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd
+ movd m1, [aboveq-2]
+ movq m0, [aboveq]
+ pshuflw m1, m1, 0x0
+ movlhps m0, m0 ; t1 t2 t3 t4 t1 t2 t3 t4
+ movlhps m1, m1 ; tl tl tl tl tl tl tl tl
+  ; Compute the min and max values for this bit depth
+ pcmpeqw m3, m3
+ movd m4, bdd
+ psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl
+ psllw m3, m4
+ pcmpeqw m2, m2
+ pxor m4, m4 ; min possible value
+ pxor m3, m2 ; max possible value
+ mova m1, [leftq]
+ pshuflw m2, m1, 0x0
+ pshuflw m5, m1, 0x55
+ movlhps m2, m5 ; l1 l1 l1 l1 l2 l2 l2 l2
+ paddw m2, m0
+  ; Clamp to the bit-depth
+ pminsw m2, m3
+ pmaxsw m2, m4
+  ; Store the values
+ movq [dstq ], m2
+ movhpd [dstq+strideq*2], m2
+ lea dstq, [dstq+strideq*4]
+ pshuflw m2, m1, 0xaa
+ pshuflw m5, m1, 0xff
+ movlhps m2, m5
+ paddw m2, m0
+  ; Clamp to the bit-depth
+ pminsw m2, m3
+ pmaxsw m2, m4
+  ; Store the values
+ movq [dstq ], m2
+ movhpd [dstq+strideq*2], m2
+ RET
+
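For reference, the TrueMotion (TM) prediction computed above can be restated
in C as above[c] + left[r] - top_left, clamped to the valid range for the bit
depth. A sketch assuming the top-left sample sits immediately before `above`,
as the [aboveq-2] load implies (the name is illustrative):

/* Scalar sketch of highbd_tm_predictor_4x4. */
static void tm_4x4_sketch(uint16_t *dst, ptrdiff_t stride,
                          const uint16_t *above, const uint16_t *left,
                          int bd) {
  const int top_left = above[-1]; /* the [aboveq-2] load in the asm */
  const int max = (1 << bd) - 1;
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int v = above[c] + left[r] - top_left;
      dst[c] = (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
    }
    dst += stride;
  }
}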
+INIT_XMM sse2
+cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one
+ movd m1, [aboveq-2]
+ mova m0, [aboveq]
+ pshuflw m1, m1, 0x0
+  ; Compute the min and max values for this bit depth
+ mov oned, 1
+ pxor m3, m3
+ pxor m4, m4
+ pinsrw m3, oned, 0
+ pinsrw m4, bdd, 0
+ pshuflw m3, m3, 0x0
+ DEFINE_ARGS dst, stride, line, left
+ punpcklqdq m3, m3
+ mov lineq, -4
+ mova m2, m3
+ punpcklqdq m1, m1
+ psllw m3, m4
+ add leftq, 16
+ psubw m3, m2 ; max possible value
+ pxor m4, m4 ; min possible value
+ psubw m0, m1
+.loop:
+ movd m1, [leftq+lineq*4]
+ movd m2, [leftq+lineq*4+2]
+ pshuflw m1, m1, 0x0
+ pshuflw m2, m2, 0x0
+ punpcklqdq m1, m1
+ punpcklqdq m2, m2
+ paddw m1, m0
+ paddw m2, m0
+  ; Clamp to the bit-depth
+ pminsw m1, m3
+ pminsw m2, m3
+ pmaxsw m1, m4
+ pmaxsw m2, m4
+  ; Store the values
+ mova [dstq ], m1
+ mova [dstq+strideq*2], m2
+ lea dstq, [dstq+strideq*4]
+ inc lineq
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd
+ movd m2, [aboveq-2]
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ pshuflw m2, m2, 0x0
+  ; Compute the min and max values for this bit depth
+ pcmpeqw m3, m3
+ movd m4, bdd
+ punpcklqdq m2, m2
+ psllw m3, m4
+ pcmpeqw m5, m5
+ pxor m4, m4 ; min possible value
+ pxor m3, m5 ; max possible value
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -8
+ psubw m0, m2
+ psubw m1, m2
+.loop:
+ movd m7, [leftq]
+ pshuflw m5, m7, 0x0
+ pshuflw m2, m7, 0x55
+ punpcklqdq m5, m5 ; l1 l1 l1 l1 l1 l1 l1 l1
+ punpcklqdq m2, m2 ; l2 l2 l2 l2 l2 l2 l2 l2
+ paddw m6, m5, m0 ; t1-tl+l1 to t4-tl+l1
+ paddw m5, m1 ; t5-tl+l1 to t8-tl+l1
+ pminsw m6, m3
+ pminsw m5, m3
+ pmaxsw m6, m4 ; Clamp to the bit-depth
+ pmaxsw m5, m4
+ mova [dstq ], m6
+ mova [dstq +16], m5
+ paddw m6, m2, m0
+ paddw m2, m1
+ pminsw m6, m3
+ pminsw m2, m3
+ pmaxsw m6, m4
+ pmaxsw m2, m4
+ mova [dstq+strideq*2 ], m6
+ mova [dstq+strideq*2+16], m2
+ lea dstq, [dstq+strideq*4]
+ inc lineq
+ lea leftq, [leftq+4]
+
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd
+ movd m0, [aboveq-2]
+ mova m1, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [aboveq+32]
+ mova m4, [aboveq+48]
+ pshuflw m0, m0, 0x0
+  ; Compute the min and max values for this bit depth
+ pcmpeqw m5, m5
+ movd m6, bdd
+ psllw m5, m6
+ pcmpeqw m7, m7
+ pxor m6, m6 ; min possible value
+ pxor m5, m7 ; max possible value
+ punpcklqdq m0, m0
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -16
+ psubw m1, m0
+ psubw m2, m0
+ psubw m3, m0
+ psubw m4, m0
+.loop:
+ movd m7, [leftq]
+ pshuflw m7, m7, 0x0
+ punpcklqdq m7, m7 ; l1 l1 l1 l1 l1 l1 l1 l1
+ paddw m0, m7, m1
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq ], m0
+ paddw m0, m7, m2
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq +16], m0
+ paddw m0, m7, m3
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq +32], m0
+ paddw m0, m7, m4
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq +48], m0
+ movd m7, [leftq+2]
+ pshuflw m7, m7, 0x0
+ punpcklqdq m7, m7 ; l2 l2 l2 l2 l2 l2 l2 l2
+ paddw m0, m7, m1
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2 ], m0
+ paddw m0, m7, m2
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2+16], m0
+ paddw m0, m7, m3
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2+32], m0
+ paddw m0, m7, m4
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2+48], m0
+ lea dstq, [dstq+strideq*4]
+ lea leftq, [leftq+4]
+ inc lineq
+ jnz .loop
+ REP_RET
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
new file mode 100644
index 0000000000..1d07391b02
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
+#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+// Note: SSE2 has no 64-bit arithmetic right-shift instruction. All
+// coefficients are left-shifted by 2 so that dct_const_round_shift() can be
+// done by right-shifting 2 bytes (16 bits) rather than 14 bits.
+
+static INLINE void extend_64bit(const __m128i in,
+ __m128i *const out /*out[2]*/) {
+ out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1
+ out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3
+}
+
+static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1,
+ const __m128i rounding) {
+ __m128i temp[2];
+ temp[0] = _mm_add_epi32(in0, rounding);
+ temp[1] = _mm_add_epi32(in1, rounding);
+ temp[0] = _mm_srai_epi32(temp[0], 4);
+ temp[1] = _mm_srai_epi32(temp[1], 4);
+ return _mm_packs_epi32(temp[0], temp[1]);
+}
+
+static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1,
+ const __m128i rounding) {
+ __m128i temp[2];
+ temp[0] = _mm_add_epi32(in0, rounding);
+ temp[1] = _mm_add_epi32(in1, rounding);
+ temp[0] = _mm_srai_epi32(temp[0], 5);
+ temp[1] = _mm_srai_epi32(temp[1], 5);
+ return _mm_packs_epi32(temp[0], temp[1]);
+}
+
+static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) {
+ const __m128i t =
+ _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0));
+ return _mm_srli_si128(t, 2);
+}
+
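The pre-scaling mentioned in the note at the top of the file can be checked
with scalars: multiplying by c << 2 and adding DCT_CONST_ROUNDING << 2 turns
the 14-bit rounding shift into a 16-bit one, which the SIMD code performs
with a 2-byte register shift. A sketch of the equivalence (function names are
illustrative; both forms agree whenever no overflow occurs):

static int64_t round_shift_14(int64_t x, int64_t c) {
  return (x * c + (1 << 13)) >> 14;        /* reference rounding shift */
}
static int64_t round_shift_14_prescaled(int64_t x, int64_t c) {
  return (x * (c << 2) + (1 << 15)) >> 16; /* same result, 16-bit shift */
}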
+static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) {
+ const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 2
+ const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 1, 3
+ return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3
+}
+
+static INLINE void abs_extend_64bit_sse2(const __m128i in,
+ __m128i *const out /*out[2]*/,
+ __m128i *const sign /*sign[2]*/) {
+ sign[0] = _mm_srai_epi32(in, 31);
+ out[0] = _mm_xor_si128(in, sign[0]);
+ out[0] = _mm_sub_epi32(out[0], sign[0]);
+ sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]); // 64-bit sign of 2, 3
+ sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]); // 64-bit sign of 0, 1
+ out[1] = _mm_unpackhi_epi32(out[0], out[0]); // 2, 3
+ out[0] = _mm_unpacklo_epi32(out[0], out[0]); // 0, 1
+}
+
+// Note: cospi must be non-negative.
+static INLINE __m128i multiply_apply_sign_sse2(const __m128i in,
+ const __m128i sign,
+ const __m128i cospi) {
+ __m128i out = _mm_mul_epu32(in, cospi);
+ out = _mm_xor_si128(out, sign);
+ return _mm_sub_epi64(out, sign);
+}
+
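SSE2 only provides an unsigned 32x32 to 64-bit multiply (_mm_mul_epu32), so
the code splits off the sign, multiplies the magnitude, and re-applies the
sign with the two's-complement identity -x == (x ^ -1) - (-1). A scalar
sketch of the same steps for one value (the name is illustrative):

/* Scalar form of abs_extend_64bit_sse2 + multiply_apply_sign_sse2. */
static int64_t mul_apply_sign_sketch(int32_t in, uint32_t cospi) {
  const int64_t sign = in >> 31;                   /* 0 or -1 */
  const int64_t mag = ((int64_t)in ^ sign) - sign; /* |in| */
  const int64_t prod = (int64_t)((uint64_t)mag * cospi);
  return (prod ^ sign) - sign;                     /* re-apply the sign */
}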
+// Note: c must be non-negative.
+static INLINE __m128i multiplication_round_shift_sse2(
+ const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
+ const int c) {
+ const __m128i pair_c = pair_set_epi32(c << 2, 0);
+ __m128i t0, t1;
+
+ assert(c >= 0);
+ t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
+ t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
+ t0 = dct_const_round_shift_64bit(t0);
+ t1 = dct_const_round_shift_64bit(t1);
+
+ return pack_4(t0, t1);
+}
+
+// Note: c must be non-negative.
+static INLINE __m128i multiplication_neg_round_shift_sse2(
+ const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
+ const int c) {
+ const __m128i pair_c = pair_set_epi32(c << 2, 0);
+ __m128i t0, t1;
+
+ assert(c >= 0);
+ t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
+ t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
+ t0 = _mm_sub_epi64(_mm_setzero_si128(), t0);
+ t1 = _mm_sub_epi64(_mm_setzero_si128(), t1);
+ t0 = dct_const_round_shift_64bit(t0);
+ t1 = dct_const_round_shift_64bit(t1);
+
+ return pack_4(t0, t1);
+}
+
+// Note: c0 and c1 must be non-negative.
+static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1,
+ const int c0, const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
+ const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
+ __m128i temp1[4], temp2[4], sign1[2], sign2[2];
+
+ assert(c0 >= 0);
+ assert(c1 >= 0);
+ abs_extend_64bit_sse2(in0, temp1, sign1);
+ abs_extend_64bit_sse2(in1, temp2, sign2);
+ temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1);
+ temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1);
+ temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0);
+ temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0);
+ temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0);
+ temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0);
+ temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1);
+ temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1);
+ temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
+ temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
+ temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
+ temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
+ temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+ *out0 = pack_4(temp1[0], temp1[1]);
+ *out1 = pack_4(temp2[0], temp2[1]);
+}
+
+// Note: c0 and c1 must be non-negative.
+static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0,
+ const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp[2], sign[2];
+
+ assert(c0 >= 0);
+ assert(c1 >= 0);
+ abs_extend_64bit_sse2(in, temp, sign);
+ *out0 = multiplication_round_shift_sse2(temp, sign, c0);
+ *out1 = multiplication_round_shift_sse2(temp, sign, c1);
+}
+
+// Note: c0 and c1 must be non-negative.
+static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in,
+ const int c0, const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp[2], sign[2];
+
+ assert(c0 >= 0);
+ assert(c1 >= 0);
+ abs_extend_64bit_sse2(in, temp, sign);
+ *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1);
+ *out1 = multiplication_round_shift_sse2(temp, sign, c0);
+}
+
+static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0,
+ const __m128i in1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp1[2], temp2, sign[2];
+
+ temp2 = _mm_add_epi32(in0, in1);
+ abs_extend_64bit_sse2(temp2, temp1, sign);
+ *out0 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+ temp2 = _mm_sub_epi32(in0, in1);
+ abs_extend_64bit_sse2(temp2, temp1, sign);
+ *out1 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+}
+
+// Perform only the addition/subtraction butterfly; size must be 16 or 32.
+static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out,
+ int size) {
+ int i = 0;
+ const int num = size >> 1;
+ const int bound = size - 1;
+ while (i < num) {
+ out[i] = _mm_add_epi32(in[i], in[bound - i]);
+ out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]);
+ i++;
+ }
+}
+
+static INLINE void highbd_idct8_stage4(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[7]);
+ out[1] = _mm_add_epi32(in[1], in[6]);
+ out[2] = _mm_add_epi32(in[2], in[5]);
+ out[3] = _mm_add_epi32(in[3], in[4]);
+ out[4] = _mm_sub_epi32(in[3], in[4]);
+ out[5] = _mm_sub_epi32(in[2], in[5]);
+ out[6] = _mm_sub_epi32(in[1], in[6]);
+ out[7] = _mm_sub_epi32(in[0], in[7]);
+}
+
+static INLINE void highbd_idct8x8_final_round(__m128i *const io) {
+ io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
+ io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
+ io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
+ io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
+ io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
+ io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
+ io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
+ io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
+}
+
+static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[15]);
+ out[1] = _mm_add_epi32(in[1], in[14]);
+ out[2] = _mm_add_epi32(in[2], in[13]);
+ out[3] = _mm_add_epi32(in[3], in[12]);
+ out[4] = _mm_add_epi32(in[4], in[11]);
+ out[5] = _mm_add_epi32(in[5], in[10]);
+ out[6] = _mm_add_epi32(in[6], in[9]);
+ out[7] = _mm_add_epi32(in[7], in[8]);
+ out[8] = _mm_sub_epi32(in[7], in[8]);
+ out[9] = _mm_sub_epi32(in[6], in[9]);
+ out[10] = _mm_sub_epi32(in[5], in[10]);
+ out[11] = _mm_sub_epi32(in[4], in[11]);
+ out[12] = _mm_sub_epi32(in[3], in[12]);
+ out[13] = _mm_sub_epi32(in[2], in[13]);
+ out[14] = _mm_sub_epi32(in[1], in[14]);
+ out[15] = _mm_sub_epi32(in[0], in[15]);
+}
+
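+// Adds two vectors of eight 16-bit pixels with saturation and clamps each
+// result to the valid pixel range [0, (1 << bd) - 1].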
+static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
+ const int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ // Faster than _mm_set1_epi16((1 << bd) - 1).
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i d;
+
+ d = _mm_adds_epi16(in0, in1);
+ d = _mm_max_epi16(d, zero);
+ d = _mm_min_epi16(d, max);
+
+ return d;
+}
+
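+// DC-only reconstruction: the single DC coefficient is scaled twice by
+// cospi_16_64, rounded to a pixel offset a1, and a1 is added (with clamping)
+// to every pixel of the size x size destination block.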
+static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd,
+ const int size) {
+ int a1, i, j;
+ tran_low_t out;
+ __m128i dc, d;
+
+ out = HIGHBD_WRAPLOW(
+ dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+ out =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
+ a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6);
+ dc = _mm_set1_epi16(a1);
+
+ for (i = 0; i < size; ++i) {
+ for (j = 0; j < size; j += 8) {
+ d = _mm_load_si128((const __m128i *)(&dest[j]));
+ d = add_clamp(d, dc, bd);
+ _mm_store_si128((__m128i *)(&dest[j]), d);
+ }
+ dest += stride;
+ }
+}
+
+static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest,
+ const int bd) {
+ __m128i d;
+
+ d = _mm_loadl_epi64((const __m128i *)dest);
+ d = add_clamp(d, in, bd);
+ _mm_storel_epi64((__m128i *)dest, d);
+}
+
+static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest,
+ const int stride, const int bd) {
+ __m128i d;
+
+ d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
+ d = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride)));
+ d = add_clamp(d, in, bd);
+ _mm_storel_epi64((__m128i *)(dest + 0 * stride), d);
+ _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d));
+}
+
+static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_4x2(in[0], dest, stride, bd);
+ dest += 2 * stride;
+ recon_and_store_4x2(in[1], dest, stride, bd);
+}
+
+static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest,
+ const int stride, const int bd) {
+ __m128i d;
+
+ d = _mm_load_si128((const __m128i *)(*dest));
+ d = add_clamp(d, in, bd);
+ _mm_store_si128((__m128i *)(*dest), d);
+ *dest += stride;
+}
+
+static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_8(in[0], &dest, stride, bd);
+ recon_and_store_8(in[1], &dest, stride, bd);
+ recon_and_store_8(in[2], &dest, stride, bd);
+ recon_and_store_8(in[3], &dest, stride, bd);
+ recon_and_store_8(in[4], &dest, stride, bd);
+ recon_and_store_8(in[5], &dest, stride, bd);
+ recon_and_store_8(in[6], &dest, stride, bd);
+ recon_and_store_8(in[7], &dest, stride, bd);
+}
+
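+// Loads eight 32-bit coefficients and packs them to 16 bits with signed
+// saturation, for the stages that can run at 16-bit precision.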
+static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
+ const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
+ const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
+ return _mm_packs_epi32(t0, t1);
+}
+
+static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input,
+ const int stride,
+ __m128i *const in) {
+ in[0] = load_pack_8_32bit(input + 0 * stride);
+ in[1] = load_pack_8_32bit(input + 1 * stride);
+ in[2] = load_pack_8_32bit(input + 2 * stride);
+ in[3] = load_pack_8_32bit(input + 3 * stride);
+ in[4] = load_pack_8_32bit(input + 4 * stride);
+ in[5] = load_pack_8_32bit(input + 5 * stride);
+ in[6] = load_pack_8_32bit(input + 6 * stride);
+ in[7] = load_pack_8_32bit(input + 7 * stride);
+ transpose_16bit_8x8(in, in);
+}
+
+static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input,
+ const int stride,
+ __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+ in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4));
+ in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+ in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4));
+ in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4));
+ in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+ in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4));
+ transpose_32bit_8x4(in, in);
+}
+
+static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input,
+ const int stride,
+ __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ transpose_32bit_4x4(in, in);
+}
+
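+// Final rounding of one output row (add 1 << 5, then shift right by 6)
+// before adding it to the destination with clamping.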
+static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in,
+ const int bd) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ __m128i out;
+
+ out = _mm_adds_epi16(in, final_rounding);
+ out = _mm_srai_epi16(out, 6);
+ recon_and_store_8(out, &dest, 0, bd);
+}
+
+static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in,
+ const int bd) {
+ const __m128i final_rounding = _mm_set1_epi32(1 << 5);
+ __m128i out;
+
+ out = _mm_add_epi32(in, final_rounding);
+ out = _mm_srai_epi32(out, 6);
+ out = _mm_packs_epi32(out, out);
+ recon_and_store_4(out, dest, bd);
+}
+
+#endif // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h
new file mode 100644
index 0000000000..f446bb13f3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_config.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+
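+// SSE4.1 counterpart of the SSE2 multiply helper: _mm_mul_epi32 is a signed
+// 32x32 -> 64 bit multiply, so no abs/sign fix-up is needed here.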
+static INLINE __m128i multiplication_round_shift_sse4_1(
+ const __m128i *const in /*in[2]*/, const int c) {
+ const __m128i pair_c = pair_set_epi32(c * 4, 0);
+ __m128i t0, t1;
+
+ t0 = _mm_mul_epi32(in[0], pair_c);
+ t1 = _mm_mul_epi32(in[1], pair_c);
+ t0 = dct_const_round_shift_64bit(t0);
+ t1 = dct_const_round_shift_64bit(t1);
+
+ return pack_4(t0, t1);
+}
+
+static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1,
+ const int c0, const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+ const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+ __m128i temp1[4], temp2[4];
+
+ extend_64bit(in0, temp1);
+ extend_64bit(in1, temp2);
+ temp1[2] = _mm_mul_epi32(temp1[0], pair_c1);
+ temp1[3] = _mm_mul_epi32(temp1[1], pair_c1);
+ temp1[0] = _mm_mul_epi32(temp1[0], pair_c0);
+ temp1[1] = _mm_mul_epi32(temp1[1], pair_c0);
+ temp2[2] = _mm_mul_epi32(temp2[0], pair_c0);
+ temp2[3] = _mm_mul_epi32(temp2[1], pair_c0);
+ temp2[0] = _mm_mul_epi32(temp2[0], pair_c1);
+ temp2[1] = _mm_mul_epi32(temp2[1], pair_c1);
+ temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
+ temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
+ temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
+ temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
+ temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+ *out0 = pack_4(temp1[0], temp1[1]);
+ *out1 = pack_4(temp2[0], temp2[1]);
+}
+
+static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0,
+ const __m128i in1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp1[2], temp2;
+
+ temp2 = _mm_add_epi32(in0, in1);
+ extend_64bit(temp2, temp1);
+ *out0 = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+ temp2 = _mm_sub_epi32(in0, in1);
+ extend_64bit(temp2, temp1);
+ *out1 = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+}
+
+static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in,
+ const int c0, const int c1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp[2];
+
+ extend_64bit(in, temp);
+ *out0 = multiplication_round_shift_sse4_1(temp, c0);
+ *out1 = multiplication_round_shift_sse4_1(temp, c1);
+}
+
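+// 4-point inverse DCT on four columns of 32-bit coefficients, kept at 32-bit
+// precision throughout. Stage 1 rotates the even (io[0], io[2]) and odd
+// (io[1], io[3]) pairs; stage 2 is the final add/sub butterfly.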
+static INLINE void highbd_idct4_sse4_1(__m128i *const io) {
+ __m128i temp[2], step[4];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
+ extend_64bit(temp[0], temp);
+ step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+ temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
+ extend_64bit(temp[0], temp);
+ step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+ highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2],
+ &step[3]);
+
+ // stage 2
+ io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
+ io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
+ io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2]
+ io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
+}
+
+void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io);
+void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/);
+
+#endif // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
new file mode 100644
index 0000000000..9f45623dee
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -0,0 +1,1140 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
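+// Clamps each signed 16-bit lane of value to the range of a t80-biased
+// pixel, i.e. [-(1 << (bd - 1)), (1 << (bd - 1)) - 1] for bd = 8, 10 or 12.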
+static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
+ __m128i ubounded;
+ __m128i lbounded;
+ __m128i retval;
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i t80, max, min;
+
+ if (bd == 8) {
+ t80 = _mm_set1_epi16(0x80);
+ max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
+ } else if (bd == 10) {
+ t80 = _mm_set1_epi16(0x200);
+ max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
+ } else { // bd == 12
+ t80 = _mm_set1_epi16(0x800);
+ max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
+ }
+
+ min = _mm_subs_epi16(zero, t80);
+
+ ubounded = _mm_cmpgt_epi16(value, max);
+ lbounded = _mm_cmplt_epi16(value, min);
+ retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
+ ubounded = _mm_and_si128(ubounded, max);
+ lbounded = _mm_and_si128(lbounded, min);
+ retval = _mm_or_si128(retval, ubounded);
+ retval = _mm_or_si128(retval, lbounded);
+ return retval;
+}
+
+// TODO(debargha, peter): Break up large functions into smaller ones
+// in this file.
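+// Wide horizontal loop filter over one run of 8 pixels (touches p7..q7):
+// builds the filter/hev/flat/flat2 masks, computes the filter4, filter8 and
+// filter16 candidate values, then blends them per pixel according to the
+// masks.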
+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i blimit_v, limit_v, thresh_v;
+ __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
+ __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
+ __m128i ps1, qs1, ps0, qs0;
+ __m128i abs_p0q0, abs_p1q1, ffff, work;
+ __m128i filt, work_a, filter1, filter2;
+ __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
+ __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
+ __m128i flat2_q0, flat2_p0;
+ __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3;
+ __m128i t4, t3, t80, t1;
+ __m128i eight, four;
+
+ if (bd == 8) {
+ blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+ limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+ thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
+ } else if (bd == 10) {
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
+ } else { // bd == 12
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
+ }
+
+ q4 = _mm_load_si128((__m128i *)(s + 4 * pitch));
+ p4 = _mm_load_si128((__m128i *)(s - 5 * pitch));
+ q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
+ p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
+ q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
+ p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
+ q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
+ p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
+ q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
+ p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
+
+ // highbd_filter_mask
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+ abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+ ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+
+ // highbd_hev_mask (in C code this is actually called from highbd_filter4)
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu16(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
+ _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+ mask = _mm_max_epi16(work, mask);
+
+ mask = _mm_subs_epu16(mask, limit_v);
+ mask = _mm_cmpeq_epi16(mask, zero); // return ~mask
+
+ // lp filter
+ // highbd_filter4
+ t4 = _mm_set1_epi16(4);
+ t3 = _mm_set1_epi16(3);
+ if (bd == 8)
+ t80 = _mm_set1_epi16(0x80);
+ else if (bd == 10)
+ t80 = _mm_set1_epi16(0x200);
+ else // bd == 12
+ t80 = _mm_set1_epi16(0x800);
+
+ t1 = _mm_set1_epi16(0x1);
+
+ ps1 = _mm_subs_epi16(p1, t80);
+ qs1 = _mm_subs_epi16(q1, t80);
+ ps0 = _mm_subs_epi16(p0, t80);
+ qs0 = _mm_subs_epi16(q0, t80);
+
+ filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd),
+ hev);
+ work_a = _mm_subs_epi16(qs0, ps0);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+ filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+ // Filter1 >> 3, Filter2 >> 3
+ filter1 = _mm_srai_epi16(filter1, 0x3);
+ filter2 = _mm_srai_epi16(filter2, 0x3);
+
+ qs0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
+ ps0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(hev, filt);
+ qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+ t80);
+ ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+ t80);
+
+ // end highbd_filter4
+ // loopfilter done
+
+ // highbd_flat_mask4
+ flat = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
+ _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)),
+ _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
+ flat = _mm_max_epi16(work, flat);
+ work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ flat = _mm_max_epi16(work, flat);
+
+ if (bd == 8)
+ flat = _mm_subs_epu16(flat, one);
+ else if (bd == 10)
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ // end flat_mask4
+
+ // flat & mask = flat && mask (as used in filter8)
+ // (because, in both vars, each 16-bit lane is either all 1s or all 0s)
+ flat = _mm_and_si128(flat, mask);
+
+ p5 = _mm_load_si128((__m128i *)(s - 6 * pitch));
+ q5 = _mm_load_si128((__m128i *)(s + 5 * pitch));
+ p6 = _mm_load_si128((__m128i *)(s - 7 * pitch));
+ q6 = _mm_load_si128((__m128i *)(s + 6 * pitch));
+ p7 = _mm_load_si128((__m128i *)(s - 8 * pitch));
+ q7 = _mm_load_si128((__m128i *)(s + 7 * pitch));
+
+ // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
+ // but referred to as p0-p4 & q0-q4 in fn)
+ flat2 = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)),
+ _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4)));
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)),
+ _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5)));
+ flat2 = _mm_max_epi16(work, flat2);
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)),
+ _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6)));
+ flat2 = _mm_max_epi16(work, flat2);
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)),
+ _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7)));
+ flat2 = _mm_max_epi16(work, flat2);
+
+ if (bd == 8)
+ flat2 = _mm_subs_epu16(flat2, one);
+ else if (bd == 10)
+ flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));
+
+ flat2 = _mm_cmpeq_epi16(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ // end highbd_flat_mask5
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ eight = _mm_set1_epi16(8);
+ four = _mm_set1_epi16(4);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ flat2_p0 =
+ _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4);
+ flat2_q0 =
+ _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4);
+ flat_p0 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3);
+ flat_q0 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3);
+
+ sum_p7 = _mm_add_epi16(p7, p7);
+ sum_q7 = _mm_add_epi16(q7, q7);
+ sum_p3 = _mm_add_epi16(p3, p3);
+ sum_q3 = _mm_add_epi16(q3, q3);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
+ flat2_p1 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
+ flat2_q1 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
+ flat_p1 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3);
+ flat_q1 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ sum_p3 = _mm_add_epi16(sum_p3, p3);
+ sum_q3 = _mm_add_epi16(sum_q3, q3);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
+ flat2_p2 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4);
+ flat2_q2 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
+ flat_p2 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3);
+ flat_q2 = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
+ flat2_p3 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4);
+ flat2_q3 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
+ flat2_p4 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4);
+ flat2_q4 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
+ flat2_p5 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4);
+ flat2_q5 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
+ flat2_p6 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4);
+ flat2_q6 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ // highbd_filter8
+ p2 = _mm_andnot_si128(flat, p2);
+ // p2 remains unchanged if !(flat && mask)
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ // when (flat && mask)
+ p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values
+
+ ps1 = _mm_andnot_si128(flat, ps1);
+ // p1 takes the value assigned to it in filter4 if !(flat && mask)
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ // when (flat && mask)
+ p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values
+ qs1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values
+
+ ps0 = _mm_andnot_si128(flat, ps0);
+ // p0 takes the value assigned to it in filter4 if !(flat && mask)
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ // when (flat && mask)
+ p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values
+ qs0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values
+ // end highbd_filter8
+
+ // highbd_filter16
+ p6 = _mm_andnot_si128(flat2, p6);
+ // p6 remains unchanged if !(flat2 && flat && mask)
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ // get values for when (flat2 && flat && mask)
+ p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values
+ q6 = _mm_andnot_si128(flat2, q6);
+ // q6 remains unchanged if !(flat2 && flat && mask)
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ // get values for when (flat2 && flat && mask)
+ q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values
+ _mm_store_si128((__m128i *)(s - 7 * pitch), p6);
+ _mm_store_si128((__m128i *)(s + 6 * pitch), q6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ // p5 remains unchanged if !(flat2 && flat && mask)
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ // get values for when (flat2 && flat && mask)
+ p5 = _mm_or_si128(p5, flat2_p5);
+ // full list of p5 values
+ q5 = _mm_andnot_si128(flat2, q5);
+ // q5 remains unchanged if !(flat2 && flat && mask)
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ // get values for when (flat2 && flat && mask)
+ q5 = _mm_or_si128(q5, flat2_q5);
+ // full list of q5 values
+ _mm_store_si128((__m128i *)(s - 6 * pitch), p5);
+ _mm_store_si128((__m128i *)(s + 5 * pitch), q5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ // p4 remains unchanged if !(flat2 && flat && mask)
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ // get values for when (flat2 && flat && mask)
+ p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values
+ q4 = _mm_andnot_si128(flat2, q4);
+ // q4 remains unchanged if !(flat2 && flat && mask)
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ // get values for when (flat2 && flat && mask)
+ q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values
+ _mm_store_si128((__m128i *)(s - 5 * pitch), p4);
+ _mm_store_si128((__m128i *)(s + 4 * pitch), q4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ // p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ // get values for when (flat2 && flat && mask)
+ p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values
+ q3 = _mm_andnot_si128(flat2, q3);
+ // q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ // get values for when (flat2 && flat && mask)
+ q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values
+ _mm_store_si128((__m128i *)(s - 4 * pitch), p3);
+ _mm_store_si128((__m128i *)(s + 3 * pitch), q3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ // p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ // get values for when (flat2 && flat && mask)
+ p2 = _mm_or_si128(p2, flat2_p2);
+ // full list of p2 values
+ q2 = _mm_andnot_si128(flat2, q2);
+ // q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ // get values for when (flat2 && flat && mask)
+ q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values
+ _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
+ _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ // p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ // get values for when (flat2 && flat && mask)
+ p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values
+ q1 = _mm_andnot_si128(flat2, q1);
+ // q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ // get values for when (flat2 && flat && mask)
+ q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values
+ _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ // p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ // get values for when (flat2 && flat && mask)
+ p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values
+ q0 = _mm_andnot_si128(flat2, q0);
+ // q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ // get values for when (flat2 && flat && mask)
+ q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values
+ _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_store_si128((__m128i *)(s - 0 * pitch), q0);
+}
+
+void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd);
+ vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd);
+}
+
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit_v, limit_v, thresh_v;
+ __m128i mask, hev, flat;
+ __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
+ __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
+ __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
+ __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
+ __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
+ __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
+ __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
+ __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_cmpeq_epi16(one, one);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_shft;
+
+ const __m128i t4 = _mm_set1_epi16(4);
+ const __m128i t3 = _mm_set1_epi16(3);
+ __m128i t80;
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i ps1, ps0, qs0, qs1;
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ if (bd == 8) {
+ blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+ limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+ thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
+ t80 = _mm_set1_epi16(0x80);
+ } else if (bd == 10) {
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
+ t80 = _mm_set1_epi16(0x200);
+ } else { // bd == 12
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
+ t80 = _mm_set1_epi16(0x800);
+ }
+
+ ps1 = _mm_subs_epi16(p1, t80);
+ ps0 = _mm_subs_epi16(p0, t80);
+ qs0 = _mm_subs_epi16(q0, t80);
+ qs1 = _mm_subs_epi16(q1, t80);
+
+ // filter_mask and hev_mask
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+ abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu16(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
+ mask = _mm_max_epi16(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ mask = _mm_max_epi16(abs_q1q0, mask);
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+ mask = _mm_max_epi16(work, mask);
+ mask = _mm_subs_epu16(mask, limit_v);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // flat_mask4
+ flat = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
+ flat = _mm_max_epi16(work, flat);
+ flat = _mm_max_epi16(abs_p1p0, flat);
+ flat = _mm_max_epi16(abs_q1q0, flat);
+
+ if (bd == 8)
+ flat = _mm_subs_epu16(flat, one);
+ else if (bd == 10)
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask); // flat & mask
+
+ // The constant four is added before the shift for the rounding part of
+ // ROUND_POWER_OF_TWO
+
+ workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
+
+ // lp filter
+ filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+ filt = _mm_and_si128(filt, hev);
+ work_a = _mm_subs_epi16(qs0, ps0);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = signed_char_clamp_bd_sse2(filt, bd);
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi16(filt, t4);
+ filter2 = _mm_adds_epi16(filt, t3);
+
+ // Filter1 >> 3
+ filter1 = signed_char_clamp_bd_sse2(filter1, bd);
+ filter1 = _mm_srai_epi16(filter1, 3);
+
+ // Filter2 >> 3
+ filter2 = signed_char_clamp_bd_sse2(filter2, bd);
+ filter2 = _mm_srai_epi16(filter2, 3);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+ filt = _mm_andnot_si128(hev, filt);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ q0 = _mm_load_si128((__m128i *)flat_oq0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ q1 = _mm_load_si128((__m128i *)flat_oq1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q2 = _mm_load_si128((__m128i *)flat_oq2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ p0 = _mm_load_si128((__m128i *)flat_op0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ p1 = _mm_load_si128((__m128i *)flat_op1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p2 = _mm_load_si128((__m128i *)flat_op2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+
+ _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
+ _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_store_si128((__m128i *)(s + 0 * pitch), q0);
+ _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
+ _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
+}
+
+void vpx_highbd_lpf_horizontal_8_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit_v, limit_v, thresh_v;
+ __m128i mask, hev, flat;
+ __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+ __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+ __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+ __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+ __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+ const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+ __m128i work;
+ const __m128i t4 = _mm_set1_epi16(4);
+ const __m128i t3 = _mm_set1_epi16(3);
+ __m128i t80;
+ __m128i tff80;
+ __m128i tffe0;
+ __m128i t1f;
+ // t1f is equivalent to shifting 0x1f left by bitdepth - 8
+ // and setting new bits to 1
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i t7f;
+ // t7f is equivalent to shifting 0x7f left by bitdepth - 8
+ // and setting new bits to 1
+ __m128i ps1, ps0, qs0, qs1;
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ if (bd == 8) {
+ blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+ limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+ thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
+ t80 = _mm_set1_epi16(0x80);
+ tff80 = _mm_set1_epi16((int16_t)0xff80);
+ tffe0 = _mm_set1_epi16((int16_t)0xffe0);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
+ } else if (bd == 10) {
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
+ tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 2);
+ tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 2);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
+ } else { // bd == 12
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
+ tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 4);
+ tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 4);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
+ }
+
+ ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
+ ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
+ qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
+ qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
+
+ // filter_mask and hev_mask
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu16(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
+ mask = _mm_max_epi16(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(
+ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+ mask = _mm_max_epi16(work, mask);
+ mask = _mm_subs_epu16(mask, limit_v);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // filter4
+ filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+ filt = _mm_and_si128(filt, hev);
+ work_a = _mm_subs_epi16(qs0, ps0);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+ filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+ // Filter1 >> 3
+ work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0
+ filter1 = _mm_and_si128(filter1, t1f); // clamp the range
+ filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi16(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, tffe0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filter1, t1);
+ work_a = _mm_cmpgt_epi16(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, tff80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ q0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
+ q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+ t80);
+ p0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
+ p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+ t80);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
+}
+
+void vpx_highbd_lpf_horizontal_4_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
+}
+
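+// Transposes num_8x8_to_transpose 8x8 blocks of 16-bit pixels from src[] to
+// dst[] using 16-, 32- and 64-bit unpack steps.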
+static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
+ int out_p, int num_8x8_to_transpose) {
+ int idx8x8 = 0;
+ __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
+ do {
+ uint16_t *in = src[idx8x8];
+ uint16_t *out = dst[idx8x8];
+
+ p0 =
+ _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
+ p1 =
+ _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
+ p2 =
+ _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
+ p3 =
+ _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
+ p4 =
+ _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
+ p5 =
+ _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
+ p6 =
+ _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
+ p7 =
+ _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
+ // 00 10 01 11 02 12 03 13
+ x0 = _mm_unpacklo_epi16(p0, p1);
+ // 20 30 21 31 22 32 23 33
+ x1 = _mm_unpacklo_epi16(p2, p3);
+ // 40 50 41 51 42 52 43 53
+ x2 = _mm_unpacklo_epi16(p4, p5);
+ // 60 70 61 71 62 72 63 73
+ x3 = _mm_unpacklo_epi16(p6, p7);
+ // 00 10 20 30 01 11 21 31
+ x4 = _mm_unpacklo_epi32(x0, x1);
+ // 40 50 60 70 41 51 61 71
+ x5 = _mm_unpacklo_epi32(x2, x3);
+ // 00 10 20 30 40 50 60 70
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 01 11 21 31 41 51 61 71
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
+ // 00 10 20 30 40 50 60 70
+ _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
+ // 01 11 21 31 41 51 61 71
+
+ // 02 12 22 32 03 13 23 33
+ x4 = _mm_unpackhi_epi32(x0, x1);
+ // 42 52 62 72 43 53 63 73
+ x5 = _mm_unpackhi_epi32(x2, x3);
+ // 02 12 22 32 42 52 62 72
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
+ // 02 12 22 32 42 52 62 72
+ _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
+ // 03 13 23 33 43 53 63 73
+
+ // 04 14 05 15 06 16 07 17
+ x0 = _mm_unpackhi_epi16(p0, p1);
+ // 24 34 25 35 26 36 27 37
+ x1 = _mm_unpackhi_epi16(p2, p3);
+ // 44 54 45 55 46 56 47 57
+ x2 = _mm_unpackhi_epi16(p4, p5);
+ // 64 74 65 75 66 76 67 77
+ x3 = _mm_unpackhi_epi16(p6, p7);
+ // 04 14 24 34 05 15 25 35
+ x4 = _mm_unpacklo_epi32(x0, x1);
+ // 44 54 64 74 45 55 65 75
+ x5 = _mm_unpacklo_epi32(x2, x3);
+ // 04 14 24 34 44 54 64 74
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 05 15 25 35 45 55 65 75
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
+ // 04 14 24 34 44 54 64 74
+ _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
+ // 05 15 25 35 45 55 65 75
+
+ // 06 16 26 36 07 17 27 37
+ x4 = _mm_unpackhi_epi32(x0, x1);
+ // 46 56 66 76 47 57 67 77
+ x5 = _mm_unpackhi_epi32(x2, x3);
+ // 06 16 26 36 46 56 66 76
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
+ // 06 16 26 36 46 56 66 76
+ _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
+ // 07 17 27 37 47 57 67 77
+ } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
+ uint16_t *out, int out_p) {
+ uint16_t *src0[1];
+ uint16_t *src1[1];
+ uint16_t *dest0[1];
+ uint16_t *dest1[1];
+ src0[0] = in0;
+ src1[0] = in1;
+ dest0[0] = out;
+ dest1[0] = out + 8;
+ highbd_transpose(src0, in_p, dest0, out_p, 1);
+ highbd_transpose(src1, in_p, dest1, out_p, 1);
+}
+
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
+ uint16_t *src[1];
+ uint16_t *dst[1];
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ highbd_transpose(src, pitch, dst, 8, 1);
+
+ // Loop filtering
+ vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ highbd_transpose(src, 8, dst, pitch, 1);
+}
+
+void vpx_highbd_lpf_vertical_4_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ // Transpose 8x16
+ highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+ dst[0] = s - 4;
+ dst[1] = s - 4 + pitch * 8;
+
+ // Transpose back
+ highbd_transpose(src, 16, dst, pitch, 2);
+}
+
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
+ uint16_t *src[1];
+ uint16_t *dst[1];
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ highbd_transpose(src, pitch, dst, 8, 1);
+
+ // Loop filtering
+ vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ highbd_transpose(src, 8, dst, pitch, 1);
+}
+
+void vpx_highbd_lpf_vertical_8_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ // Transpose 8x16
+ highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+
+ dst[0] = s - 4;
+ dst[1] = s - 4 + pitch * 8;
+
+ // Transpose back
+ highbd_transpose(src, 16, dst, pitch, 2);
+}
+
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ src[0] = s - 8;
+ src[1] = s;
+ dst[0] = t_dst;
+ dst[1] = t_dst + 8 * 8;
+
+ // Transpose 16x8
+ highbd_transpose(src, pitch, dst, 8, 2);
+
+ // Loop filtering
+ vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
+ bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8 * 8;
+ dst[0] = s - 8;
+ dst[1] = s;
+
+ // Transpose back
+ highbd_transpose(src, 8, dst, pitch, 2);
+}
+
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
+
+ // Transpose 16x16
+ highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+ highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
+
+ // Loop filtering
+ vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit,
+ thresh, bd);
+
+ // Transpose back
+ highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+ highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch,
+ pitch);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 0000000000..fbebd7db1c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i sign = _mm_srai_epi16(*p, 15);
+ const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+ const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void update_qp(__m256i *qp) {
+ int i;
+ for (i = 0; i < 5; ++i) {
+ qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+ }
+}
+
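+// Sign-extends the five 16-bit quantizer tables to 32 bits. For
+// log_scale > 0 (presumably the 32x32 transform path) zbin and round are
+// pre-scaled with rounding.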
+static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const int16_t *quant_shift_ptr,
+ __m256i *qp, int log_scale) {
+ const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
+ const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+ const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
+ init_one_qp(&zbin, &qp[0]);
+ init_one_qp(&round, &qp[1]);
+ init_one_qp(&quant, &qp[2]);
+ init_one_qp(&dequant, &qp[3]);
+ init_one_qp(&quant_shift, &qp[4]);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
+ qp[0] = _mm256_add_epi32(qp[0], rnd);
+ qp[0] = _mm256_srai_epi32(qp[0], log_scale);
+
+ qp[1] = _mm256_add_epi32(qp[1], rnd);
+ qp[1] = _mm256_srai_epi32(qp[1], log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+ // calculating the zbin mask.
+ qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
+}
+
+// Note:
+// Multiplies each of the 8 int32_t lanes of *x by the corresponding lane of
+// *y using 64-bit intermediates, shifts every product right by 16 and returns
+// the low 32 bits of each result.
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x,
+ const __m256i *y) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
+ __m256i eobmax,
+ __m256i nz_mask) {
+ const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+ const __m256i packed_nz_mask_perm =
+ _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+ const __m256i iscan =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+ const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+ return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+// Get the max eob from the lower 128 bits.
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+ return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+ return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize(const __m256i *qp,
+ const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (_mm256_movemask_epi8(zbin_mask) == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+ {
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]);
+ const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ }
+}
+
+void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int step = 8;
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ (void)scan;
+
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0);
+
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
+
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+ const __m256i *y,
+ int log_scale) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE void quantize_b_32x32(
+ const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (_mm256_movemask_epi8(zbin_mask) == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+
+ {
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1);
+ const __m256i abs_dq =
+ _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ }
+}
+
+void vpx_highbd_quantize_b_32x32_avx2(
+ const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+ const unsigned int step = 8;
+ intptr_t n_coeffs = 32 * 32;
+ const int16_t *iscan = scan_order->iscan;
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+
+ init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr,
+ mb_plane->quant_shift, qp, 1);
+
+ quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
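
For reference, the per-coefficient arithmetic that quantize() vectorizes over 8 coefficients is the scalar recipe already quoted in the comments above: add the rounding term, take a 16-bit fixed-point product with quant, add it back, scale by quant_shift, then multiply by dequant. A minimal scalar sketch of the log_scale == 0 path (quantize_coeff_c is illustrative only and omits the eob bookkeeping that the vector code tracks through iscan):

/* One coefficient of the quantize_b path above, written out in scalar form.
 * rc != 0 selects the AC entry of each per-plane table, rc == 0 the DC entry. */
#include <stdint.h>

static void quantize_coeff_c(int32_t coeff, int rc, const int16_t *zbin,
                             const int16_t *round, const int16_t *quant,
                             const int16_t *quant_shift, const int16_t *dequant,
                             int32_t *qcoeff, int32_t *dqcoeff) {
  const int coeff_sign = coeff >> 31; /* 0 or -1 */
  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
  if (abs_coeff < zbin[rc != 0]) { /* inside the dead zone: quantize to zero */
    *qcoeff = 0;
    *dqcoeff = 0;
    return;
  }
  {
    const int64_t tmp1 = abs_coeff + round[rc != 0];
    const int64_t tmp2 = ((tmp1 * quant[rc != 0]) >> 16) + tmp1;
    const int abs_q = (int)((tmp2 * quant_shift[rc != 0]) >> 16);
    *qcoeff = (abs_q ^ coeff_sign) - coeff_sign; /* restore the sign */
    *dqcoeff = *qcoeff * dequant[rc != 0];
  }
}

The 32x32 variant differs only in the pre-halved zbin/round values, the >> 15 applied to the quant_shift product, and the halving of the dequantized value, i.e. a log_scale of 1.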
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
new file mode 100644
index 0000000000..a5d874f3bc
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i, j, non_zero_regs = (int)count / 4, eob_i = 0;
+ __m128i zbins[2];
+ __m128i nzbins[2];
+
+ zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
+ (int)zbin_ptr[0]);
+ zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ (void)scan;
+
+ memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = ((int)count / 4) - 1; i >= 0; i--) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (test == 0xffff)
+ non_zero_regs--;
+ else
+ break;
+ }
+
+ // Quantization pass:
+ for (i = 0; i < non_zero_regs; i++) {
+ __m128i coeffs, coeffs_sign, tmp1, tmp2;
+ int test;
+ int abs_coeff[4];
+ int coeff_sign[4];
+
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ coeffs_sign = _mm_srai_epi32(coeffs, 31);
+ coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ test = _mm_movemask_epi8(tmp1);
+ _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+ for (j = 0; j < 4; j++) {
+ if (test & (1 << (4 * j))) {
+ int k = 4 * i + j;
+ const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+ const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+ qcoeff_ptr[k] =
+ (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+ if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+ }
+ }
+ }
+ *eob_ptr = eob_i;
+}
+
+void vpx_highbd_quantize_b_32x32_sse2(
+ const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = 0;
+ const intptr_t n_coeffs = 32 * 32;
+ const int16_t *iscan = scan_order->iscan;
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1);
+
+ zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+ }
+ *eob_ptr = eob;
+}
+#endif
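
A note on the pre-scan used by the SSE2 kernels above: the two signed compares flag every coefficient lying strictly inside (-zbin, zbin), and _mm_movemask_epi8 turns that into four mask bits per 32-bit lane, so a lane whose four bits are all clear holds a coefficient with |coeff| >= zbin and is kept for the scalar quantization pass. A small sketch of that mask interpretation (lane_is_candidate is a hypothetical helper named here only for illustration):

/* test is the 16-bit result of _mm_movemask_epi8 on the combined compare and
 * lane is 0..3; mirrors the !(test & 0xf), !(test & 0xf0), ... checks in
 * vpx_highbd_quantize_b_32x32_sse2 above. */
static int lane_is_candidate(int test, int lane) {
  return (test & (0xf << (4 * lane))) == 0;
}

The non-32x32 kernel runs the same compare from the end of the block and simply trims non_zero_regs, while the 32x32 kernel records the surviving positions in idx_arr[] and quantizes only those.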
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
new file mode 100644
index 0000000000..e483fdce73
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
@@ -0,0 +1,462 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h> // AVX2
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+ uint32_t sad_array[4]) {
+ const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
+ const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
+ const __m256i t2 = _mm256_hadd_epi32(t0, t1);
+ const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
+ _mm256_extractf128_si256(t2, 1));
+ _mm_storeu_si128((__m128i *)sad_array, sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ int x;
+
+ for (x = 0; x < 4; ++x) {
+ __m256i r[4];
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[x]);
+ r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48));
+
+ // absolute differences between each ref[] and src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3));
+
+ // sum every abs diff
+ sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1]));
+ sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3]));
+ }
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4], int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < (n / 2); ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2);
+
+ /* Fold sums_16 into sums_32 every 2 rows so the 16-bit lanes cannot
+ * overflow. */
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 1;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
+#define HIGHBD_SAD64XNX4D(n) \
+ void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
+ n); \
+ }
+
+#define HIGHBD_SADSKIP64XNx4D(n) \
+ void vpx_highbd_sad_skip_64x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, n / 2); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; i++) {
+ __m256i r[8];
+
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src);
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16));
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16));
+ r[4] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16));
+ r[6] = _mm256_loadu_si256((const __m256i *)refs[3]);
+ r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16));
+
+ // absolute differences between each ref[] and src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2));
+ r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s));
+ r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2));
+ r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s));
+ r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2));
+
+ // sum every abs diff
+ sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1]));
+ sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3]));
+ sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5]));
+ sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7]));
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4], int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < (n / 8); ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+ /* Fold sums_16 into sums_32 every 8 rows so the 16-bit lanes cannot
+ * overflow. */
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 3;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
+#define HIGHBD_SAD32XNX4D(n) \
+ void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
+ n); \
+ }
+
+#define HIGHBD_SADSKIP32XNx4D(n) \
+ void vpx_highbd_sad_skip_32x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, n / 2); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; i++) {
+ __m256i r[4];
+
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src);
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+
+ // absolute differences between each ref[] and src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s));
+
+ // sum every abs diff
+ sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]);
+ sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]);
+ sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]);
+ sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]);
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4], int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ const int height = VPXMIN(16, n);
+ const int num_iters = n / height;
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < num_iters; ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height);
+
+ // Fold sums_16 into sums_32 every 16 rows so the 16-bit lanes cannot overflow.
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 4;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
+#define HIGHBD_SAD16XNX4D(n) \
+ void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
+ n); \
+ }
+
+#define HIGHBD_SADSKIP16XNx4D(n) \
+ void vpx_highbd_sad_skip_16x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, n / 2); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+ {
+ __m256i sums_32[4];
+ sums_32[0] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+ sums_32[1] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+ sums_32[2] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+ sums_32[3] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+ calc_final_4(sums_32, sad_array);
+ }
+}
+
+void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+ {
+ __m256i sums_32[4];
+ sums_32[0] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+ sums_32[1] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+ sums_32[2] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+ sums_32[3] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+ calc_final_4(sums_32, sad_array);
+ }
+}
+
+// clang-format off
+HIGHBD_SAD64XNX4D(64)
+HIGHBD_SADSKIP64XNx4D(64)
+
+HIGHBD_SAD64XNX4D(32)
+HIGHBD_SADSKIP64XNx4D(32)
+
+HIGHBD_SAD32XNX4D(64)
+HIGHBD_SADSKIP32XNx4D(64)
+
+HIGHBD_SAD32XNX4D(32)
+HIGHBD_SADSKIP32XNx4D(32)
+
+HIGHBD_SAD32XNX4D(16)
+HIGHBD_SADSKIP32XNx4D(16)
+
+HIGHBD_SAD16XNX4D(32)
+HIGHBD_SADSKIP16XNx4D(32)
+
+HIGHBD_SADSKIP16XNx4D(16)
+
+HIGHBD_SADSKIP16XNx4D(8)
+ // clang-format on
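
For orientation, each x4d kernel above evaluates the same source block against four candidate references at once and returns four SADs; the 16-bit lane accumulators are periodically widened into 32-bit sums because, with 12-bit input, a single absolute difference can be as large as 4095 and a 64-wide row already adds four such values to every lane. A scalar sketch of the quantity being computed (sad_16bit_x4d_c is illustrative, not libvpx's C reference):

/* Four SADs of one width x height high-bit-depth block against four refs. */
#include <stdint.h>
#include <stdlib.h>

static void sad_16bit_x4d_c(const uint16_t *src, int src_stride,
                            const uint16_t *const ref[4], int ref_stride,
                            int width, int height, uint32_t sad[4]) {
  int k, r, c;
  for (k = 0; k < 4; ++k) {
    uint32_t sum = 0;
    for (r = 0; r < height; ++r) {
      for (c = 0; c < width; ++c) {
        sum += abs((int)src[r * src_stride + c] -
                   (int)ref[k][r * ref_stride + c]);
      }
    }
    sad[k] = sum;
  }
}

The skip variants run the same computation on every other row (by doubling both strides and halving the row count) and then double the result, which is what the <<= 1 statements in the HIGHBD_SADSKIP* macros account for.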
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
new file mode 100644
index 0000000000..a07892d811
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
@@ -0,0 +1,326 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_4x2x4 5-6 0
+ movh m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m4, [ref1q+%5*2]
+ movhps m5, [ref2q+%5*2]
+ movhps m6, [ref3q+%5*2]
+ movhps m7, [ref4q+%5*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ movu m2, [ref1q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m2, [ref1q+%5*2]
+ mova m3, m0
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m4, m2
+
+ movu m2, [ref2q+%3*2]
+ mova m3, m0
+ movhps m2, [ref2q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m5, m2
+
+ movu m2, [ref3q+%3*2]
+ mova m3, m0
+ movhps m2, [ref3q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m6, m2
+
+ movu m2, [ref4q+%3*2]
+ mova m3, m0
+ movhps m2, [ref4q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
+
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_8x2x4 5-6 0
+ ; 1st 8 px
+ mova m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ mova m3, m0
+ movu m2, [ref1q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+
+ ; 2nd 8 px
+ mova m0, [srcq +(%4)*2]
+ mova m3, m0
+ movu m2, [ref1q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_16x2x4 5-6 0
+ HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
+ HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_32x2x4 5-6 0
+ HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
+ HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_64x2x4 5-6 0
+ HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
+ HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
+%endmacro
+
+; void vpx_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
+; 8x16, 8x8, 8x4, 4x8 or 4x4
+; Macro Arguments:
+; 1: Width
+; 2: Height
+; 3: If 0, then normal sad, if 2, then skip every other row
+%macro HIGH_SADNXN4D 2-3 0
+%if %3 == 0 ; normal sad
+%if UNIX64
+cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+%else ; %3 == 2, downsample
+%if UNIX64
+cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif ;
+%endif ; sad/avg/skip
+
+; set m1
+ push srcq
+ mov srcd, 0x00010001
+ movd m1, srcd
+ pshufd m1, m1, 0x0
+ pop srcq
+
+%if %3 == 2 ; skip rows
+ lea src_strided, [2*src_strided]
+ lea ref_strided, [2*ref_strided]
+%endif ; skip rows
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+; convert byte pointers to short pointers
+ shl srcq, 1
+ shl ref2q, 1
+ shl ref3q, 1
+ shl ref4q, 1
+ shl ref1q, 1
+
+ HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%if %3 == 2 ; Downsampling by two
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+%undef num_rep
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+ ; N.B. HIGH_PROCESS outputs dwords (32 bits)
+ ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
+ movhlps m0, m4
+ movhlps m1, m5
+ movhlps m2, m6
+ movhlps m3, m7
+ paddd m4, m0
+ paddd m5, m1
+ paddd m6, m2
+ paddd m7, m3
+ punpckldq m4, m5
+ punpckldq m6, m7
+ movhlps m0, m4
+ movhlps m1, m6
+ paddd m4, m0
+ paddd m6, m1
+ punpcklqdq m4, m6
+%if %3 == 2 ; skip rows
+ pslld m4, 1
+%endif
+ movifnidn r4, r4mp
+ movu [r4], m4
+ RET
+%endmacro
+
+
+INIT_XMM sse2
+HIGH_SADNXN4D 64, 64
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16, 8
+HIGH_SADNXN4D 8, 16
+HIGH_SADNXN4D 8, 8
+HIGH_SADNXN4D 8, 4
+HIGH_SADNXN4D 4, 8
+HIGH_SADNXN4D 4, 4
+
+HIGH_SADNXN4D 64, 64, 2
+HIGH_SADNXN4D 64, 32, 2
+HIGH_SADNXN4D 32, 64, 2
+HIGH_SADNXN4D 32, 32, 2
+HIGH_SADNXN4D 32, 16, 2
+HIGH_SADNXN4D 16, 32, 2
+HIGH_SADNXN4D 16, 16, 2
+HIGH_SADNXN4D 16, 8, 2
+HIGH_SADNXN4D 8, 16, 2
+HIGH_SADNXN4D 8, 8, 2
+HIGH_SADNXN4D 4, 8, 2
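
The assembly above computes the absolute difference of unsigned 16-bit pixels without a dedicated instruction: two saturating subtractions are OR'd together, since whichever operand is smaller saturates its difference to zero, and pmaddwd against the 0x0001 words loaded into m1 then folds adjacent 16-bit differences into 32-bit partial sums. A scalar sketch of the absolute-difference identity (the helpers are illustrative only):

#include <stdint.h>

/* psubusw on a single lane: unsigned saturating subtraction. */
static uint16_t sat_sub_u16(uint16_t a, uint16_t b) {
  return (uint16_t)(a > b ? a - b : 0);
}

/* |a - b| == sat_sub(a, b) | sat_sub(b, a): one of the two terms is zero. */
static uint16_t abs_diff_u16(uint16_t a, uint16_t b) {
  return (uint16_t)(sat_sub_u16(a, b) | sat_sub_u16(b, a));
}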
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
new file mode 100644
index 0000000000..78f8eb8bfa
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
@@ -0,0 +1,522 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
+ const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8));
+ const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4));
+ const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1),
+ _mm256_extractf128_si256(t1, 1));
+ return (unsigned int)_mm_cvtsi128_si32(sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+ // absolute differences between ref and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
+ const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
+ // sum every abs diff
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < (n / 2); ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2);
+
+ /* Fold sums_16 into sums_32 every 2 rows so the 16-bit lanes cannot
+ * overflow. */
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
+ }
+ return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD64XN(n) \
+ unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \
+ }
+
+#define HIGHBD_SADSKIP64xN(n) \
+ unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+ n / 2); \
+ }
+
+static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ // absolute differences between ref and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < (n / 8); ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8);
+
+ /* Fold sums_16 into sums_32 every 8 rows so the 16-bit lanes cannot
+ * overflow. */
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 3;
+ ref += ref_stride << 3;
+ }
+ return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD32XN(n) \
+ unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \
+ }
+
+#define HIGHBD_SADSKIP32xN(n) \
+ unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+ n / 2); \
+ }
+
+static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; i += 2) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ // absolute differences between ref and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
+ }
+}
+
+static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ const int height = VPXMIN(16, n);
+ const int num_iters = n / height;
+ int i;
+
+ for (i = 0; i < num_iters; ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height);
+
+ // Fold sums_16 into sums_32 every 16 rows so the 16-bit lanes cannot overflow.
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ }
+ return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD16XN(n) \
+ unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \
+ }
+
+#define HIGHBD_SADSKIP16xN(n) \
+ unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+ n / 2); \
+ }
+
+unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+// clang-format off
+HIGHBD_SAD64XN(64)
+HIGHBD_SADSKIP64xN(64)
+HIGHBD_SAD64XN(32)
+HIGHBD_SADSKIP64xN(32)
+HIGHBD_SAD32XN(64)
+HIGHBD_SADSKIP32xN(64)
+HIGHBD_SAD32XN(32)
+HIGHBD_SADSKIP32xN(32)
+HIGHBD_SAD32XN(16)
+HIGHBD_SADSKIP32xN(16)
+HIGHBD_SAD16XN(32)
+HIGHBD_SADSKIP16xN(32)
+HIGHBD_SADSKIP16xN(16)
+HIGHBD_SADSKIP16xN(8)
+// clang-format on
+
+// AVG -------------------------------------------------------------------------
+static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32));
+ const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+ const __m256i avg2 = _mm256_avg_epu16(r2, x2);
+ const __m256i avg3 = _mm256_avg_epu16(r3, x3);
+ // absolute differences between the ref/pred average and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2));
+ const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3));
+ // sum every abs diff
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+ src += src_stride;
+ ref += ref_stride;
+ sec += 64;
+ }
+}
+
+#define HIGHBD_SAD64XN_AVG(n) \
+ unsigned int vpx_highbd_sad64x##n##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 2); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
+ \
+ /* Fold sums_16 into sums_32 every 2 rows so the 16-bit lanes cannot \
+ * overflow. */ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 1; \
+ ref += ref_stride << 1; \
+ sec += 64 << 1; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 64x64
+HIGHBD_SAD64XN_AVG(64)
+
+// 64x32
+HIGHBD_SAD64XN_AVG(32)
+
+static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+ // absolute differences between the ref/pred average and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride;
+ ref += ref_stride;
+ sec += 32;
+ }
+}
+
+#define HIGHBD_SAD32XN_AVG(n) \
+ unsigned int vpx_highbd_sad32x##n##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 8); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
+ \
+ /* Fold sums_16 into sums_32 every 8 rows so the 16-bit lanes cannot \
+ * overflow. */ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 3; \
+ ref += ref_stride << 3; \
+ sec += 32 << 3; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 32x64
+HIGHBD_SAD32XN_AVG(64)
+
+// 32x32
+HIGHBD_SAD32XN_AVG(32)
+
+// 32x16
+HIGHBD_SAD32XN_AVG(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; i += 2) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+ // absolute differences between the ref/pred average and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
+ sec += 32;
+ }
+}
+
+unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+ // Fold sums_16 into sums_32 every 16 rows so the 16-bit lanes cannot overflow.
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ sec += 16 << 4;
+ }
+ return calc_final(sums_32);
+}
+
+unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
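
The *_avg_avx2 kernels above implement the compound-prediction SAD: every reference sample is first combined with the second predictor using the rounded average that _mm256_avg_epu16 provides, and the SAD is taken against that average. A scalar sketch of the per-block computation (sad_avg_16bit_c is illustrative only; second_pred is assumed to be a packed width x height block, matching the fixed per-row advance of the sec pointer above):

#include <stdint.h>
#include <stdlib.h>

static uint32_t sad_avg_16bit_c(const uint16_t *src, int src_stride,
                                const uint16_t *ref, int ref_stride,
                                const uint16_t *second_pred, int width,
                                int height) {
  uint32_t sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      /* pavgw / _mm256_avg_epu16 computes the rounded average (a + b + 1) >> 1 */
      const int avg =
          (ref[r * ref_stride + c] + second_pred[r * width + c] + 1) >> 1;
      sad += abs((int)src[r * src_stride + c] - avg);
    }
  }
  return sad;
}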
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
new file mode 100644
index 0000000000..62ad2237ff
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
@@ -0,0 +1,416 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
+%macro HIGH_SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%elif %4 == 1 ; avg
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if VPX_ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%else ; %4 == 2, skip rows
+%if %3 == 5
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2 ; double the stride if we are skipping rows
+ lea src_strided, [src_strided*2]
+ lea ref_strided, [ref_strided*2]
+%endif
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+ shl srcq, 1
+ shl refq, 1
+%if %4 == 1
+ shl second_predq, 1
+%endif
+%endmacro
+
+; unsigned int vpx_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD64XN 1-2 0
+ HIGH_SAD_FN 64, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ ; first half of each row
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ paddd m0, m1
+ paddd m0, m3
+ ; second half of each row
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq+64]
+ psubusw m5, m1
+ psubusw m1, [srcq+64]
+ por m1, m5
+ mova m5, [srcq+80]
+ psubusw m5, m2
+ psubusw m2, [srcq+80]
+ por m2, m5
+ mova m5, [srcq+96]
+ psubusw m5, m3
+ psubusw m3, [srcq+96]
+ por m3, m5
+ mova m5, [srcq+112]
+ psubusw m5, m4
+ psubusw m4, [srcq+112]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
+HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
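
The psubusw/por pairs in the loop above implement |src - ref| for unsigned
16-bit samples without a dedicated absolute-difference instruction: one of the
two saturating subtractions clamps to zero, the other keeps the magnitude, and
OR-ing them yields the absolute value. A hedged SSE2 intrinsics equivalent (the
function name is illustrative only):

#include <emmintrin.h>

// |a - b| for eight unsigned 16-bit lanes, mirroring the psubusw/por sequence.
static inline __m128i abs_diff_epu16(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
}
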
+
+
+; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0
+ HIGH_SAD_FN 32, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
+HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
+HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
+
+; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0
+ HIGH_SAD_FN 16, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/4
+%else
+ mov n_rowsd, %1/2
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_strideq*2+16]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+16]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*2+16]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*2]
+ por m3, m5
+ mova m5, [srcq+src_strideq*2+16]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_strideq*2+16]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
+HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
+HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
+HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2
+
+; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD8XN 1-2 0
+ HIGH_SAD_FN 8, %1, 7, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq*2]
+ movu m3, [refq+ref_strideq*4]
+ movu m4, [refq+ref_stride3q*2]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m2
+ psubusw m2, [srcq+src_strideq*2]
+ por m2, m5
+ mova m5, [srcq+src_strideq*4]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*4]
+ por m3, m5
+ mova m5, [srcq+src_stride3q*2]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_stride3q*2]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*8]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
+HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
+HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2
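
The _skip_ variants above double both strides, halve the row count, and double
the accumulated SAD (the final pslld m0, 1), trading a small approximation
error for half the memory traffic. A scalar model of that estimate, with purely
illustrative names:

#include <stdint.h>
#include <stdlib.h>

// SAD over the even rows only, then doubled to estimate the full-block SAD.
static uint32_t highbd_sad_skip_model(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      int width, int height) {
  uint32_t sad = 0;
  int r, c;
  for (r = 0; r < height; r += 2) {
    for (c = 0; c < width; ++c)
      sad += (uint32_t)abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
  }
  return 2 * sad;
}
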
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
new file mode 100644
index 0000000000..5a3a2818de
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -0,0 +1,1021 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+SECTION .text
+
+; int vpx_highbd_sub_pixel_varianceNxh(const uint16_t *src, ptrdiff_t src_stride,
+;                                      int x_offset, int y_offset,
+;                                      const uint16_t *ref, ptrdiff_t ref_stride,
+;                                      int height, unsigned int *sse);
+;
+; This function returns the SE and stores SSE in the given pointer.
+
+%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ mova %4, %3 ; make copies to manipulate to calc sum
+ mova %2, %1 ; use originals for calc sse
+ pmaddwd %3, %3
+ paddw %4, %2
+ pmaddwd %1, %1
+ movhlps %2, %4
+ paddd %6, %3
+ paddw %4, %2
+ pxor %2, %2
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
+ punpcklwd %4, %2 ; sign-extend word to dword
+ paddd %6, %1
+ paddd %5, %4
+
+%endmacro
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+  ; If H=64 and W=16, each of the 8 words in m6 can reach
+  ; 2(1 bit) x 64(6 bits) x 9 bits = 16 bits, i.e. the sum _exactly_ fits in a
+  ; signed word per lane of the xmm register. We have to sign-extend it before
+  ; adding the words within the register and outputting a dword.
+ movhlps m3, m7
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ pshufd m4, m6, 0x1
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ movd [r1], m7 ; store sse
+ movd eax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
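
SUM_SSE accumulates two quantities per call: the signed sum of pixel
differences (folded to four words, then sign-extended to dwords by the
pcmpgtw/punpcklwd pair) and the sum of squared differences via pmaddwd. A
simplified per-vector model in SSE2 intrinsics, not a line-for-line translation
of the macro:

#include <emmintrin.h>

// Accumulate sum and SSE for eight signed 16-bit differences d = src - ref.
static inline void sum_sse_model(__m128i src, __m128i ref, __m128i *sum,
                                 __m128i *sse) {
  const __m128i d = _mm_sub_epi16(src, ref);
  const __m128i sign = _mm_cmpgt_epi16(_mm_setzero_si128(), d);  // 0 > d
  *sum = _mm_add_epi32(*sum, _mm_unpacklo_epi16(d, sign));  // low 4, widened
  *sum = _mm_add_epi32(*sum, _mm_unpackhi_epi16(d, sign));  // high 4, widened
  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(d, d));
}
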
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+ add srcq, src_stridemp
+%else
+ lea srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+
+%if VPX_ARCH_X86_64
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, \
+ second_pred, second_stride, height, sse
+ %define second_str second_strideq
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, \
+ second_pred, second_stride, height, sse
+ %define block_height dword heightm
+ %define second_str second_stridemp
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ ; reuse argument stack space
+ %define g_bilin_filterm x_offsetm
+ %define g_pw_8m y_offsetm
+
+ ; Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, \
+ second_pred, second_stride, height, sse
+ %define block_height dword heightm
+ %define second_str second_stridemp
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+
+%if %1 < 16
+ sar block_height, 1
+%endif
+%if %2 == 1 ; avg
+ shl second_str, 1
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq + 16]
+ mova m1, [refq]
+ mova m3, [refq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m2, [second_predq+16]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea refq, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq + src_strideq*2]
+ mova m1, [refq]
+ mova m3, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m2, [second_predq]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea refq, [refq + ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [refq]
+ mova m3, [refq+16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea refq, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m2, [refq]
+ mova m3, [refq+ref_strideq*2]
+ pavgw m0, m1
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m1, [second_predq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea refq, [refq + ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [refq]
+ mova m3, [refq+16]
+  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). The total number of
+  ; instructions is the same (5), but it is 1 mul instead of 2, so it might be
+  ; slightly faster because of pmullw latency. It would also cut our rodata
+  ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea refq, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m4, m1
+ mova m2, [refq]
+ mova m3, [refq+ref_strideq*2]
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m1, [second_predq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea refq, [refq + ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ mova m2, [refq]
+ mova m3, [refq + 16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea refq, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ mova m2, [refq]
+ mova m3, [refq + ref_strideq*2]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m1, [second_predq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea refq, [refq + ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m1, m3
+ mova m4, [refq]
+ mova m5, [refq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq + src_strideq*2]
+ lea refq, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m2, m3
+ mova m4, [refq]
+ mova m5, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m2, [second_predq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq + src_strideq*4]
+ lea refq, [refq + ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m1, filter_rnd
+ paddw m1, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m1, 4
+ paddw m0, m2
+ mova m2, [refq]
+ psrlw m0, 4
+ mova m3, [refq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ lea srcq, [srcq + src_strideq*2]
+ lea refq, [refq + ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m4, filter_rnd
+ paddw m4, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m4, 4
+ paddw m0, m2
+ mova m2, [refq]
+ psrlw m0, 4
+ mova m3, [refq+ref_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m4, [second_predq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ lea srcq, [srcq + src_strideq*4]
+ lea refq, [refq + ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ mova m4, [refq]
+ mova m5, [refq+16]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m2, [srcq+2]
+ movu m3, [srcq+src_strideq*2+2]
+ mova m4, [refq]
+ mova m5, [refq+ref_strideq*2]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m1, [second_predq]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*4]
+ lea refq, [refq+ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [refq]
+ mova m5, [refq+16]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m1, m3
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [refq]
+ mova m5, [refq+ref_strideq*2]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m2, m3
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m2, [second_predq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq+src_strideq*4]
+ lea refq, [refq+ref_strideq*4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; load the filters - this is the same as in the 8-bit-depth case
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+ shl y_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [bilin_filter+y_offsetq]
+ mova m11, [bilin_filter+y_offsetq+16]
+ mova m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register, so use the src_stride register.
+; Later, src_stride has to be reloaded from the stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+; end of load filter
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ movu m1, [srcq+16]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+16]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m1, filter_rnd
+ mova m2, [refq]
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova m3, [refq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ pavgw m1, [second_predq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea refq, [refq + ref_strideq * 2]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ INC_SRC_BY_SRC_STRIDE
+ movu m3, [srcq]
+ movu m5, [srcq+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m4, filter_rnd
+ mova m2, [refq]
+ paddw m4, m3
+ psrlw m0, 4
+ psrlw m4, 4
+ mova m3, [refq+ref_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [second_predq]
+ add second_predq, second_str
+ pavgw m4, [second_predq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea refq, [refq + ref_strideq * 4]
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
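
The two-tap bilinear step used throughout the macro (pmullw with
filter_x_a/filter_x_b or filter_y_a/filter_y_b, paddw with pw_8, psrlw by 4)
always uses weights that sum to 16, so even for 12-bit input the intermediate
value stays within an unsigned 16-bit word. A scalar sketch of one tap (the
helper name is illustrative):

#include <stdint.h>

// out = (a * (16 - f) + b * f + 8) >> 4, with 0 <= f <= 16 and 12-bit inputs,
// so the intermediate sum never exceeds 4095 * 16 + 8 = 65528.
static inline uint16_t highbd_bilinear_tap(uint16_t a, uint16_t b,
                                           unsigned int f) {
  return (uint16_t)((a * (16u - f) + b * f + 8u) >> 4);
}
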
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
new file mode 100644
index 0000000000..5bee51fa0c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
@@ -0,0 +1,315 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;unsigned int vpx_highbd_calc16x16var_sse2
+;(
+; unsigned char * src_ptr,
+; int src_stride,
+; unsigned char * ref_ptr,
+; int ref_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+globalsym(vpx_highbd_calc16x16var_sse2)
+sym(vpx_highbd_calc16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[src_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+16]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax+16]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+16]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 2
+ jnz .var16loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vpx_highbd_calc8x8var_sse2
+;(
+; unsigned char * src_ptr,
+; int src_stride,
+; unsigned char * ref_ptr,
+; int ref_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+globalsym(vpx_highbd_calc8x8var_sse2)
+sym(vpx_highbd_calc8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[src_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 8
+
+.var8loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rbx+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ lea rbx, [rbx+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 4
+ jnz .var8loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 0000000000..381e0ad193
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,608 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+}
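
The only difference between the three wrappers above is the normalization:
scaling the input by 2^k (k = 2 for 10-bit, k = 4 for 12-bit) scales each
difference by 2^k, the sum by 2^k, and the sum of squares by 2^(2k), so the
accumulators are brought back to an 8-bit scale with rounding shifts of k and
2k bits. A minimal model of that rescaling, assuming ROUND_POWER_OF_TWO is the
usual round-to-nearest shift:

#include <stdint.h>

// Round-to-nearest right shift, as used for the bit-depth normalization.
static inline int64_t round_pow2(int64_t v, int n) {
  return (v + ((int64_t)1 << (n - 1))) >> n;
}

// k = 2 for 10-bit input, k = 4 for 12-bit input.
static inline void normalize_highbd(int k, int64_t *sum, uint64_t *sse) {
  *sum = round_pow2(*sum, k);
  *sse = (uint64_t)round_pow2((int64_t)*sse, 2 * k);
}
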
+
+#define HIGH_GET_VAR(S) \
+ void vpx_highbd_8_get##S##x##S##var_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ } \
+ \
+ void vpx_highbd_10_get##S##x##S##var_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+ } \
+ \
+ void vpx_highbd_12_get##S##x##S##var_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+ }
+
+HIGH_GET_VAR(16)
+HIGH_GET_VAR(8)
+
+#undef HIGH_GET_VAR
+
+#define VAR_FN(w, h, block_size, shift) \
+ uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_8_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> (shift)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_12_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
+
+#undef VAR_FN
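
Each VAR_FN wrapper finishes with the usual variance identity: for a block of
N = w * h pixels (a power of two), variance = SSE - Sum^2 / N, with the
division done as a shift by log2(N) (the shift macro argument). A scalar
restatement of that closing arithmetic:

#include <stdint.h>

// var = sse - sum * sum / 2^log2_area, clamped at zero as in the 10/12-bit
// wrappers above.
static inline uint32_t block_variance(uint32_t sse, int sum, int log2_area) {
  const int64_t var = (int64_t)sse - (((int64_t)sum * sum) >> log2_area);
  return var >= 0 ? (uint32_t)var : 0;
}
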
+
+unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ vpx_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ vpx_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ vpx_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ vpx_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ vpx_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ vpx_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+// These definitions are for functions defined in
+// highbd_subpel_variance_impl_sse2.asm
+#define DECL(w, opt) \
+ int vpx_highbd_sub_pixel_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *ref, ptrdiff_t ref_stride, int height, \
+ unsigned int *sse, void *unused0, void *unused);
+#define DECLS(opt) \
+ DECL(8, opt) \
+ DECL(16, opt)
+
+DECLS(sse2)
+
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \
+ NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \
+ NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ int64_t var; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ ref + (start_row * ref_stride), ref_stride, height, &sse2, NULL, \
+ NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, ref + 16 + (start_row * ref_stride), ref_stride, height, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define FNS(opt) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int64_t))
+
+FNS(sse2)
+
+#undef FNS
+#undef FN
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+#define DECL(w, opt) \
+ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *ref, ptrdiff_t ref_stride, const uint16_t *second, \
+ ptrdiff_t second_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused);
+#define DECLS(opt1) \
+ DECL(16, opt1) \
+ DECL(8, opt1)
+
+DECLS(sse2)
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \
+ sec + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, \
+ sec + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \
+ sec + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \
+ sec + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, \
+ sec + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \
+ sec + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ int64_t var; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ ref + (start_row * ref_stride), ref_stride, sec + (start_row * w), \
+ w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, ref + 16 + (start_row * ref_stride), ref_stride, \
+ sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \
+ sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \
+ sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define FNS(opt1) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt1, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt1, (int64_t)) \
+ FN(8, 16, 8, 4, 3, opt1, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt1, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt1, (int64_t))
+
+FNS(sse2)
+
+#undef FNS
+#undef FN
+
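+// comp_pred[x] = ROUND_POWER_OF_TWO(pred[x] + ref[x], 1); _mm_avg_epu16
+// computes exactly this rounded average in each 16-bit lane.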
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
+ int i, j;
+ if (width > 8) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 16) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
+ const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
+ const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
+ const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
+ _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
+ _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else if (width == 8) {
+ for (i = 0; i < height; i += 2) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
+ const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);
+ const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
+ const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);
+ _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+ _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
+ comp_pred += 8 << 1;
+ pred += 8 << 1;
+ ref += ref_stride << 1;
+ }
+ } else {
+ assert(width == 4);
+ for (i = 0; i < height; i += 2) {
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
+ const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);
+ const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
+ const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);
+ _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+ _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
+ comp_pred += 4 << 1;
+ pred += 4 << 1;
+ ref += ref_stride << 1;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm
new file mode 100644
index 0000000000..61af6236ed
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm
@@ -0,0 +1,860 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pb_1: times 16 db 1
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+dc_128: times 16 db 128
+pw2_4: times 8 dw 2
+pw2_8: times 8 dw 4
+pw2_16: times 8 dw 8
+pw2_32: times 8 dw 16
+
+SECTION .text
+
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+ pavgb %4, %1, %3
+ pxor %3, %1
+ pand %3, [GLOBAL(pb_1)]
+ psubb %4, %3
+ pavgb %4, %2
+%endmacro
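+;
+; Illustrative example with byte values x=3, y=5, z=8:
+;   exact:  (3 + 2*5 + 8 + 2) >> 2 = 23 >> 2 = 5
+;   trick:  avg(3,8) = 6;  6 - ((3^8) & 1) = 5;  avg(5,5) = 5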
+
+INIT_XMM sse2
+cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, temp
+ psrldq m1, m0, 1
+ psrldq m2, m0, 2
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+
+ ; store 4 lines
+ movd [dstq ], m3
+ psrlq m3, 8
+ movd [dstq+strideq ], m3
+ lea dstq, [dstq+strideq*2]
+ psrlq m3, 8
+ movd [dstq ], m3
+ psrlq m3, 8
+ movd [dstq+strideq ], m3
+ psrlq m0, 56
+ movd tempd, m0
+ mov [dstq+strideq+3], tempb
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movu m1, [aboveq]
+ pslldq m0, m1, 1
+ psrldq m2, m1, 1
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+ punpckhbw m0, m0 ; 7 7
+ punpcklwd m0, m0 ; 7 7 7 7
+ punpckldq m0, m0 ; 7 7 7 7 7 7 7 7
+ punpcklqdq m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7
+
+ ; store 4 lines
+ psrldq m3, 1
+ movq [dstq ], m3
+ psrldq m3, 1
+ movq [dstq+strideq ], m3
+ psrldq m3, 1
+ movq [dstq+strideq*2], m3
+ psrldq m3, 1
+ movq [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+
+ ; store next 4 lines
+ psrldq m3, 1
+ movq [dstq ], m3
+ psrldq m3, 1
+ movq [dstq+strideq ], m3
+ psrldq m3, 1
+ movq [dstq+strideq*2], m3
+ psrldq m3, 1
+ movq [dstq+stride3q ], m3
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
+ GET_GOT goffsetq
+
+ movd m0, [leftq] ; abcd [byte]
+ punpcklbw m4, m0, m0 ; aabb ccdd
+ punpcklwd m4, m4 ; aaaa bbbb cccc dddd
+ psrldq m4, 12 ; dddd
+ punpckldq m0, m4 ; abcd dddd
+ psrldq m1, m0, 1 ; bcdd
+ psrldq m2, m0, 2 ; cddd
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; a2bc b2cd c3d d
+ pavgb m1, m0 ; ab, bc, cd, d [byte]
+
+ punpcklbw m1, m3 ; ab, a2bc, bc, b2cd, cd, c3d, d, d
+ movd [dstq ], m1
+ psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d
+ movd [dstq+strideq], m1
+
+ lea dstq, [dstq+strideq*2]
+ psrlq m1, 16 ; cd, c3d, d, d
+ movd [dstq ], m1
+ movd [dstq+strideq], m4 ; d, d, d, d
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ movd m2, [leftq]
+ movd m0, [aboveq]
+ pxor m1, m1
+ punpckldq m0, m2
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [leftq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [aboveq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_8)]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movd m0, [GLOBAL(dc_128)]
+ movd [dstq ], m0
+ movd [dstq+strideq ], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [GLOBAL(dc_128)]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_16)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+
+INIT_XMM sse2
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ psadbw m3, m1
+ psadbw m4, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_32)]
+ psraw m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ mova m2, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movd m0, [aboveq]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
+ movifnidn leftq, leftmp
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0
+ pshufd m1, m0, 0x1
+ movd [dstq ], m0
+ movd [dstq+strideq], m1
+ pshufd m2, m0, 0x2
+ lea dstq, [dstq+strideq*2]
+ pshufd m3, m0, 0x3
+ movd [dstq ], m2
+ movd [dstq+strideq], m3
+ RET
+
+INIT_XMM sse2
+cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -2
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [leftq ]
+ punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8
+.loop:
+ pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1
+ pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2
+ movq [dstq ], m1
+ movq [dstq+strideq], m2
+ pshuflw m1, m0, 0xaa
+ pshuflw m2, m0, 0xff
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
+ inc lineq
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -4
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+.loop:
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+strideq ], m2
+ pshufd m1, m0, 0xaa
+ pshufd m2, m0, 0xff
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ inc lineq
+ lea leftq, [leftq+4 ]
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -8
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+.loop:
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+16 ], m1
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq+16 ], m2
+ pshufd m1, m0, 0xaa
+ pshufd m2, m0, 0xff
+ mova [dstq+strideq*2 ], m1
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m2
+ mova [dstq+stride3q+16 ], m2
+ inc lineq
+ lea leftq, [leftq+4 ]
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
+ pxor m1, m1
+ movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
+ punpcklbw m0, m1
+ pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word]
+ psrldq m0, 2
+ psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
+ movd m2, [leftq]
+ punpcklbw m2, m1
+ pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
+ pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m4, m4
+ packuswb m3, m3
+ movd [dstq ], m4
+ movd [dstq+strideq], m3
+ lea dstq, [dstq+strideq*2]
+ pshuflw m4, m2, 0xaa
+ pshuflw m3, m2, 0xff
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m4, m4
+ packuswb m3, m3
+ movd [dstq ], m4
+ movd [dstq+strideq], m3
+ RET
+
+INIT_XMM sse2
+cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
+ pxor m1, m1
+ movd m2, [aboveq-1]
+ movq m0, [aboveq]
+ punpcklbw m2, m1
+ punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
+ pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word]
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -4
+ punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word]
+ psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word]
+ movq m2, [leftq]
+ punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
+.loop:
+ pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
+ pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
+ punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
+ punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m4, m3
+ movq [dstq ], m4
+ movhps [dstq+strideq], m4
+ lea dstq, [dstq+strideq*2]
+ psrldq m2, 4
+ inc lineq
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
+ pxor m1, m1
+ mova m2, [aboveq-16];
+ mova m0, [aboveq] ; t1 t2 ... t16 [byte]
+ punpckhbw m2, m1 ; [127:112] tl [word]
+ punpckhbw m4, m0, m1
+ punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word]
+ DEFINE_ARGS dst, stride, line, left, stride8
+ mov lineq, -8
+ pshufhw m2, m2, 0xff
+ mova m3, [leftq] ; l1 l2 ... l16 [byte]
+ punpckhqdq m2, m2 ; tl repeated 8 times [word]
+ psubw m0, m2
+ psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word]
+ punpckhbw m5, m3, m1
+ punpcklbw m3, m1 ; m3:m5 l1 l2 ... l16 [word]
+ lea stride8q, [strideq*8]
+.loop:
+ pshuflw m6, m3, 0x0
+ pshuflw m7, m5, 0x0
+ punpcklqdq m6, m6 ; l1 repeated 8 times [word]
+ punpcklqdq m7, m7 ; l8 repeated 8 times [word]
+ paddw m1, m6, m0
+ paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,15] [word]
+ psrldq m5, 2
+ packuswb m1, m6
+ mova [dstq ], m1
+ paddw m1, m7, m0
+ paddw m7, m4 ; m1:m7 ti-tl+l8 [i=1,15] [word]
+ psrldq m3, 2
+ packuswb m1, m7
+ mova [dstq+stride8q], m1
+ inc lineq
+ lea dstq, [dstq+strideq]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
+ pxor m1, m1
+ movd m2, [aboveq-1]
+ mova m0, [aboveq]
+ mova m4, [aboveq+16]
+ punpcklbw m2, m1
+ punpckhbw m3, m0, m1
+ punpckhbw m5, m4, m1
+ punpcklbw m0, m1
+ punpcklbw m4, m1
+ pshuflw m2, m2, 0x0
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -16
+ punpcklqdq m2, m2
+ add leftq, 32
+ psubw m0, m2
+ psubw m3, m2
+ psubw m4, m2
+ psubw m5, m2
+.loop:
+ movd m2, [leftq+lineq*2]
+ pxor m1, m1
+ punpcklbw m2, m1
+ pshuflw m7, m2, 0x55
+ pshuflw m2, m2, 0x0
+ punpcklqdq m2, m2
+ punpcklqdq m7, m7
+ paddw m6, m2, m3
+ paddw m1, m2, m0
+ packuswb m1, m6
+ mova [dstq ], m1
+ paddw m6, m2, m5
+ paddw m1, m2, m4
+ packuswb m1, m6
+ mova [dstq+16 ], m1
+ paddw m6, m7, m3
+ paddw m1, m7, m0
+ packuswb m1, m6
+ mova [dstq+strideq ], m1
+ paddw m6, m7, m5
+ paddw m1, m7, m4
+ packuswb m1, m6
+ mova [dstq+strideq+16], m1
+ lea dstq, [dstq+strideq*2]
+ inc lineq
+ jnz .loop
+ REP_RET
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm
new file mode 100644
index 0000000000..5e0139fa8d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm
@@ -0,0 +1,871 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
+sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+SECTION .text
+
+INIT_XMM ssse3
+cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, dst8, line
+ lea stride3q, [strideq*3]
+ lea dst8q, [dstq+strideq*8]
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+ pavgb m3, m2, m0
+ pxor m2, m0
+ pshufb m0, m1
+ pand m2, [GLOBAL(pb_1)]
+ psubb m3, m2
+ pavgb m0, m3
+
+  ; write the first 8 lines in full, plus the left half of the last 8 lines
+ mov lined, 2
+.loop:
+ mova [dstq ], m0
+ movhps [dst8q ], m0
+ pshufb m0, m1
+ mova [dstq +strideq ], m0
+ movhps [dst8q+strideq ], m0
+ pshufb m0, m1
+ mova [dstq +strideq*2 ], m0
+ movhps [dst8q+strideq*2 ], m0
+ pshufb m0, m1
+ mova [dstq +stride3q ], m0
+ movhps [dst8q+stride3q ], m0
+ pshufb m0, m1
+ lea dstq, [dstq +strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ dec lined
+ jnz .loop
+
+ ; bottom-right 8x8 block
+ movhps [dstq +8], m0
+ movhps [dstq+strideq +8], m0
+ movhps [dstq+strideq*2+8], m0
+ movhps [dstq+stride3q +8], m0
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq +8], m0
+ movhps [dstq+strideq +8], m0
+ movhps [dstq+strideq*2+8], m0
+ movhps [dstq+stride3q +8], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ mova m4, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, dst16, line
+ lea stride3q, [strideq*3]
+ lea dst16q, [dstq +strideq*8]
+ lea dst16q, [dst16q+strideq*8]
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)]
+ pavgb m3, m2, m4
+ pxor m2, m4
+ palignr m5, m4, m0, 1
+ palignr m6, m4, m0, 2
+ pshufb m4, m1
+ pand m2, [GLOBAL(pb_1)]
+ psubb m3, m2
+ pavgb m4, m3
+ pavgb m3, m0, m6
+ pxor m0, m6
+ pand m0, [GLOBAL(pb_1)]
+ psubb m3, m0
+ pavgb m5, m3
+
+  ; write the first 16 lines in full, plus the left half of the last 16 lines
+ mov lined, 4
+.loop:
+ mova [dstq ], m5
+ mova [dstq +16], m4
+ mova [dst16q ], m4
+ palignr m3, m4, m5, 1
+ pshufb m4, m1
+ mova [dstq +strideq ], m3
+ mova [dstq +strideq +16], m4
+ mova [dst16q+strideq ], m4
+ palignr m5, m4, m3, 1
+ pshufb m4, m1
+ mova [dstq +strideq*2 ], m5
+ mova [dstq +strideq*2+16], m4
+ mova [dst16q+strideq*2 ], m4
+ palignr m3, m4, m5, 1
+ pshufb m4, m1
+ mova [dstq +stride3q ], m3
+ mova [dstq +stride3q +16], m4
+ mova [dst16q+stride3q ], m4
+ palignr m5, m4, m3, 1
+ pshufb m4, m1
+ lea dstq, [dstq +strideq*4]
+ lea dst16q, [dst16q+strideq*4]
+ dec lined
+ jnz .loop
+
+  ; write the right half of the last 16 lines
+ mova [dstq +16], m4
+ mova [dstq +strideq +16], m4
+ mova [dstq +strideq*2+16], m4
+ mova [dstq +stride3q +16], m4
+ lea dstq, [dstq +strideq*4]
+ mova [dstq +16], m4
+ mova [dstq +strideq +16], m4
+ mova [dstq +strideq*2+16], m4
+ mova [dstq +stride3q +16], m4
+ lea dstq, [dstq +strideq*4]
+ mova [dstq +16], m4
+ mova [dstq +strideq +16], m4
+ mova [dstq +strideq*2+16], m4
+ mova [dstq +stride3q +16], m4
+ lea dstq, [dstq +strideq*4]
+ mova [dstq +16], m4
+ mova [dstq +strideq +16], m4
+ mova [dstq +strideq*2+16], m4
+ mova [dstq +stride3q +16], m4
+
+ RESTORE_GOT
+ RET
+
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+ pavgb %4, %1, %3
+ pxor %3, %1
+ pand %3, [GLOBAL(pb_1)]
+ psubb %4, %3
+ pavgb %4, %2
+%endmacro
+
+INIT_XMM ssse3
+cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movq m3, [aboveq]
+ pshufb m1, m3, [GLOBAL(sh_b23456777)]
+ pshufb m2, m3, [GLOBAL(sh_b12345677)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
+ pavgb m3, m2
+
+ ; store 4 lines
+ movd [dstq ], m3
+ movd [dstq+strideq], m4
+ lea dstq, [dstq+strideq*2]
+ psrldq m3, 1
+ psrldq m4, 1
+ movd [dstq ], m3
+ movd [dstq+strideq], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movq m3, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
+ pshufb m0, m3, [GLOBAL(sh_b0123456777777777)]
+ pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
+ pshufb m3, [GLOBAL(sh_b0123456777777777)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4
+ pavgb m3, m2
+
+ ; store 4 lines
+ movq [dstq ], m3
+ movq [dstq+strideq], m4
+ psrldq m3, 1
+ psrldq m4, 1
+ movq [dstq+strideq*2], m3
+ movq [dstq+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ psrldq m3, 1
+ psrldq m4, 1
+
+ ; store 4 lines
+ movq [dstq ], m3
+ movq [dstq+strideq], m4
+ psrldq m3, 1
+ psrldq m4, 1
+ movq [dstq+strideq*2], m3
+ movq [dstq+stride3q ], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, line
+ lea stride3q, [strideq*3]
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+ pshufb m3, m0, m1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4
+ pavgb m0, m3
+
+ mov lined, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m4
+ pshufb m0, m1
+ pshufb m4, m1
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m4
+ pshufb m0, m1
+ pshufb m4, m1
+ lea dstq, [dstq+strideq*4]
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ mova m7, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, line
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ lea stride3q, [strideq*3]
+ pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)]
+ pshufb m3, m7, m1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4
+ palignr m6, m7, m0, 1
+ palignr m5, m7, m0, 2
+ pavgb m7, m3
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2
+ pavgb m0, m6
+
+ mov lined, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m7
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq +16], m4
+ palignr m3, m7, m0, 1
+ palignr m5, m4, m2, 1
+ pshufb m7, m1
+ pshufb m4, m1
+
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m5
+ mova [dstq+stride3q +16], m4
+ palignr m0, m7, m3, 1
+ palignr m2, m4, m5, 1
+ pshufb m7, m1
+ pshufb m4, m1
+ lea dstq, [dstq+strideq*4]
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ movd m0, [leftq] ; l1, l2, l3, l4
+ movd m1, [aboveq-1] ; tl, t1, t2, t3
+ punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3
+ pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
+ psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3
+ psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1
+ ; A2 B2 A1 B1
+ ; A3 B3 A2 B2
+ ; A4 B4 A3 B3
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1
+ pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1
+
+ punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+stride3q ], m3
+ psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+strideq*2], m3
+ psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+strideq ], m3
+ psrldq m3, 2 ; A1 B1 C1 D1 ..
+ movd [dstq ], m3
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ movq m0, [leftq] ; [0- 7] l1-8 [byte]
+ movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte]
+ pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word]
+ pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word]
+ pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word]
+ pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word]
+ psrldq m4, m0, 1 ; t1-7 [word]
+ psrldq m5, m0, 2 ; t2-7 [word]
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1 E1 F1 G1 H1
+ ; A2 B2 A1 B1 C1 D1 E1 F1
+ ; A3 B3 A2 B2 A1 B1 C1 D1
+ ; A4 B4 A3 B3 A2 B2 A1 B1
+ ; A5 B5 A4 B4 A3 B3 A2 B2
+ ; A6 B6 A5 B5 A4 B4 A3 B3
+ ; A7 B7 A6 B6 A5 B5 A4 B4
+ ; A8 B8 A7 B7 A6 B6 A5 B5
+ pavgb m6, m1, m2 ; 2-tap avg A8-A1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1
+
+ punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1
+ palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2 ; A-B2, A-B1, C-H1
+ movq [dstq+strideq ], m0
+ psrldq m0, 2 ; A-H1
+ movq [dstq ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5
+ psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4
+ movq [dstq+strideq*2], m6
+ psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3
+ movq [dstq+strideq ], m6
+ psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2
+ movq [dstq ], m6
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ mova m0, [leftq]
+ movu m7, [aboveq-1]
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
+ ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
+ ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
+ ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
+ ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
+ ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
+ ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
+ ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
+ ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
+ ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
+ ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
+ ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
+ ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
+ ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
+ ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
+ ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
+ pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)]
+ palignr m5, m0, m6, 15
+ palignr m3, m0, m6, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+ pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)]
+ pavgb m5, m0 ; A1 - Ag
+
+ punpcklbw m0, m4, m5 ; A-B8 ... A-B1
+ punpckhbw m4, m5 ; A-B9 ... A-Bg
+
+ pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1
+
+ pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ palignr m2, m1, m6, 14
+ mova [dstq ], m2
+ palignr m2, m1, m6, 12
+ mova [dstq+strideq ], m2
+ palignr m2, m1, m6, 10
+ mova [dstq+strideq*2], m2
+ palignr m2, m1, m6, 8
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ palignr m2, m1, m6, 6
+ mova [dstq ], m2
+ palignr m2, m1, m6, 4
+ mova [dstq+strideq ], m2
+ palignr m2, m1, m6, 2
+ mova [dstq+strideq*2], m2
+ pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
+ mova [dstq+stride3q ], m6
+ lea dstq, [dstq+strideq*4]
+
+ palignr m2, m6, m4, 14
+ mova [dstq ], m2
+ palignr m2, m6, m4, 12
+ mova [dstq+strideq ], m2
+ palignr m2, m6, m4, 10
+ mova [dstq+strideq*2], m2
+ palignr m2, m6, m4, 8
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ palignr m2, m6, m4, 6
+ mova [dstq ], m2
+ palignr m2, m6, m4, 4
+ mova [dstq+strideq ], m2
+ palignr m2, m6, m4, 2
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ mova m0, [leftq]
+ movu m7, [aboveq-1]
+ movu m1, [aboveq+15]
+
+ pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high]
+
+ palignr m3, m1, m7, 1
+ palignr m5, m1, m7, 2
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low]
+
+ pshufb m7, [GLOBAL(sh_bfedcba9876543210)]
+ palignr m5, m0, m7, 15
+ palignr m3, m0, m7, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+ pavgb m5, m0 ; A1 - Ag
+ punpcklbw m6, m4, m5 ; A-B8 ... A-B1
+ punpckhbw m4, m5 ; A-B9 ... A-Bg
+ pshufb m6, [GLOBAL(sh_bfedcba9876543210)]
+ pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
+
+ DEFINE_ARGS dst, stride, stride3, left, line
+ lea stride3q, [strideq*3]
+
+ palignr m5, m2, m1, 14
+ palignr m7, m1, m6, 14
+ mova [dstq ], m7
+ mova [dstq+16 ], m5
+ palignr m5, m2, m1, 12
+ palignr m7, m1, m6, 12
+ mova [dstq+strideq ], m7
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m2, m1, 10
+ palignr m7, m1, m6, 10
+ mova [dstq+strideq*2 ], m7
+ mova [dstq+strideq*2+16], m5
+ palignr m5, m2, m1, 8
+ palignr m7, m1, m6, 8
+ mova [dstq+stride3q ], m7
+ mova [dstq+stride3q+16 ], m5
+ lea dstq, [dstq+strideq*4]
+ palignr m5, m2, m1, 6
+ palignr m7, m1, m6, 6
+ mova [dstq ], m7
+ mova [dstq+16 ], m5
+ palignr m5, m2, m1, 4
+ palignr m7, m1, m6, 4
+ mova [dstq+strideq ], m7
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m2, m1, 2
+ palignr m7, m1, m6, 2
+ mova [dstq+strideq*2 ], m7
+ mova [dstq+strideq*2+16], m5
+ mova [dstq+stride3q ], m6
+ mova [dstq+stride3q+16 ], m1
+ lea dstq, [dstq+strideq*4]
+
+ palignr m5, m1, m6, 14
+ palignr m3, m6, m4, 14
+ mova [dstq ], m3
+ mova [dstq+16 ], m5
+ palignr m5, m1, m6, 12
+ palignr m3, m6, m4, 12
+ mova [dstq+strideq ], m3
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m1, m6, 10
+ palignr m3, m6, m4, 10
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m5
+ palignr m5, m1, m6, 8
+ palignr m3, m6, m4, 8
+ mova [dstq+stride3q ], m3
+ mova [dstq+stride3q+16 ], m5
+ lea dstq, [dstq+strideq*4]
+ palignr m5, m1, m6, 6
+ palignr m3, m6, m4, 6
+ mova [dstq ], m3
+ mova [dstq+16 ], m5
+ palignr m5, m1, m6, 4
+ palignr m3, m6, m4, 4
+ mova [dstq+strideq ], m3
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m1, m6, 2
+ palignr m3, m6, m4, 2
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m5
+ mova [dstq+stride3q ], m4
+ mova [dstq+stride3q+16 ], m6
+ lea dstq, [dstq+strideq*4]
+
+ mova m7, [leftq]
+ mova m3, [leftq+16]
+ palignr m5, m3, m7, 15
+ palignr m0, m3, m7, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh -
+ pavgb m5, m3 ; Ah -
+ punpcklbw m3, m2, m5 ; A-B8 ... A-B1
+ punpckhbw m2, m5 ; A-B9 ... A-Bg
+ pshufb m3, [GLOBAL(sh_bfedcba9876543210)]
+ pshufb m2, [GLOBAL(sh_bfedcba9876543210)]
+
+ palignr m7, m6, m4, 14
+ palignr m0, m4, m3, 14
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m6, m4, 12
+ palignr m0, m4, m3, 12
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m6, m4, 10
+ palignr m0, m4, m3, 10
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ palignr m7, m6, m4, 8
+ palignr m0, m4, m3, 8
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q+16 ], m7
+ lea dstq, [dstq+strideq*4]
+ palignr m7, m6, m4, 6
+ palignr m0, m4, m3, 6
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m6, m4, 4
+ palignr m0, m4, m3, 4
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m6, m4, 2
+ palignr m0, m4, m3, 2
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m3
+ mova [dstq+stride3q+16 ], m4
+ lea dstq, [dstq+strideq*4]
+
+ palignr m7, m4, m3, 14
+ palignr m0, m3, m2, 14
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m4, m3, 12
+ palignr m0, m3, m2, 12
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m4, m3, 10
+ palignr m0, m3, m2, 10
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ palignr m7, m4, m3, 8
+ palignr m0, m3, m2, 8
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q+16 ], m7
+ lea dstq, [dstq+strideq*4]
+ palignr m7, m4, m3, 6
+ palignr m0, m3, m2, 6
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m4, m3, 4
+ palignr m0, m3, m2, 4
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m4, m3, 2
+ palignr m0, m3, m2, 2
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m2
+ mova [dstq+stride3q+16 ], m3
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq
+ movq m3, [leftq] ; abcdefgh [byte]
+ lea stride3q, [strideq*3]
+
+ pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
+ pshufb m0, m3, [GLOBAL(sh_b0123456777777777)]
+ pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
+ pavgb m0, m2
+ punpcklbw m0, m3 ; interleaved output
+
+ movq [dstq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
+ psrldq m0, 2
+ movq [dstq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq
+ lea stride3q, [strideq*3]
+ mova m0, [leftq] ; abcdefghijklmnop [byte]
+ pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+ pavgb m1, m0 ; ab, bc, cd .. no, op, pp [byte]
+
+ punpckhbw m4, m1, m3 ; interleaved input
+ punpcklbw m1, m3 ; interleaved output
+ mova [dstq ], m1
+ palignr m3, m4, m1, 2
+ mova [dstq+strideq ], m3
+ palignr m3, m4, m1, 4
+ mova [dstq+strideq*2], m3
+ palignr m3, m4, m1, 6
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ palignr m3, m4, m1, 8
+ mova [dstq ], m3
+ palignr m3, m4, m1, 10
+ mova [dstq+strideq ], m3
+ palignr m3, m4, m1, 12
+ mova [dstq+strideq*2], m3
+ palignr m3, m4, m1, 14
+ mova [dstq+stride3q ], m3
+ DEFINE_ARGS dst, stride, stride3, line
+ mov lined, 2
+ mova m0, [GLOBAL(sh_b23456789abcdefff)]
+.loop:
+ lea dstq, [dstq+strideq*4]
+ mova [dstq ], m4
+ pshufb m4, m0
+ mova [dstq+strideq ], m4
+ pshufb m4, m0
+ mova [dstq+strideq*2], m4
+ pshufb m4, m0
+ mova [dstq+stride3q ], m4
+ pshufb m4, m0
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq
+ lea stride3q, [strideq*3]
+ mova m1, [leftq] ; 0-15 [byte]
+ mova m2, [leftq+16] ; 16-31 [byte]
+ pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)]
+ pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
+ palignr m6, m2, m1, 1
+ palignr m5, m2, m1, 2
+ pavgb m2, m4 ; high 16px even lines
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
+ pavgb m1, m6 ; low 16px even lines
+
+ punpckhbw m6, m1, m0 ; interleaved output 2
+ punpcklbw m1, m0 ; interleaved output 1
+
+ punpckhbw m7, m2, m3 ; interleaved output 4
+ punpcklbw m2, m3 ; interleaved output 3
+
+ ; output 1st 8 lines (and half of 2nd 8 lines)
+ DEFINE_ARGS dst, stride, stride3, dst8
+ lea dst8q, [dstq+strideq*8]
+ mova [dstq ], m1
+ mova [dstq +16], m6
+ mova [dst8q ], m6
+ palignr m0, m6, m1, 2
+ palignr m4, m2, m6, 2
+ mova [dstq +strideq ], m0
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m0, m6, m1, 4
+ palignr m4, m2, m6, 4
+ mova [dstq +strideq*2 ], m0
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m0, m6, m1, 6
+ palignr m4, m2, m6, 6
+ mova [dstq +stride3q ], m0
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq +strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ palignr m0, m6, m1, 8
+ palignr m4, m2, m6, 8
+ mova [dstq ], m0
+ mova [dstq +16], m4
+ mova [dst8q ], m4
+ palignr m0, m6, m1, 10
+ palignr m4, m2, m6, 10
+ mova [dstq +strideq ], m0
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m0, m6, m1, 12
+ palignr m4, m2, m6, 12
+ mova [dstq +strideq*2 ], m0
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m0, m6, m1, 14
+ palignr m4, m2, m6, 14
+ mova [dstq +stride3q ], m0
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+
+ ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
+ mova [dstq +16], m2
+ mova [dst8q ], m2
+ palignr m4, m7, m2, 2
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m4, m7, m2, 4
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m4, m7, m2, 6
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ palignr m4, m7, m2, 8
+ mova [dstq +16], m4
+ mova [dst8q ], m4
+ palignr m4, m7, m2, 10
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m4, m7, m2, 12
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m4, m7, m2, 14
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+
+ ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
+ mova m0, [GLOBAL(sh_b23456789abcdefff)]
+ mova [dstq +16], m7
+ mova [dst8q ], m7
+ pshufb m7, m0
+ mova [dstq +strideq +16], m7
+ mova [dst8q+strideq ], m7
+ pshufb m7, m0
+ mova [dstq +strideq*2+16], m7
+ mova [dst8q+strideq*2 ], m7
+ pshufb m7, m0
+ mova [dstq +stride3q +16], m7
+ mova [dst8q+stride3q ], m7
+ pshufb m7, m0
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ mova [dstq +16], m7
+ mova [dst8q ], m7
+ pshufb m7, m0
+ mova [dstq +strideq +16], m7
+ mova [dst8q+strideq ], m7
+ pshufb m7, m0
+ mova [dstq +strideq*2+16], m7
+ mova [dst8q+strideq*2 ], m7
+ pshufb m7, m0
+ mova [dstq +stride3q +16], m7
+ mova [dst8q+stride3q ], m7
+ pshufb m7, m0
+ lea dstq, [dstq+strideq*4]
+
+ ; output last half of 4th 8 lines
+ mova [dstq +16], m7
+ mova [dstq +strideq +16], m7
+ mova [dstq +strideq*2+16], m7
+ mova [dstq +stride3q +16], m7
+ lea dstq, [dstq+strideq*4]
+ mova [dstq +16], m7
+ mova [dstq +strideq +16], m7
+ mova [dstq +strideq*2+16], m7
+ mova [dstq +stride3q +16], m7
+
+ ; done!
+ RESTORE_GOT
+ RET
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c
new file mode 100644
index 0000000000..752435d240
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c
@@ -0,0 +1,626 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+
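+// Build a vector whose sixteen 16-bit lanes alternate a, b, a, b, ... (a in
+// the even lanes), the pair layout consumed by _mm256_madd_epi16 below.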
+#define PAIR256_SET_EPI16(a, b) \
+ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in,
+ int stride) {
+ int i;
+ // Load 16x16 values
+ for (i = 0; i < 16; i++) {
+#if CONFIG_VP9_HIGHBITDEPTH
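+    // tran_low_t is 32-bit here: load four groups of four coefficients and
+    // pack them down to sixteen 16-bit values per row.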
+ const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride));
+ const __m128i in1 =
+ _mm_loadu_si128((const __m128i *)((input + i * stride) + 4));
+ const __m128i in2 =
+ _mm_loadu_si128((const __m128i *)((input + i * stride) + 8));
+ const __m128i in3 =
+ _mm_loadu_si128((const __m128i *)((input + i * stride) + 12));
+ const __m128i ls = _mm_packs_epi32(in0, in1);
+ const __m128i rs = _mm_packs_epi32(in2, in3);
+ in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1);
+#else
+ in[i] = _mm256_load_si256((const __m256i *)(input + i * stride));
+#endif
+ }
+}
+
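+// Rounded right shift of each 32-bit lane:
+// ROUND_POWER_OF_TWO(in, DCT_CONST_BITS), since
+// DCT_CONST_ROUNDING == 1 << (DCT_CONST_BITS - 1).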
+static INLINE __m256i dct_round_shift_avx2(__m256i in) {
+ const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING));
+ return _mm256_srai_epi32(t, DCT_CONST_BITS);
+}
+
+static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) {
+ const __m256i t = _mm256_madd_epi16(*in, *cospi);
+ return dct_round_shift_avx2(t);
+}
+
+// Calculate the dot product between in0/1 and x and wrap to short.
+static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1,
+ __m256i *x) {
+ const __m256i t0 = idct_madd_round_shift_avx2(in0, x);
+ const __m256i t1 = idct_madd_round_shift_avx2(in1, x);
+ return _mm256_packs_epi32(t0, t1);
+}
+
+// Multiply elements by constants and add them together.
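+// out0 = (in0 * c0 - in1 * c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS
+// out1 = (in0 * c1 + in1 * c0 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS
+// computed per 16-bit element and then repacked to 16 bits.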
+static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1,
+ __m256i *out0, __m256i *out1) {
+ __m256i cst0 = PAIR256_SET_EPI16(c0, -c1);
+ __m256i cst1 = PAIR256_SET_EPI16(c1, c0);
+ __m256i lo = _mm256_unpacklo_epi16(in0, in1);
+ __m256i hi = _mm256_unpackhi_epi16(in0, in1);
+ *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0);
+ *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1);
+}
+
+static INLINE void idct16_16col(__m256i *in, __m256i *out) {
+ __m256i step1[16], step2[16];
+
+ // stage 2
+ butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+ butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+ butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+ step1[8] = _mm256_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm256_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm256_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm256_add_epi16(step2[10], step2[11]);
+ step1[12] = _mm256_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm256_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm256_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm256_add_epi16(step2[14], step2[15]);
+
+ // stage 4
+ butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13],
+ &step2[10]);
+ step2[5] = _mm256_sub_epi16(step1[4], step1[5]);
+ step1[4] = _mm256_add_epi16(step1[4], step1[5]);
+ step2[6] = _mm256_sub_epi16(step1[7], step1[6]);
+ step1[7] = _mm256_add_epi16(step1[6], step1[7]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = _mm256_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm256_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm256_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm256_sub_epi16(step2[0], step2[3]);
+ butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm256_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm256_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm256_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm256_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm256_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm256_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm256_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm256_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = _mm256_add_epi16(step1[0], step1[7]);
+ step2[1] = _mm256_add_epi16(step1[1], step1[6]);
+ step2[2] = _mm256_add_epi16(step1[2], step1[5]);
+ step2[3] = _mm256_add_epi16(step1[3], step1[4]);
+ step2[4] = _mm256_sub_epi16(step1[3], step1[4]);
+ step2[5] = _mm256_sub_epi16(step1[2], step1[5]);
+ step2[6] = _mm256_sub_epi16(step1[1], step1[6]);
+ step2[7] = _mm256_sub_epi16(step1[0], step1[7]);
+ butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+ &step2[13]);
+ butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+ &step2[12]);
+
+ // stage 7
+ out[0] = _mm256_add_epi16(step2[0], step1[15]);
+ out[1] = _mm256_add_epi16(step2[1], step1[14]);
+ out[2] = _mm256_add_epi16(step2[2], step2[13]);
+ out[3] = _mm256_add_epi16(step2[3], step2[12]);
+ out[4] = _mm256_add_epi16(step2[4], step2[11]);
+ out[5] = _mm256_add_epi16(step2[5], step2[10]);
+ out[6] = _mm256_add_epi16(step2[6], step1[9]);
+ out[7] = _mm256_add_epi16(step2[7], step1[8]);
+ out[8] = _mm256_sub_epi16(step2[7], step1[8]);
+ out[9] = _mm256_sub_epi16(step2[6], step1[9]);
+ out[10] = _mm256_sub_epi16(step2[5], step2[10]);
+ out[11] = _mm256_sub_epi16(step2[4], step2[11]);
+ out[12] = _mm256_sub_epi16(step2[3], step2[12]);
+ out[13] = _mm256_sub_epi16(step2[2], step2[13]);
+ out[14] = _mm256_sub_epi16(step2[1], step1[14]);
+ out[15] = _mm256_sub_epi16(step2[0], step1[15]);
+}
+
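+// Add one row of 16 residual values to the 16 destination pixels and store
+// the result with unsigned saturation.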
+static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) {
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest)));
+ d0 = _mm256_permute4x64_epi64(d0, 0xd8);
+ d0 = _mm256_unpacklo_epi8(d0, zero);
+ d0 = _mm256_add_epi16(in_x, d0);
+ d0 = _mm256_packus_epi16(
+ d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1)));
+
+ _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0));
+}
+
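+// Round one 16-pixel row of the iDCT output (add 32, then shift right by 6)
+// and reconstruct it into dest.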
+static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) {
+ const __m256i final_rounding = _mm256_set1_epi16(1 << 5);
+ __m256i out;
+ out = _mm256_adds_epi16(in, final_rounding);
+ out = _mm256_srai_epi16(out, 6);
+ recon_and_store16(dest, out);
+}
+
+static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) {
+ const __m256i final_rounding = _mm256_set1_epi16(1 << 5);
+ int j = 0;
+ while (j < 32) {
+ in[j] = _mm256_adds_epi16(in[j], final_rounding);
+ in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding);
+
+ in[j] = _mm256_srai_epi16(in[j], 6);
+ in[j + 1] = _mm256_srai_epi16(in[j + 1], 6);
+
+ recon_and_store16(dst, in[j]);
+ dst += stride;
+ recon_and_store16(dst, in[j + 1]);
+ dst += stride;
+ j += 2;
+ }
+}
+
+static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) {
+ int i;
+ __m256i t[16], u[16];
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 1) ==> (0, 1)
+ // (2, 3) ==> (2, 3)
+ // (4, 5) ==> (4, 5)
+ // (6, 7) ==> (6, 7)
+ for (i = 0; i < 4; i++) {
+ t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+ t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 2) ==> (0, 2)
+ // (1, 3) ==> (1, 3)
+ // (4, 6) ==> (4, 6)
+ // (5, 7) ==> (5, 7)
+ for (i = 0; i < 2; i++) {
+ u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+ u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+ u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+ u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 4) ==> (0, 1)
+ // (1, 5) ==> (4, 5)
+ // (2, 6) ==> (2, 3)
+ // (3, 7) ==> (6, 7)
+ for (i = 0; i < 2; i++) {
+ out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+ out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+ out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+ out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+ }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) {
+ __m256i t[16];
+
+#define LOADL(idx) \
+ t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+ t[idx] = _mm256_inserti128_si256( \
+ t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1);
+
+#define LOADR(idx) \
+ t[8 + (idx)] = \
+ _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+ t[8 + (idx)] = _mm256_inserti128_si256( \
+ t[8 + (idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1);
+
+ // load left 8x16
+ LOADL(0)
+ LOADL(1)
+ LOADL(2)
+ LOADL(3)
+ LOADL(4)
+ LOADL(5)
+ LOADL(6)
+ LOADL(7)
+
+ // load right 8x16
+ LOADR(0)
+ LOADR(1)
+ LOADR(2)
+ LOADR(3)
+ LOADR(4)
+ LOADR(5)
+ LOADR(6)
+ LOADR(7)
+
+ // get the top 16x8 result
+ transpose2_8x8_avx2(t, out);
+ // get the bottom 16x8 result
+ transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
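+// 2-D 16x16 inverse DCT: load, transpose, 1-D iDCT, transpose, 1-D iDCT,
+// then round the result and add it to the destination.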
+void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i;
+ __m256i in[16];
+
+ // Load 16x16 values
+ idct_load16x16(input, in, 16);
+
+ transpose_16bit_16x16_avx2(in, in);
+ idct16_16col(in, in);
+
+ transpose_16bit_16x16_avx2(in, in);
+ idct16_16col(in, in);
+
+ for (i = 0; i < 16; ++i) {
+ write_buffer_16x1(dest + i * stride, in[i]);
+ }
+}
+
+// Addition and subtraction butterfly only (size = 16 or 32):
+// out[i] = in[i] + in[size - 1 - i], out[size - 1 - i] = in[i] - in[size - 1 - i]
+static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) {
+ int i = 0;
+ const int num = size >> 1;
+ const int bound = size - 1;
+ while (i < num) {
+ out[i] = _mm256_add_epi16(in[i], in[bound - i]);
+ out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]);
+ i++;
+ }
+}
+
+// For each 16x32 block __m256i in[32]:
+// inputs with index 0, 4, 8, 12, 16, 20, 24, 28
+// produce output pixels 0-7 in __m256i out[32].
+static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) {
+ __m256i step1[8], step2[8];
+
+ // stage 3
+ butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+
+ // stage 4
+ butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ step2[4] = _mm256_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm256_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm256_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm256_add_epi16(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm256_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm256_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm256_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm256_sub_epi16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm256_add_epi16(step1[0], step1[7]);
+ out[1] = _mm256_add_epi16(step1[1], step1[6]);
+ out[2] = _mm256_add_epi16(step1[2], step1[5]);
+ out[3] = _mm256_add_epi16(step1[3], step1[4]);
+ out[4] = _mm256_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm256_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm256_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm256_sub_epi16(step1[0], step1[7]);
+}
+
+static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1,
+ __m256i *out) {
+ __m256i step2[32];
+
+ // stage 4
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10],
+ &step2[13]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[8] = _mm256_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm256_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm256_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm256_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm256_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm256_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm256_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm256_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ out[8] = step1[8];
+ out[9] = step1[9];
+ butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10],
+ &out[13]);
+ butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11],
+ &out[12]);
+ out[14] = step1[14];
+ out[15] = step1[15];
+}
+
+// For each 16x32 block __m256i in[32]:
+// inputs with index 2, 6, 10, 14, 18, 22, 26, 30
+// produce output pixels 8-15 in __m256i out[32].
+static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) {
+ __m256i step1[16], step2[16];
+
+ // stage 2
+ butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+ butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+ butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ step1[8] = _mm256_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm256_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm256_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm256_add_epi16(step2[11], step2[10]);
+ step1[12] = _mm256_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm256_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm256_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm256_add_epi16(step2[15], step2[14]);
+
+ idct32_16x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1,
+ __m256i *out) {
+ __m256i step2[32];
+
+ // stage 4
+ step2[16] = _mm256_add_epi16(step1[16], step1[19]);
+ step2[17] = _mm256_add_epi16(step1[17], step1[18]);
+ step2[18] = _mm256_sub_epi16(step1[17], step1[18]);
+ step2[19] = _mm256_sub_epi16(step1[16], step1[19]);
+ step2[20] = _mm256_sub_epi16(step1[23], step1[20]);
+ step2[21] = _mm256_sub_epi16(step1[22], step1[21]);
+ step2[22] = _mm256_add_epi16(step1[22], step1[21]);
+ step2[23] = _mm256_add_epi16(step1[23], step1[20]);
+
+ step2[24] = _mm256_add_epi16(step1[24], step1[27]);
+ step2[25] = _mm256_add_epi16(step1[25], step1[26]);
+ step2[26] = _mm256_sub_epi16(step1[25], step1[26]);
+ step2[27] = _mm256_sub_epi16(step1[24], step1[27]);
+ step2[28] = _mm256_sub_epi16(step1[31], step1[28]);
+ step2[29] = _mm256_sub_epi16(step1[30], step1[29]);
+ step2[30] = _mm256_add_epi16(step1[29], step1[30]);
+ step2[31] = _mm256_add_epi16(step1[28], step1[31]);
+
+ // stage 5
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18],
+ &step1[29]);
+ butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19],
+ &step1[28]);
+ butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20],
+ &step1[27]);
+ butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21],
+ &step1[26]);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ out[16] = _mm256_add_epi16(step1[16], step1[23]);
+ out[17] = _mm256_add_epi16(step1[17], step1[22]);
+ out[18] = _mm256_add_epi16(step1[18], step1[21]);
+ out[19] = _mm256_add_epi16(step1[19], step1[20]);
+ step2[20] = _mm256_sub_epi16(step1[19], step1[20]);
+ step2[21] = _mm256_sub_epi16(step1[18], step1[21]);
+ step2[22] = _mm256_sub_epi16(step1[17], step1[22]);
+ step2[23] = _mm256_sub_epi16(step1[16], step1[23]);
+
+ step2[24] = _mm256_sub_epi16(step1[31], step1[24]);
+ step2[25] = _mm256_sub_epi16(step1[30], step1[25]);
+ step2[26] = _mm256_sub_epi16(step1[29], step1[26]);
+ step2[27] = _mm256_sub_epi16(step1[28], step1[27]);
+ out[28] = _mm256_add_epi16(step1[27], step1[28]);
+ out[29] = _mm256_add_epi16(step1[26], step1[29]);
+ out[30] = _mm256_add_epi16(step1[25], step1[30]);
+ out[31] = _mm256_add_epi16(step1[24], step1[31]);
+
+ // stage 7
+ butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20],
+ &out[27]);
+ butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21],
+ &out[26]);
+ butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22],
+ &out[25]);
+ butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23],
+ &out[24]);
+}
+
+static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) {
+ __m256i temp[16];
+
+ // For each 16x32 block __m256i in[32]: inputs with index
+ // 0, 4, 8, 12, 16, 20, 24, 28 produce output pixels 0-7.
+ idct32_1024_16x32_quarter_1(in, temp);
+
+ // Inputs with index 2, 6, 10, 14, 18, 22, 26, 30 produce output pixels 8-15.
+ idct32_1024_16x32_quarter_2(in, temp);
+
+ // stage 7
+ add_sub_butterfly_avx2(temp, out, 16);
+}
+
+// For each 16x32 block __m256i in[32]:
+// inputs with odd index
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// produce output pixels 16-23 and 24-31 in __m256i out[32].
+static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) {
+ __m256i step1[32], step2[32];
+
+ // stage 1
+ butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+ butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]);
+ butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]);
+ butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
+
+ butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+ butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]);
+
+ butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]);
+ butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
+
+ // stage 2
+ step2[16] = _mm256_add_epi16(step1[16], step1[17]);
+ step2[17] = _mm256_sub_epi16(step1[16], step1[17]);
+ step2[18] = _mm256_sub_epi16(step1[19], step1[18]);
+ step2[19] = _mm256_add_epi16(step1[19], step1[18]);
+ step2[20] = _mm256_add_epi16(step1[20], step1[21]);
+ step2[21] = _mm256_sub_epi16(step1[20], step1[21]);
+ step2[22] = _mm256_sub_epi16(step1[23], step1[22]);
+ step2[23] = _mm256_add_epi16(step1[23], step1[22]);
+
+ step2[24] = _mm256_add_epi16(step1[24], step1[25]);
+ step2[25] = _mm256_sub_epi16(step1[24], step1[25]);
+ step2[26] = _mm256_sub_epi16(step1[27], step1[26]);
+ step2[27] = _mm256_add_epi16(step1[27], step1[26]);
+ step2[28] = _mm256_add_epi16(step1[28], step1[29]);
+ step2[29] = _mm256_sub_epi16(step1[28], step1[29]);
+ step2[30] = _mm256_sub_epi16(step1[31], step1[30]);
+ step2[31] = _mm256_add_epi16(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ idct32_16x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
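+// Full 1-D 32-point idct of a 16x32 block held in __m256i in[32].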
+static INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) {
+ __m256i temp[32];
+
+ // Inputs with index 0, 4, 8, 12, 16, 20, 24, 28 produce output pixels 0-7,
+ // and inputs with index 2, 6, 10, 14, 18, 22, 26, 30 produce output pixels
+ // 8-15.
+ idct32_1024_16x32_quarter_1_2(in, temp);
+
+ // Inputs with odd index (1, 3, 5, ..., 31) produce output pixels 16-31.
+ idct32_1024_16x32_quarter_3_4(in, temp);
+
+ // final stage
+ add_sub_butterfly_avx2(temp, out, 32);
+}
+
+void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m256i l[32], r[32], out[32], *in;
+ int i;
+
+ in = l;
+
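+ // Rows: 1-D idct of the top 16x32 half, then of the bottom half.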
+ for (i = 0; i < 2; i++) {
+ idct_load16x16(input, in, 32);
+ transpose_16bit_16x16_avx2(in, in);
+
+ idct_load16x16(input + 16, in + 16, 32);
+ transpose_16bit_16x16_avx2(in + 16, in + 16);
+ idct32_1024_16x32(in, in);
+
+ in = r;
+ input += 32 << 4;
+ }
+
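+ // Columns: 1-D idct of the left 16 columns, then of the right 16 columns,
+ // followed by round, shift and reconstruct.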
+ for (i = 0; i < 32; i += 16) {
+ transpose_16bit_16x16_avx2(l + i, out);
+ transpose_16bit_16x16_avx2(r + i, out + 16);
+ idct32_1024_16x32(out, out);
+
+ store_buffer_16x32(out, dest, stride);
+ dest += 16;
+ }
+}
+
+// Used when only the upper-left 16x16 block has non-zero coefficients.
+void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m256i in[32], io[32], out[32];
+ int i;
+
+ for (i = 16; i < 32; i++) {
+ in[i] = _mm256_setzero_si256();
+ }
+
+ // rows
+ idct_load16x16(input, in, 32);
+ transpose_16bit_16x16_avx2(in, in);
+ idct32_1024_16x32(in, io);
+
+ // columns
+ for (i = 0; i < 32; i += 16) {
+ transpose_16bit_16x16_avx2(io + i, in);
+ idct32_1024_16x32(in, out);
+
+ store_buffer_16x32(out, dest, stride);
+ dest += 16;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
new file mode 100644
index 0000000000..f42b3df849
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -0,0 +1,1235 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void transpose_16bit_4(__m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+ res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
+
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i eight = _mm_set1_epi16(8);
+ __m128i in[2];
+
+ // Rows
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8);
+ idct4_sse2(in);
+
+ // Columns
+ idct4_sse2(in);
+
+ // Final round and shift
+ in[0] = _mm_add_epi16(in[0], eight);
+ in[1] = _mm_add_epi16(in[1], eight);
+ in[0] = _mm_srai_epi16(in[0], 4);
+ in[1] = _mm_srai_epi16(in[1], 4);
+
+ recon_and_store4x4_sse2(in, dest, stride);
+}
+
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ int a;
+ __m128i dc_value, d[2];
+
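+ // Compute the DC-only output: dc * cospi_16_64 through both 1-D passes,
+ // then round and shift by 4.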
+ a = (int)dct_const_round_shift((int16_t)input[0] * cospi_16_64);
+ a = (int)dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 4);
+
+ dc_value = _mm_set1_epi16(a);
+
+ // Reconstruction and Store
+ d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
+ d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+ d[0] = _mm_unpacklo_epi32(d[0],
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d[1] = _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
+ d[0] = _mm_unpacklo_epi8(d[0], zero);
+ d[1] = _mm_unpacklo_epi8(d[1], zero);
+ d[0] = _mm_add_epi16(d[0], dc_value);
+ d[1] = _mm_add_epi16(d[1], dc_value);
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+
+ *(int *)dest = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
+}
+
+void idct4_sse2(__m128i *const in) {
+ const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ __m128i u[2];
+
+ transpose_16bit_4(in);
+ // stage 1
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+ u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]);
+ u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]);
+
+ // stage 2
+ in[0] = _mm_add_epi16(u[0], u[1]);
+ in[1] = _mm_sub_epi16(u[0], u[1]);
+ in[1] = _mm_shuffle_epi32(in[1], 0x4E);
+}
+
+void iadst4_sse2(__m128i *const in) {
+ const __m128i k__sinpi_1_3 = pair_set_epi16(sinpi_1_9, sinpi_3_9);
+ const __m128i k__sinpi_4_2 = pair_set_epi16(sinpi_4_9, sinpi_2_9);
+ const __m128i k__sinpi_2_3 = pair_set_epi16(sinpi_2_9, sinpi_3_9);
+ const __m128i k__sinpi_1_4 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
+ const __m128i k__sinpi_12_n3 =
+ pair_set_epi16(sinpi_1_9 + sinpi_2_9, -sinpi_3_9);
+ __m128i u[4], v[5];
+
+ // 00 01 20 21 02 03 22 23
+ // 10 11 30 31 12 13 32 33
+ const __m128i tr0_0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi32(in[0], in[1]);
+
+ // 00 01 10 11 20 21 30 31
+ // 02 03 12 13 22 23 32 33
+ in[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ in[1] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+
+ v[0] = _mm_madd_epi16(in[0], k__sinpi_1_3); // s_1 * x0 + s_3 * x1
+ v[1] = _mm_madd_epi16(in[1], k__sinpi_4_2); // s_4 * x2 + s_2 * x3
+ v[2] = _mm_madd_epi16(in[0], k__sinpi_2_3); // s_2 * x0 + s_3 * x1
+ v[3] = _mm_madd_epi16(in[1], k__sinpi_1_4); // s_1 * x2 + s_4 * x3
+ v[4] = _mm_madd_epi16(in[0], k__sinpi_12_n3); // (s_1 + s_2) * x0 - s_3 * x1
+ in[0] = _mm_sub_epi16(in[0], in[1]); // x0 - x2
+ in[1] = _mm_srli_epi32(in[1], 16);
+ in[0] = _mm_add_epi16(in[0], in[1]);
+ in[0] = _mm_slli_epi32(in[0], 16); // x0 - x2 + x3
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_sub_epi32(v[2], v[3]);
+ u[2] = _mm_madd_epi16(in[0], k__sinpi_1_3);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[3] = _mm_add_epi32(u[3], v[4]);
+
+ u[0] = dct_const_round_shift_sse2(u[0]);
+ u[1] = dct_const_round_shift_sse2(u[1]);
+ u[2] = dct_const_round_shift_sse2(u[2]);
+ u[3] = dct_const_round_shift_sse2(u[3]);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+}
+
+static INLINE void load_buffer_8x8(const tran_low_t *const input,
+ __m128i *const in) {
+ in[0] = load_input_data8(input + 0 * 8);
+ in[1] = load_input_data8(input + 1 * 8);
+ in[2] = load_input_data8(input + 2 * 8);
+ in[3] = load_input_data8(input + 3 * 8);
+ in[4] = load_input_data8(input + 4 * 8);
+ in[5] = load_input_data8(input + 5 * 8);
+ in[6] = load_input_data8(input + 6 * 8);
+ in[7] = load_input_data8(input + 7 * 8);
+}
+
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i in[8];
+ int i;
+
+ // Load input data.
+ load_buffer_8x8(input, in);
+
+ // 2-D
+ for (i = 0; i < 2; i++) {
+ vpx_idct8_sse2(in);
+ }
+
+ write_buffer_8x8(in, dest, stride);
+}
+
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i io[8];
+
+ io[0] = load_input_data4(input + 0 * 8);
+ io[1] = load_input_data4(input + 1 * 8);
+ io[2] = load_input_data4(input + 2 * 8);
+ io[3] = load_input_data4(input + 3 * 8);
+
+ idct8x8_12_add_kernel_sse2(io);
+ write_buffer_8x8(io, dest, stride);
+}
+
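+// Add the same 8 residuals to two consecutive rows of dest and store the
+// result with unsigned saturation.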
+static INLINE void recon_and_store_8_dual(uint8_t *const dest,
+ const __m128i in_x,
+ const int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d0, d1;
+
+ d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride));
+ d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride));
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d1 = _mm_unpacklo_epi8(d1, zero);
+ d0 = _mm_add_epi16(in_x, d0);
+ d1 = _mm_add_epi16(in_x, d1);
+ d0 = _mm_packus_epi16(d0, d1);
+ _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0);
+ _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0));
+}
+
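+// DC-only inverse transform: compute the single rounded DC value and add it
+// to every pixel of the 8x8 block.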
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ tran_high_t a1;
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ dc_value = _mm_set1_epi16((int16_t)a1);
+
+ recon_and_store_8_dual(dest, dc_value, stride);
+ dest += 2 * stride;
+ recon_and_store_8_dual(dest, dc_value, stride);
+ dest += 2 * stride;
+ recon_and_store_8_dual(dest, dc_value, stride);
+ dest += 2 * stride;
+ recon_and_store_8_dual(dest, dc_value, stride);
+}
+
+void vpx_idct8_sse2(__m128i *const in) {
+ // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
+ transpose_16bit_8x8(in, in);
+
+ // 4-stage 1D idct8x8
+ idct8(in, in);
+}
+
+void iadst8_sse2(__m128i *const in) {
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i s[8], u[16], v[8], w[16];
+
+ // transpose
+ transpose_16bit_8x8(in, in);
+
+ // column transformation
+ // stage 1
+ // interleave and multiply/add into 32-bit integers
+ s[0] = _mm_unpacklo_epi16(in[7], in[0]);
+ s[1] = _mm_unpackhi_epi16(in[7], in[0]);
+ s[2] = _mm_unpacklo_epi16(in[5], in[2]);
+ s[3] = _mm_unpackhi_epi16(in[5], in[2]);
+ s[4] = _mm_unpacklo_epi16(in[3], in[4]);
+ s[5] = _mm_unpackhi_epi16(in[3], in[4]);
+ s[6] = _mm_unpacklo_epi16(in[1], in[6]);
+ s[7] = _mm_unpackhi_epi16(in[1], in[6]);
+
+ u[0] = _mm_madd_epi16(s[0], k__cospi_p02_p30);
+ u[1] = _mm_madd_epi16(s[1], k__cospi_p02_p30);
+ u[2] = _mm_madd_epi16(s[0], k__cospi_p30_m02);
+ u[3] = _mm_madd_epi16(s[1], k__cospi_p30_m02);
+ u[4] = _mm_madd_epi16(s[2], k__cospi_p10_p22);
+ u[5] = _mm_madd_epi16(s[3], k__cospi_p10_p22);
+ u[6] = _mm_madd_epi16(s[2], k__cospi_p22_m10);
+ u[7] = _mm_madd_epi16(s[3], k__cospi_p22_m10);
+ u[8] = _mm_madd_epi16(s[4], k__cospi_p18_p14);
+ u[9] = _mm_madd_epi16(s[5], k__cospi_p18_p14);
+ u[10] = _mm_madd_epi16(s[4], k__cospi_p14_m18);
+ u[11] = _mm_madd_epi16(s[5], k__cospi_p14_m18);
+ u[12] = _mm_madd_epi16(s[6], k__cospi_p26_p06);
+ u[13] = _mm_madd_epi16(s[7], k__cospi_p26_p06);
+ u[14] = _mm_madd_epi16(s[6], k__cospi_p06_m26);
+ u[15] = _mm_madd_epi16(s[7], k__cospi_p06_m26);
+
+ // addition
+ w[0] = _mm_add_epi32(u[0], u[8]);
+ w[1] = _mm_add_epi32(u[1], u[9]);
+ w[2] = _mm_add_epi32(u[2], u[10]);
+ w[3] = _mm_add_epi32(u[3], u[11]);
+ w[4] = _mm_add_epi32(u[4], u[12]);
+ w[5] = _mm_add_epi32(u[5], u[13]);
+ w[6] = _mm_add_epi32(u[6], u[14]);
+ w[7] = _mm_add_epi32(u[7], u[15]);
+ w[8] = _mm_sub_epi32(u[0], u[8]);
+ w[9] = _mm_sub_epi32(u[1], u[9]);
+ w[10] = _mm_sub_epi32(u[2], u[10]);
+ w[11] = _mm_sub_epi32(u[3], u[11]);
+ w[12] = _mm_sub_epi32(u[4], u[12]);
+ w[13] = _mm_sub_epi32(u[5], u[13]);
+ w[14] = _mm_sub_epi32(u[6], u[14]);
+ w[15] = _mm_sub_epi32(u[7], u[15]);
+
+ // shift and rounding
+ u[0] = dct_const_round_shift_sse2(w[0]);
+ u[1] = dct_const_round_shift_sse2(w[1]);
+ u[2] = dct_const_round_shift_sse2(w[2]);
+ u[3] = dct_const_round_shift_sse2(w[3]);
+ u[4] = dct_const_round_shift_sse2(w[4]);
+ u[5] = dct_const_round_shift_sse2(w[5]);
+ u[6] = dct_const_round_shift_sse2(w[6]);
+ u[7] = dct_const_round_shift_sse2(w[7]);
+ u[8] = dct_const_round_shift_sse2(w[8]);
+ u[9] = dct_const_round_shift_sse2(w[9]);
+ u[10] = dct_const_round_shift_sse2(w[10]);
+ u[11] = dct_const_round_shift_sse2(w[11]);
+ u[12] = dct_const_round_shift_sse2(w[12]);
+ u[13] = dct_const_round_shift_sse2(w[13]);
+ u[14] = dct_const_round_shift_sse2(w[14]);
+ u[15] = dct_const_round_shift_sse2(w[15]);
+
+ // back to 16-bit and pack 8 integers into __m128i
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+ in[2] = _mm_packs_epi32(u[4], u[5]);
+ in[3] = _mm_packs_epi32(u[6], u[7]);
+ in[4] = _mm_packs_epi32(u[8], u[9]);
+ in[5] = _mm_packs_epi32(u[10], u[11]);
+ in[6] = _mm_packs_epi32(u[12], u[13]);
+ in[7] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 2
+ s[0] = _mm_add_epi16(in[0], in[2]);
+ s[1] = _mm_add_epi16(in[1], in[3]);
+ s[2] = _mm_sub_epi16(in[0], in[2]);
+ s[3] = _mm_sub_epi16(in[1], in[3]);
+ u[0] = _mm_unpacklo_epi16(in[4], in[5]);
+ u[1] = _mm_unpackhi_epi16(in[4], in[5]);
+ u[2] = _mm_unpacklo_epi16(in[6], in[7]);
+ u[3] = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+
+ w[0] = _mm_add_epi32(v[0], v[4]);
+ w[1] = _mm_add_epi32(v[1], v[5]);
+ w[2] = _mm_add_epi32(v[2], v[6]);
+ w[3] = _mm_add_epi32(v[3], v[7]);
+ w[4] = _mm_sub_epi32(v[0], v[4]);
+ w[5] = _mm_sub_epi32(v[1], v[5]);
+ w[6] = _mm_sub_epi32(v[2], v[6]);
+ w[7] = _mm_sub_epi32(v[3], v[7]);
+
+ u[0] = dct_const_round_shift_sse2(w[0]);
+ u[1] = dct_const_round_shift_sse2(w[1]);
+ u[2] = dct_const_round_shift_sse2(w[2]);
+ u[3] = dct_const_round_shift_sse2(w[3]);
+ u[4] = dct_const_round_shift_sse2(w[4]);
+ u[5] = dct_const_round_shift_sse2(w[5]);
+ u[6] = dct_const_round_shift_sse2(w[6]);
+ u[7] = dct_const_round_shift_sse2(w[7]);
+
+ // back to 16-bit integers
+ s[4] = _mm_packs_epi32(u[0], u[1]);
+ s[5] = _mm_packs_epi32(u[2], u[3]);
+ s[6] = _mm_packs_epi32(u[4], u[5]);
+ s[7] = _mm_packs_epi32(u[6], u[7]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+
+ s[2] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
+ s[3] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
+ s[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
+ s[7] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_m16);
+
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[4]);
+ in[2] = s[6];
+ in[3] = _mm_sub_epi16(kZero, s[2]);
+ in[4] = s[3];
+ in[5] = _mm_sub_epi16(kZero, s[7]);
+ in[6] = s[5];
+ in[7] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static INLINE void idct16_load8x8(const tran_low_t *const input,
+ __m128i *const in) {
+ in[0] = load_input_data8(input + 0 * 16);
+ in[1] = load_input_data8(input + 1 * 16);
+ in[2] = load_input_data8(input + 2 * 16);
+ in[3] = load_input_data8(input + 3 * 16);
+ in[4] = load_input_data8(input + 4 * 16);
+ in[5] = load_input_data8(input + 5 * 16);
+ in[6] = load_input_data8(input + 6 * 16);
+ in[7] = load_input_data8(input + 7 * 16);
+}
+
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i l[16], r[16], out[16], *in;
+ int i;
+
+ in = l;
+ for (i = 0; i < 2; i++) {
+ idct16_load8x8(input, in);
+ transpose_16bit_8x8(in, in);
+ idct16_load8x8(input + 8, in + 8);
+ transpose_16bit_8x8(in + 8, in + 8);
+ idct16_8col(in, in);
+ in = r;
+ input += 128;
+ }
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ idct16_8col(out, out);
+
+ for (j = 0; j < 16; ++j) {
+ write_buffer_8x1(dest + j * stride, out[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i in[16], temp[16], out[16];
+ int i;
+
+ idct16_load8x8(input, in);
+ transpose_16bit_8x8(in, in);
+
+ for (i = 8; i < 16; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+ idct16_8col(in, temp);
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(temp + i, in);
+ idct16_8col(in, out);
+
+ for (j = 0; j < 16; ++j) {
+ write_buffer_8x1(dest + j * stride, out[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i in[16], l[16];
+ int i;
+
+ // First 1-D inverse DCT
+ // Load input data.
+ in[0] = load_input_data4(input + 0 * 16);
+ in[1] = load_input_data4(input + 1 * 16);
+ in[2] = load_input_data4(input + 2 * 16);
+ in[3] = load_input_data4(input + 3 * 16);
+
+ idct16x16_10_pass1(in, l);
+
+ // Second 1-D inverse transform, performed per 8x16 block
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ idct16x16_10_pass2(l + i, in);
+
+ for (j = 0; j < 16; ++j) {
+ write_buffer_8x1(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
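+// Add in_x to both 8-pixel halves of one 16-pixel row of dest and store the
+// result with unsigned saturation.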
+static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d0, d1;
+
+ d0 = _mm_load_si128((__m128i *)(dest));
+ d1 = _mm_unpackhi_epi8(d0, zero);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d0 = _mm_add_epi16(in_x, d0);
+ d1 = _mm_add_epi16(in_x, d1);
+ d0 = _mm_packus_epi16(d0, d1);
+ _mm_store_si128((__m128i *)(dest), d0);
+}
+
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ int i;
+ tran_high_t a1;
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ dc_value = _mm_set1_epi16((int16_t)a1);
+
+ for (i = 0; i < 16; ++i) {
+ recon_and_store_16(dest, dc_value);
+ dest += stride;
+ }
+}
+
+void vpx_iadst16_8col_sse2(__m128i *const in) {
+ // perform 16x16 1-D ADST for 8 columns
+ __m128i s[16], x[16], u[32], v[32];
+ const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i kZero = _mm_setzero_si128();
+
+ u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+ u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+ u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+ u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+ u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+ u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+ u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+ u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+ u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+ u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+ u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+ u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+ u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+ u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+ u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+ u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+ v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+ v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+ v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+ v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+ v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+ v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+ v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+ v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+ v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+ v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+ v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+ v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+ v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+ v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+ v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+ v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+ u[0] = _mm_add_epi32(v[0], v[16]);
+ u[1] = _mm_add_epi32(v[1], v[17]);
+ u[2] = _mm_add_epi32(v[2], v[18]);
+ u[3] = _mm_add_epi32(v[3], v[19]);
+ u[4] = _mm_add_epi32(v[4], v[20]);
+ u[5] = _mm_add_epi32(v[5], v[21]);
+ u[6] = _mm_add_epi32(v[6], v[22]);
+ u[7] = _mm_add_epi32(v[7], v[23]);
+ u[8] = _mm_add_epi32(v[8], v[24]);
+ u[9] = _mm_add_epi32(v[9], v[25]);
+ u[10] = _mm_add_epi32(v[10], v[26]);
+ u[11] = _mm_add_epi32(v[11], v[27]);
+ u[12] = _mm_add_epi32(v[12], v[28]);
+ u[13] = _mm_add_epi32(v[13], v[29]);
+ u[14] = _mm_add_epi32(v[14], v[30]);
+ u[15] = _mm_add_epi32(v[15], v[31]);
+ u[16] = _mm_sub_epi32(v[0], v[16]);
+ u[17] = _mm_sub_epi32(v[1], v[17]);
+ u[18] = _mm_sub_epi32(v[2], v[18]);
+ u[19] = _mm_sub_epi32(v[3], v[19]);
+ u[20] = _mm_sub_epi32(v[4], v[20]);
+ u[21] = _mm_sub_epi32(v[5], v[21]);
+ u[22] = _mm_sub_epi32(v[6], v[22]);
+ u[23] = _mm_sub_epi32(v[7], v[23]);
+ u[24] = _mm_sub_epi32(v[8], v[24]);
+ u[25] = _mm_sub_epi32(v[9], v[25]);
+ u[26] = _mm_sub_epi32(v[10], v[26]);
+ u[27] = _mm_sub_epi32(v[11], v[27]);
+ u[28] = _mm_sub_epi32(v[12], v[28]);
+ u[29] = _mm_sub_epi32(v[13], v[29]);
+ u[30] = _mm_sub_epi32(v[14], v[30]);
+ u[31] = _mm_sub_epi32(v[15], v[31]);
+
+ u[0] = dct_const_round_shift_sse2(u[0]);
+ u[1] = dct_const_round_shift_sse2(u[1]);
+ u[2] = dct_const_round_shift_sse2(u[2]);
+ u[3] = dct_const_round_shift_sse2(u[3]);
+ u[4] = dct_const_round_shift_sse2(u[4]);
+ u[5] = dct_const_round_shift_sse2(u[5]);
+ u[6] = dct_const_round_shift_sse2(u[6]);
+ u[7] = dct_const_round_shift_sse2(u[7]);
+ u[8] = dct_const_round_shift_sse2(u[8]);
+ u[9] = dct_const_round_shift_sse2(u[9]);
+ u[10] = dct_const_round_shift_sse2(u[10]);
+ u[11] = dct_const_round_shift_sse2(u[11]);
+ u[12] = dct_const_round_shift_sse2(u[12]);
+ u[13] = dct_const_round_shift_sse2(u[13]);
+ u[14] = dct_const_round_shift_sse2(u[14]);
+ u[15] = dct_const_round_shift_sse2(u[15]);
+ u[16] = dct_const_round_shift_sse2(u[16]);
+ u[17] = dct_const_round_shift_sse2(u[17]);
+ u[18] = dct_const_round_shift_sse2(u[18]);
+ u[19] = dct_const_round_shift_sse2(u[19]);
+ u[20] = dct_const_round_shift_sse2(u[20]);
+ u[21] = dct_const_round_shift_sse2(u[21]);
+ u[22] = dct_const_round_shift_sse2(u[22]);
+ u[23] = dct_const_round_shift_sse2(u[23]);
+ u[24] = dct_const_round_shift_sse2(u[24]);
+ u[25] = dct_const_round_shift_sse2(u[25]);
+ u[26] = dct_const_round_shift_sse2(u[26]);
+ u[27] = dct_const_round_shift_sse2(u[27]);
+ u[28] = dct_const_round_shift_sse2(u[28]);
+ u[29] = dct_const_round_shift_sse2(u[29]);
+ u[30] = dct_const_round_shift_sse2(u[30]);
+ u[31] = dct_const_round_shift_sse2(u[31]);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_packs_epi32(u[8], u[9]);
+ s[5] = _mm_packs_epi32(u[10], u[11]);
+ s[6] = _mm_packs_epi32(u[12], u[13]);
+ s[7] = _mm_packs_epi32(u[14], u[15]);
+ s[8] = _mm_packs_epi32(u[16], u[17]);
+ s[9] = _mm_packs_epi32(u[18], u[19]);
+ s[10] = _mm_packs_epi32(u[20], u[21]);
+ s[11] = _mm_packs_epi32(u[22], u[23]);
+ s[12] = _mm_packs_epi32(u[24], u[25]);
+ s[13] = _mm_packs_epi32(u[26], u[27]);
+ s[14] = _mm_packs_epi32(u[28], u[29]);
+ s[15] = _mm_packs_epi32(u[30], u[31]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+ u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ u[0] = dct_const_round_shift_sse2(u[0]);
+ u[1] = dct_const_round_shift_sse2(u[1]);
+ u[2] = dct_const_round_shift_sse2(u[2]);
+ u[3] = dct_const_round_shift_sse2(u[3]);
+ u[4] = dct_const_round_shift_sse2(u[4]);
+ u[5] = dct_const_round_shift_sse2(u[5]);
+ u[6] = dct_const_round_shift_sse2(u[6]);
+ u[7] = dct_const_round_shift_sse2(u[7]);
+ u[8] = dct_const_round_shift_sse2(u[8]);
+ u[9] = dct_const_round_shift_sse2(u[9]);
+ u[10] = dct_const_round_shift_sse2(u[10]);
+ u[11] = dct_const_round_shift_sse2(u[11]);
+ u[12] = dct_const_round_shift_sse2(u[12]);
+ u[13] = dct_const_round_shift_sse2(u[13]);
+ u[14] = dct_const_round_shift_sse2(u[14]);
+ u[15] = dct_const_round_shift_sse2(u[15]);
+
+ x[0] = _mm_add_epi16(s[0], s[4]);
+ x[1] = _mm_add_epi16(s[1], s[5]);
+ x[2] = _mm_add_epi16(s[2], s[6]);
+ x[3] = _mm_add_epi16(s[3], s[7]);
+ x[4] = _mm_sub_epi16(s[0], s[4]);
+ x[5] = _mm_sub_epi16(s[1], s[5]);
+ x[6] = _mm_sub_epi16(s[2], s[6]);
+ x[7] = _mm_sub_epi16(s[3], s[7]);
+ x[8] = _mm_packs_epi32(u[0], u[1]);
+ x[9] = _mm_packs_epi32(u[2], u[3]);
+ x[10] = _mm_packs_epi32(u[4], u[5]);
+ x[11] = _mm_packs_epi32(u[6], u[7]);
+ x[12] = _mm_packs_epi32(u[8], u[9]);
+ x[13] = _mm_packs_epi32(u[10], u[11]);
+ x[14] = _mm_packs_epi32(u[12], u[13]);
+ x[15] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+ u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+ u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+ u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+ u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+ u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+ u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+ u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ v[0] = dct_const_round_shift_sse2(u[0]);
+ v[1] = dct_const_round_shift_sse2(u[1]);
+ v[2] = dct_const_round_shift_sse2(u[2]);
+ v[3] = dct_const_round_shift_sse2(u[3]);
+ v[4] = dct_const_round_shift_sse2(u[4]);
+ v[5] = dct_const_round_shift_sse2(u[5]);
+ v[6] = dct_const_round_shift_sse2(u[6]);
+ v[7] = dct_const_round_shift_sse2(u[7]);
+ v[8] = dct_const_round_shift_sse2(u[8]);
+ v[9] = dct_const_round_shift_sse2(u[9]);
+ v[10] = dct_const_round_shift_sse2(u[10]);
+ v[11] = dct_const_round_shift_sse2(u[11]);
+ v[12] = dct_const_round_shift_sse2(u[12]);
+ v[13] = dct_const_round_shift_sse2(u[13]);
+ v[14] = dct_const_round_shift_sse2(u[14]);
+ v[15] = dct_const_round_shift_sse2(u[15]);
+
+ s[0] = _mm_add_epi16(x[0], x[2]);
+ s[1] = _mm_add_epi16(x[1], x[3]);
+ s[2] = _mm_sub_epi16(x[0], x[2]);
+ s[3] = _mm_sub_epi16(x[1], x[3]);
+ s[4] = _mm_packs_epi32(v[0], v[1]);
+ s[5] = _mm_packs_epi32(v[2], v[3]);
+ s[6] = _mm_packs_epi32(v[4], v[5]);
+ s[7] = _mm_packs_epi32(v[6], v[7]);
+ s[8] = _mm_add_epi16(x[8], x[10]);
+ s[9] = _mm_add_epi16(x[9], x[11]);
+ s[10] = _mm_sub_epi16(x[8], x[10]);
+ s[11] = _mm_sub_epi16(x[9], x[11]);
+ s[12] = _mm_packs_epi32(v[8], v[9]);
+ s[13] = _mm_packs_epi32(v[10], v[11]);
+ s[14] = _mm_packs_epi32(v[12], v[13]);
+ s[15] = _mm_packs_epi32(v[14], v[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16);
+ in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
+ in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
+ in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
+ in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16);
+ in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16);
+ in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16);
+ in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16);
+
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[8]);
+ in[2] = s[12];
+ in[3] = _mm_sub_epi16(kZero, s[4]);
+ in[12] = s[5];
+ in[13] = _mm_sub_epi16(kZero, s[13]);
+ in[14] = s[9];
+ in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+void idct16_sse2(__m128i *const in0, __m128i *const in1) {
+ transpose_16bit_16x16(in0, in1);
+ idct16_8col(in0, in0);
+ idct16_8col(in1, in1);
+}
+
+void iadst16_sse2(__m128i *const in0, __m128i *const in1) {
+ transpose_16bit_16x16(in0, in1);
+ vpx_iadst16_8col_sse2(in0);
+ vpx_iadst16_8col_sse2(in1);
+}
+
+// Group the coefficient calculation into smaller functions to prevent stack
+// spillover in 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+
+// For each 8x32 block __m128i in[32]:
+// inputs with index 0 and 4
+// produce output pixels 0-7 in __m128i out[32].
+static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[8]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ butterfly(in[4], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+
+ // stage 4
+ step2[0] = butterfly_cospi16(in[0]);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[0];
+ step1[2] = step2[0];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi16(step1[0], step1[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step1[4]);
+ out[4] = _mm_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step1[7]);
+}
+
+// For each 8x32 block __m128i in[32]:
+// inputs with index 2 and 6
+// produce output pixels 8-15 in __m128i out[32].
+static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[16]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ butterfly(in[2], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly(zero, in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+
+ idct32_8x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_34_8x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_34_8x32_quarter_1(in, temp);
+ idct32_34_8x32_quarter_2(in, temp);
+ // stage 7
+ add_sub_butterfly(temp, out, 16);
+}
+
+// For each 8x32 block __m128i in[32]:
+// inputs with odd index 1, 3, 5, 7
+// produce output pixels 16-23 and 24-31 in __m128i out[32].
+static INLINE void idct32_34_8x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i step1[32];
+
+ // stage 1
+ butterfly(in[1], zero, cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+ butterfly(zero, in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
+ butterfly(in[5], zero, cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+ butterfly(zero, in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
+
+ // stage 3
+ butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+
+ idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
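+// 1-D 32-point idct for the 34-coefficient case: only in[0] through in[7]
+// are read.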
+void idct32_34_8x32_sse2(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[32]*/) {
+ __m128i temp[32];
+
+ idct32_34_8x32_quarter_1_2(in, temp);
+ idct32_34_8x32_quarter_3_4(in, temp);
+ // final stage
+ add_sub_butterfly(temp, out, 32);
+}
+
+// Used when only the upper-left 8x8 block has non-zero coefficients.
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i io[32], col[32];
+ int i;
+
+ // Load input data. Only need to load the top left 8x8 block.
+ load_transpose_16bit_8x8(input, 32, io);
+ idct32_34_8x32_sse2(io, col);
+
+ for (i = 0; i < 32; i += 8) {
+ int j;
+ transpose_16bit_8x8(col + i, io);
+ idct32_34_8x32_sse2(io, io);
+
+ for (j = 0; j < 32; ++j) {
+ write_buffer_8x1(dest + j * stride, io[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+// For each 8x32 block __m128i in[32]:
+// inputs with index 0, 4, 8, 12, 16, 20, 24, 28
+// produce output pixels 0-7 in __m128i out[32].
+static INLINE void idct32_1024_8x32_quarter_1(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ butterfly(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+
+ // stage 4
+ butterfly(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi16(step1[0], step1[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step1[4]);
+ out[4] = _mm_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step1[7]);
+}
+
+// For each 8x32 block __m128i in[32]:
+// inputs with index 2, 6, 10, 14, 18, 22, 26, 30
+// produce output pixels 8-15 in __m128i out[32].
+static INLINE void idct32_1024_8x32_quarter_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ butterfly(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+ butterfly(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+ butterfly(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm_add_epi16(step2[11], step2[10]);
+ step1[12] = _mm_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm_add_epi16(step2[15], step2[14]);
+
+ idct32_8x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_1024_8x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_1024_8x32_quarter_1(in, temp);
+ idct32_1024_8x32_quarter_2(in, temp);
+ // stage 7
+ add_sub_butterfly(temp, out, 16);
+}
+
+// For each 8x32 block __m128i in[32]:
+// inputs with odd index
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// produce output pixels 16-23 and 24-31 in __m128i out[32].
+static INLINE void idct32_1024_8x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ butterfly(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+ butterfly(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]);
+ butterfly(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]);
+ butterfly(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
+
+ butterfly(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+ butterfly(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]);
+
+ butterfly(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]);
+ butterfly(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi16(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi16(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi16(step1[19], step1[18]);
+ step2[19] = _mm_add_epi16(step1[19], step1[18]);
+ step2[20] = _mm_add_epi16(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi16(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi16(step1[23], step1[22]);
+ step2[23] = _mm_add_epi16(step1[23], step1[22]);
+
+ step2[24] = _mm_add_epi16(step1[24], step1[25]);
+ step2[25] = _mm_sub_epi16(step1[24], step1[25]);
+ step2[26] = _mm_sub_epi16(step1[27], step1[26]);
+ step2[27] = _mm_add_epi16(step1[27], step1[26]);
+ step2[28] = _mm_add_epi16(step1[28], step1[29]);
+ step2[29] = _mm_sub_epi16(step1[28], step1[29]);
+ step2[30] = _mm_sub_epi16(step1[31], step1[30]);
+ step2[31] = _mm_add_epi16(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
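+// Full 1-D 32-point idct of an 8x32 block held in __m128i in[32].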
+void idct32_1024_8x32(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[32]*/) {
+ __m128i temp[32];
+
+ idct32_1024_8x32_quarter_1_2(in, temp);
+ idct32_1024_8x32_quarter_3_4(in, temp);
+ // final stage
+ add_sub_butterfly(temp, out, 32);
+}
+
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i col[4][32], io[32];
+ int i;
+
+ // rows
+ for (i = 0; i < 4; i++) {
+ load_transpose_16bit_8x8(&input[0], 32, &io[0]);
+ load_transpose_16bit_8x8(&input[8], 32, &io[8]);
+ load_transpose_16bit_8x8(&input[16], 32, &io[16]);
+ load_transpose_16bit_8x8(&input[24], 32, &io[24]);
+ idct32_1024_8x32(io, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ // Transpose 32x8 block to 8x32 block
+ transpose_16bit_8x8(col[0] + i, io);
+ transpose_16bit_8x8(col[1] + i, io + 8);
+ transpose_16bit_8x8(col[2] + i, io + 16);
+ transpose_16bit_8x8(col[3] + i, io + 24);
+
+ idct32_1024_8x32(io, io);
+ store_buffer_8x32(io, dest, stride);
+ dest += 8;
+ }
+}
+
+void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i col[2][32], in[32], out[32];
+ int i;
+
+ for (i = 16; i < 32; i++) {
+ in[i] = _mm_setzero_si128();
+ }
+
+ // rows
+ for (i = 0; i < 2; i++) {
+ load_transpose_16bit_8x8(&input[0], 32, &in[0]);
+ load_transpose_16bit_8x8(&input[8], 32, &in[8]);
+ idct32_1024_8x32(in, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col[0] + i, in);
+ transpose_16bit_8x8(col[1] + i, in + 8);
+ idct32_1024_8x32(in, out);
+ store_buffer_8x32(out, dest, stride);
+ dest += 8;
+ }
+}
+
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i dc_value;
+ int j;
+ tran_high_t a1;
+ tran_low_t out =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ dc_value = _mm_set1_epi16((int16_t)a1);
+
+ for (j = 0; j < 32; ++j) {
+ recon_and_store_16(dest + j * stride + 0, dc_value);
+ recon_and_store_16(dest + j * stride + 16, dc_value);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
new file mode 100644
index 0000000000..b4bbd186d2
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
@@ -0,0 +1,710 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_
+#define VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 30 31 32 33 00 01 02 03
+ // in[1]: 20 21 22 23 10 11 12 13
+ // in[2]: 40 41 42 43 70 71 72 73
+ // in[3]: 50 51 52 53 60 61 62 63
+ // to:
+ // tr0_0: 00 10 01 11 02 12 03 13
+ // tr0_1: 20 30 21 31 22 32 23 33
+ // tr0_2: 40 50 41 51 42 52 43 53
+ // tr0_3: 60 70 61 71 62 72 63 73
+ const __m128i tr0_0 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[1], in[0]);
+ const __m128i tr0_2 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[3], in[2]);
+
+ // Unpack 32 bit elements resulting in:
+ // tr1_0: 00 10 20 30 01 11 21 31
+ // tr1_1: 02 12 22 32 03 13 23 33
+ // tr1_2: 40 50 60 70 41 51 61 71
+ // tr1_3: 42 52 62 72 43 53 63 73
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+}
+
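+// Per 32-bit lane this is ROUND_POWER_OF_TWO(in, DCT_CONST_BITS): add the
+// rounding constant and shift right by 14.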
+static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) {
+ const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
+ return _mm_srai_epi32(t, DCT_CONST_BITS);
+}
+
+static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in,
+ const __m128i cospi) {
+ const __m128i t = _mm_madd_epi16(in, cospi);
+ return dct_const_round_shift_sse2(t);
+}
+
+// Calculate the dot product between in0/1 and x and wrap to short.
+static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
+ const __m128i in1,
+ const __m128i x) {
+ const __m128i t0 = idct_madd_round_shift_sse2(in0, x);
+ const __m128i t1 = idct_madd_round_shift_sse2(in1, x);
+ return _mm_packs_epi32(t0, t1);
+}
+
+// Multiply elements by constants and add them together.
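+// A scalar model of one lane, with 16-bit inputs a (from in0) and b (from
+// in1):
+//   *out0 = saturate16(ROUND_POWER_OF_TWO(a * c0 - b * c1, DCT_CONST_BITS))
+//   *out1 = saturate16(ROUND_POWER_OF_TWO(a * c1 + b * c0, DCT_CONST_BITS))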
+static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0,
+ const int c1, __m128i *const out0,
+ __m128i *const out1) {
+ const __m128i cst0 = pair_set_epi16(c0, -c1);
+ const __m128i cst1 = pair_set_epi16(c1, c0);
+ const __m128i lo = _mm_unpacklo_epi16(in0, in1);
+ const __m128i hi = _mm_unpackhi_epi16(in0, in1);
+ *out0 = idct_calc_wraplow_sse2(lo, hi, cst0);
+ *out1 = idct_calc_wraplow_sse2(lo, hi, cst1);
+}
+
+static INLINE __m128i butterfly_cospi16(const __m128i in) {
+ const __m128i cst = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i lo = _mm_unpacklo_epi16(in, _mm_setzero_si128());
+ const __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128());
+ return idct_calc_wraplow_sse2(lo, hi, cst);
+}
+
+// Functions that allow the 8-bit optimisations to be used when profile 0 is
+// decoded in a build with high bitdepth enabled.
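+// When CONFIG_VP9_HIGHBITDEPTH is defined, tran_low_t is a 32-bit type, so
+// the coefficients are packed (with saturation) down to 16 bits before the
+// 8-bit code below operates on them.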
+static INLINE __m128i load_input_data4(const tran_low_t *data) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i in = _mm_load_si128((const __m128i *)data);
+ return _mm_packs_epi32(in, zero);
+#else
+ return _mm_loadl_epi64((const __m128i *)data);
+#endif
+}
+
+static INLINE __m128i load_input_data8(const tran_low_t *data) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i in0 = _mm_load_si128((const __m128i *)data);
+ const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4));
+ return _mm_packs_epi32(in0, in1);
+#else
+ return _mm_load_si128((const __m128i *)data);
+#endif
+}
+
+static INLINE void load_transpose_16bit_8x8(const tran_low_t *input,
+ const int stride,
+ __m128i *const in) {
+ in[0] = load_input_data8(input + 0 * stride);
+ in[1] = load_input_data8(input + 1 * stride);
+ in[2] = load_input_data8(input + 2 * stride);
+ in[3] = load_input_data8(input + 3 * stride);
+ in[4] = load_input_data8(input + 4 * stride);
+ in[5] = load_input_data8(input + 5 * stride);
+ in[6] = load_input_data8(input + 6 * stride);
+ in[7] = load_input_data8(input + 7 * stride);
+ transpose_16bit_8x8(in, in);
+}
+
+static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest));
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d0 = _mm_add_epi16(in_x, d0);
+ d0 = _mm_packus_epi16(d0, d0);
+ _mm_storel_epi64((__m128i *)(dest), d0);
+}
+
+static INLINE void round_shift_8x8(const __m128i *const in,
+ __m128i *const out) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+
+ out[0] = _mm_add_epi16(in[0], final_rounding);
+ out[1] = _mm_add_epi16(in[1], final_rounding);
+ out[2] = _mm_add_epi16(in[2], final_rounding);
+ out[3] = _mm_add_epi16(in[3], final_rounding);
+ out[4] = _mm_add_epi16(in[4], final_rounding);
+ out[5] = _mm_add_epi16(in[5], final_rounding);
+ out[6] = _mm_add_epi16(in[6], final_rounding);
+ out[7] = _mm_add_epi16(in[7], final_rounding);
+
+ out[0] = _mm_srai_epi16(out[0], 5);
+ out[1] = _mm_srai_epi16(out[1], 5);
+ out[2] = _mm_srai_epi16(out[2], 5);
+ out[3] = _mm_srai_epi16(out[3], 5);
+ out[4] = _mm_srai_epi16(out[4], 5);
+ out[5] = _mm_srai_epi16(out[5], 5);
+ out[6] = _mm_srai_epi16(out[6], 5);
+ out[7] = _mm_srai_epi16(out[7], 5);
+}
+
+static INLINE void write_buffer_8x8(const __m128i *const in,
+ uint8_t *const dest, const int stride) {
+ __m128i t[8];
+
+ round_shift_8x8(in, t);
+
+ recon_and_store(dest + 0 * stride, t[0]);
+ recon_and_store(dest + 1 * stride, t[1]);
+ recon_and_store(dest + 2 * stride, t[2]);
+ recon_and_store(dest + 3 * stride, t[3]);
+ recon_and_store(dest + 4 * stride, t[4]);
+ recon_and_store(dest + 5 * stride, t[5]);
+ recon_and_store(dest + 6 * stride, t[6]);
+ recon_and_store(dest + 7 * stride, t[7]);
+}
+
+static INLINE void recon_and_store4x4_sse2(const __m128i *const in,
+ uint8_t *const dest,
+ const int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d[2];
+
+ // Reconstruction and Store
+ d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
+ d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+ d[0] = _mm_unpacklo_epi32(d[0],
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d[1] = _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
+ d[0] = _mm_unpacklo_epi8(d[0], zero);
+ d[1] = _mm_unpacklo_epi8(d[1], zero);
+ d[0] = _mm_add_epi16(d[0], in[0]);
+ d[1] = _mm_add_epi16(d[1], in[1]);
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+
+ *(int *)dest = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
+}
+
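+// Final scaling of the 32x32 inverse transform: ROUND_POWER_OF_TWO(x, 6),
+// then add the result to the destination pixels with saturation.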
+static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ int j = 0;
+ while (j < 32) {
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
+
+ in[j] = _mm_srai_epi16(in[j], 6);
+ in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
+
+ recon_and_store(dst, in[j]);
+ dst += stride;
+ recon_and_store(dst, in[j + 1]);
+ dst += stride;
+ j += 2;
+ }
+}
+
+static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ __m128i out;
+ out = _mm_adds_epi16(in, final_rounding);
+ out = _mm_srai_epi16(out, 6);
+ recon_and_store(dest, out);
+}
+
+// Addition/subtraction-only butterfly (no multiplies); size must be 16 or 32.
+static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
+ int size) {
+ int i = 0;
+ const int num = size >> 1;
+ const int bound = size - 1;
+ while (i < num) {
+ out[i] = _mm_add_epi16(in[i], in[bound - i]);
+ out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
+ i++;
+ }
+}
+
+static INLINE void idct8(const __m128i *const in /*in[8]*/,
+ __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 1
+ butterfly(in[1], in[7], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly(in[5], in[3], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+
+ // stage 2
+ butterfly(in[0], in[4], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly(in[2], in[6], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+
+ // stage 4
+ out[0] = _mm_add_epi16(step1[0], step2[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step2[4]);
+ out[4] = _mm_sub_epi16(step1[3], step2[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step2[7]);
+}
+
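+// In the 12-coefficient kernel below only the top-left 4x4 block of input
+// coefficients is non-zero, so each intermediate step has just four values.
+// A comment such as "step1 4&7" means the low 64 bits of the register hold
+// the four step1[4] values and the high 64 bits hold the four step1[7]
+// values.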
+static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ __m128i step1[8], step2[8], tmp[4];
+
+ transpose_16bit_4x4(io, io);
+ // io[0]: 00 10 20 30 01 11 21 31
+ // io[1]: 02 12 22 32 03 13 23 33
+
+ // stage 1
+ {
+ const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero);
+ const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero);
+ step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7
+ step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6
+ }
+
+ // stage 2
+ {
+ const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero);
+ const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero);
+ const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0);
+ step2[0] = _mm_packs_epi32(t, t); // step2 0&1
+ step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2
+ step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
+ step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6
+ }
+
+ // stage 3
+ {
+ const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]);
+ tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
+ tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
+ step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
+ step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
+ step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6
+ }
+
+ // stage 4
+ tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
+ tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
+ tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
+ tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
+
+ idct8x8_12_transpose_16bit_4x8(tmp, io);
+ io[4] = io[5] = io[6] = io[7] = zero;
+
+ idct8(io, io);
+}
+
+static INLINE void idct16_8col(const __m128i *const in /*in[16]*/,
+ __m128i *const out /*out[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ butterfly(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+ butterfly(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+ butterfly(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ butterfly(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+ step1[8] = _mm_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm_add_epi16(step2[10], step2[11]);
+ step1[12] = _mm_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm_add_epi16(step2[14], step2[15]);
+
+ // stage 4
+ butterfly(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13],
+ &step2[10]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step1[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step1[7] = _mm_add_epi16(step1[6], step1[7]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[8] = _mm_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = _mm_add_epi16(step1[0], step1[7]);
+ step2[1] = _mm_add_epi16(step1[1], step1[6]);
+ step2[2] = _mm_add_epi16(step1[2], step1[5]);
+ step2[3] = _mm_add_epi16(step1[3], step1[4]);
+ step2[4] = _mm_sub_epi16(step1[3], step1[4]);
+ step2[5] = _mm_sub_epi16(step1[2], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[1], step1[6]);
+ step2[7] = _mm_sub_epi16(step1[0], step1[7]);
+ butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+ &step2[13]);
+ butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+ &step2[12]);
+
+ // stage 7
+ out[0] = _mm_add_epi16(step2[0], step1[15]);
+ out[1] = _mm_add_epi16(step2[1], step1[14]);
+ out[2] = _mm_add_epi16(step2[2], step2[13]);
+ out[3] = _mm_add_epi16(step2[3], step2[12]);
+ out[4] = _mm_add_epi16(step2[4], step2[11]);
+ out[5] = _mm_add_epi16(step2[5], step2[10]);
+ out[6] = _mm_add_epi16(step2[6], step1[9]);
+ out[7] = _mm_add_epi16(step2[7], step1[8]);
+ out[8] = _mm_sub_epi16(step2[7], step1[8]);
+ out[9] = _mm_sub_epi16(step2[6], step1[9]);
+ out[10] = _mm_sub_epi16(step2[5], step2[10]);
+ out[11] = _mm_sub_epi16(step2[4], step2[11]);
+ out[12] = _mm_sub_epi16(step2[3], step2[12]);
+ out[13] = _mm_sub_epi16(step2[2], step2[13]);
+ out[14] = _mm_sub_epi16(step2[1], step1[14]);
+ out[15] = _mm_sub_epi16(step2[0], step1[15]);
+}
+
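+// The 10-coefficient 16x16 passes below use the same packed convention as the
+// 8x8 12-coefficient kernel: only the top-left 4x4 block of coefficients is
+// non-zero, and e.g. "step2 8&15" keeps the four step2[8] values in the low
+// 64 bits and the four step2[15] values in the high 64 bits.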
+static INLINE void idct16x16_10_pass1(const __m128i *const input /*input[4]*/,
+ __m128i *const output /*output[16]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i step1[16], step2[16];
+
+ transpose_16bit_4x4(input, output);
+
+ // stage 2
+ {
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero);
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, output[1]);
+ step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30,
+ lo_1_15); // step2 8&15
+ step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06,
+ lo_13_3); // step2 11&12
+ }
+
+ // stage 3
+ {
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero);
+ step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28,
+ lo_2_14); // step1 4&7
+ step1[13] = _mm_unpackhi_epi64(step2[11], zero);
+ step1[14] = _mm_unpackhi_epi64(step2[8], zero);
+ }
+
+ // stage 4
+ {
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero);
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]);
+ const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16);
+ step1[0] = _mm_packs_epi32(t, t); // step2 0&1
+ step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08,
+ lo_9_14); // step2 9&14
+ step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, k__cospi_m08_p24,
+ lo_10_13); // step2 10&13
+ step2[6] = _mm_unpackhi_epi64(step1[4], zero);
+ }
+
+ // stage 5
+ {
+ const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]);
+ step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16,
+ lo_5_6); // step1 6&5
+ step1[8] = _mm_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm_unpackhi_epi64(step1[11], zero);
+ step1[13] = _mm_unpackhi_epi64(step1[10], zero);
+ step1[14] = _mm_unpackhi_epi64(step1[9], zero);
+ step1[15] = _mm_unpackhi_epi64(step1[8], zero);
+ }
+
+ // stage 6
+ {
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]);
+ step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16,
+ lo_10_13); // step2 10&13
+ step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16,
+ lo_11_12); // step2 11&12
+ step2[13] = _mm_unpackhi_epi64(step2[10], zero);
+ step2[12] = _mm_unpackhi_epi64(step2[11], zero);
+ step2[3] = _mm_add_epi16(step1[0], step1[4]);
+ step2[1] = _mm_add_epi16(step1[0], step1[6]);
+ step2[6] = _mm_sub_epi16(step1[0], step1[6]);
+ step2[4] = _mm_sub_epi16(step1[0], step1[4]);
+ step2[0] = _mm_unpackhi_epi64(step2[3], zero);
+ step2[2] = _mm_unpackhi_epi64(step2[1], zero);
+ step2[5] = _mm_unpackhi_epi64(step2[6], zero);
+ step2[7] = _mm_unpackhi_epi64(step2[4], zero);
+ }
+
+ // stage 7. Left 8x16 only.
+ output[0] = _mm_add_epi16(step2[0], step1[15]);
+ output[1] = _mm_add_epi16(step2[1], step1[14]);
+ output[2] = _mm_add_epi16(step2[2], step2[13]);
+ output[3] = _mm_add_epi16(step2[3], step2[12]);
+ output[4] = _mm_add_epi16(step2[4], step2[11]);
+ output[5] = _mm_add_epi16(step2[5], step2[10]);
+ output[6] = _mm_add_epi16(step2[6], step1[9]);
+ output[7] = _mm_add_epi16(step2[7], step1[8]);
+ output[8] = _mm_sub_epi16(step2[7], step1[8]);
+ output[9] = _mm_sub_epi16(step2[6], step1[9]);
+ output[10] = _mm_sub_epi16(step2[5], step2[10]);
+ output[11] = _mm_sub_epi16(step2[4], step2[11]);
+ output[12] = _mm_sub_epi16(step2[3], step2[12]);
+ output[13] = _mm_sub_epi16(step2[2], step2[13]);
+ output[14] = _mm_sub_epi16(step2[1], step1[14]);
+ output[15] = _mm_sub_epi16(step2[0], step1[15]);
+}
+
+static INLINE void idct16x16_10_pass2(__m128i *const l /*l[8]*/,
+ __m128i *const io /*io[16]*/) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i step1[16], step2[16];
+
+ transpose_16bit_4x8(l, io);
+
+ // stage 2
+ butterfly(io[1], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly(zero, io[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ butterfly(io[2], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+
+ // stage 4
+ step1[0] = butterfly_cospi16(io[0]);
+ butterfly(step2[15], step2[8], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly(step2[11], step2[12], -cospi_8_64, -cospi_24_64, &step2[13],
+ &step2[10]);
+
+ // stage 5
+ butterfly(step1[7], step1[4], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[8] = _mm_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = _mm_add_epi16(step1[0], step1[7]);
+ step2[1] = _mm_add_epi16(step1[0], step1[6]);
+ step2[2] = _mm_add_epi16(step1[0], step1[5]);
+ step2[3] = _mm_add_epi16(step1[0], step1[4]);
+ step2[4] = _mm_sub_epi16(step1[0], step1[4]);
+ step2[5] = _mm_sub_epi16(step1[0], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[0], step1[6]);
+ step2[7] = _mm_sub_epi16(step1[0], step1[7]);
+ butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+ &step2[13]);
+ butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+ &step2[12]);
+
+ // stage 7
+ io[0] = _mm_add_epi16(step2[0], step1[15]);
+ io[1] = _mm_add_epi16(step2[1], step1[14]);
+ io[2] = _mm_add_epi16(step2[2], step2[13]);
+ io[3] = _mm_add_epi16(step2[3], step2[12]);
+ io[4] = _mm_add_epi16(step2[4], step2[11]);
+ io[5] = _mm_add_epi16(step2[5], step2[10]);
+ io[6] = _mm_add_epi16(step2[6], step1[9]);
+ io[7] = _mm_add_epi16(step2[7], step1[8]);
+ io[8] = _mm_sub_epi16(step2[7], step1[8]);
+ io[9] = _mm_sub_epi16(step2[6], step1[9]);
+ io[10] = _mm_sub_epi16(step2[5], step2[10]);
+ io[11] = _mm_sub_epi16(step2[4], step2[11]);
+ io[12] = _mm_sub_epi16(step2[3], step2[12]);
+ io[13] = _mm_sub_epi16(step2[2], step2[13]);
+ io[14] = _mm_sub_epi16(step2[1], step1[14]);
+ io[15] = _mm_sub_epi16(step2[0], step1[15]);
+}
+
+static INLINE void idct32_8x32_quarter_2_stage_4_to_6(
+ __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10],
+ &step2[13]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[8] = _mm_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ out[8] = step1[8];
+ out[9] = step1[9];
+ butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], &out[13]);
+ butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], &out[12]);
+ out[14] = step1[14];
+ out[15] = step1[15];
+}
+
+static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7(
+ __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step2[32];
+
+ // stage 4
+ step2[16] = _mm_add_epi16(step1[16], step1[19]);
+ step2[17] = _mm_add_epi16(step1[17], step1[18]);
+ step2[18] = _mm_sub_epi16(step1[17], step1[18]);
+ step2[19] = _mm_sub_epi16(step1[16], step1[19]);
+ step2[20] = _mm_sub_epi16(step1[23], step1[20]);
+ step2[21] = _mm_sub_epi16(step1[22], step1[21]);
+ step2[22] = _mm_add_epi16(step1[22], step1[21]);
+ step2[23] = _mm_add_epi16(step1[23], step1[20]);
+
+ step2[24] = _mm_add_epi16(step1[24], step1[27]);
+ step2[25] = _mm_add_epi16(step1[25], step1[26]);
+ step2[26] = _mm_sub_epi16(step1[25], step1[26]);
+ step2[27] = _mm_sub_epi16(step1[24], step1[27]);
+ step2[28] = _mm_sub_epi16(step1[31], step1[28]);
+ step2[29] = _mm_sub_epi16(step1[30], step1[29]);
+ step2[30] = _mm_add_epi16(step1[29], step1[30]);
+ step2[31] = _mm_add_epi16(step1[28], step1[31]);
+
+ // stage 5
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ butterfly(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18],
+ &step1[29]);
+ butterfly(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19],
+ &step1[28]);
+ butterfly(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20],
+ &step1[27]);
+ butterfly(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21],
+ &step1[26]);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ out[16] = _mm_add_epi16(step1[16], step1[23]);
+ out[17] = _mm_add_epi16(step1[17], step1[22]);
+ out[18] = _mm_add_epi16(step1[18], step1[21]);
+ out[19] = _mm_add_epi16(step1[19], step1[20]);
+ step2[20] = _mm_sub_epi16(step1[19], step1[20]);
+ step2[21] = _mm_sub_epi16(step1[18], step1[21]);
+ step2[22] = _mm_sub_epi16(step1[17], step1[22]);
+ step2[23] = _mm_sub_epi16(step1[16], step1[23]);
+
+ step2[24] = _mm_sub_epi16(step1[31], step1[24]);
+ step2[25] = _mm_sub_epi16(step1[30], step1[25]);
+ step2[26] = _mm_sub_epi16(step1[29], step1[26]);
+ step2[27] = _mm_sub_epi16(step1[28], step1[27]);
+ out[28] = _mm_add_epi16(step1[27], step1[28]);
+ out[29] = _mm_add_epi16(step1[26], step1[29]);
+ out[30] = _mm_add_epi16(step1[25], step1[30]);
+ out[31] = _mm_add_epi16(step1[24], step1[31]);
+
+ // stage 7
+ butterfly(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], &out[27]);
+ butterfly(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], &out[26]);
+ butterfly(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], &out[25]);
+ butterfly(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], &out[24]);
+}
+
+void idct4_sse2(__m128i *const in);
+void vpx_idct8_sse2(__m128i *const in);
+void idct16_sse2(__m128i *const in0, __m128i *const in1);
+void iadst4_sse2(__m128i *const in);
+void iadst8_sse2(__m128i *const in);
+void vpx_iadst16_8col_sse2(__m128i *const in);
+void iadst16_sse2(__m128i *const in0, __m128i *const in1);
+void idct32_1024_8x32(const __m128i *const in, __m128i *const out);
+void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out);
+void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out);
+
+#endif // VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
new file mode 100644
index 0000000000..6e99469b63
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void partial_butterfly_ssse3(const __m128i in, const int c0,
+ const int c1, __m128i *const out0,
+ __m128i *const out1) {
+ const __m128i cst0 = _mm_set1_epi16(2 * c0);
+ const __m128i cst1 = _mm_set1_epi16(2 * c1);
+ *out0 = _mm_mulhrs_epi16(in, cst0);
+ *out1 = _mm_mulhrs_epi16(in, cst1);
+}
+
+static INLINE __m128i partial_butterfly_cospi16_ssse3(const __m128i in) {
+ const __m128i coef_pair = _mm_set1_epi16(2 * cospi_16_64);
+ return _mm_mulhrs_epi16(in, coef_pair);
+}
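+// For both helpers above, _mm_mulhrs_epi16(a, 2 * c) computes
+// round(a * c / 2^14), which matches the rounding of the SSE2 butterfly()
+// when one input of the coefficient pair is known to be zero -- hence
+// "partial" butterfly.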
+
+void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i io[8];
+
+ io[0] = load_input_data4(input + 0 * 8);
+ io[1] = load_input_data4(input + 1 * 8);
+ io[2] = load_input_data4(input + 2 * 8);
+ io[3] = load_input_data4(input + 3 * 8);
+
+ idct8x8_12_add_kernel_ssse3(io);
+ write_buffer_8x8(io, dest, stride);
+}
+
+// Group the coefficient calculations into smaller functions to limit register
+// spills to the stack in the 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+
+// For each 8x32 block __m128i in[32],
+// Input with index, 0, 4
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+
+ // stage 4
+ step2[0] = partial_butterfly_cospi16_ssse3(in[0]);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[0];
+ step1[2] = step2[0];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi16(step1[0], step1[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step1[4]);
+ out[4] = _mm_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step1[7]);
+}
+
+// For each 8x32 block __m128i in[32],
+// Inputs with index 2 and 6
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+
+ idct32_8x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_34_8x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_34_8x32_quarter_1(in, temp);
+ idct32_34_8x32_quarter_2(in, temp);
+ // stage 7
+ add_sub_butterfly(temp, out, 16);
+}
+
+// For each 8x32 block __m128i in[32],
+// Inputs with odd index: 1, 3, 5, 7
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32];
+
+ // stage 1
+ partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+ &step1[28]);
+ partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+ &step1[24]);
+
+ // stage 3
+ butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+
+ idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+void idct32_34_8x32_ssse3(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[32]*/) {
+ __m128i temp[32];
+
+ idct32_34_8x32_quarter_1_2(in, temp);
+ idct32_34_8x32_quarter_3_4(in, temp);
+ // final stage
+ add_sub_butterfly(temp, out, 32);
+}
+
+// Only the upper-left 8x8 block has non-zero coefficients.
+void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i io[32], col[32];
+ int i;
+
+ // Load input data. Only need to load the top left 8x8 block.
+ load_transpose_16bit_8x8(input, 32, io);
+ idct32_34_8x32_ssse3(io, col);
+
+ for (i = 0; i < 32; i += 8) {
+ int j;
+ transpose_16bit_8x8(col + i, io);
+ idct32_34_8x32_ssse3(io, io);
+
+ for (j = 0; j < 32; ++j) {
+ write_buffer_8x1(dest + j * stride, io[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+// For each 8x32 block __m128i in[32],
+// Inputs with index 0, 4, 8, 12
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void idct32_135_8x32_quarter_1(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[8]*/) {
+ __m128i step1[8], step2[8];
+
+ // stage 3
+ partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ partial_butterfly_ssse3(in[12], -cospi_20_64, cospi_12_64, &step1[5],
+ &step1[6]);
+
+ // stage 4
+ step2[0] = partial_butterfly_cospi16_ssse3(in[0]);
+ partial_butterfly_ssse3(in[8], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm_add_epi16(step1[0], step1[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step1[4]);
+ out[4] = _mm_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step1[7]);
+}
+
+// For each 8x32 block __m128i in[32],
+// Inputs with index 2, 6, 10, 14
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void idct32_135_8x32_quarter_2(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[16]*/) {
+ __m128i step1[16], step2[16];
+
+ // stage 2
+ partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8],
+ &step2[15]);
+ partial_butterfly_ssse3(in[14], -cospi_18_64, cospi_14_64, &step2[9],
+ &step2[14]);
+ partial_butterfly_ssse3(in[10], cospi_22_64, cospi_10_64, &step2[10],
+ &step2[13]);
+ partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[8] = _mm_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm_add_epi16(step2[11], step2[10]);
+ step1[12] = _mm_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm_add_epi16(step2[15], step2[14]);
+
+ idct32_8x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_135_8x32_quarter_1_2(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_135_8x32_quarter_1(in, temp);
+ idct32_135_8x32_quarter_2(in, temp);
+ // stage 7
+ add_sub_butterfly(temp, out, 16);
+}
+
+// For each 8x32 block __m128i in[32],
+// Inputs with odd index:
+// 1, 3, 5, 7, 9, 11, 13, 15
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void idct32_135_8x32_quarter_3_4(
+ const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+ __m128i step1[32], step2[32];
+
+ // stage 1
+ partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16],
+ &step1[31]);
+ partial_butterfly_ssse3(in[15], -cospi_17_64, cospi_15_64, &step1[17],
+ &step1[30]);
+ partial_butterfly_ssse3(in[9], cospi_23_64, cospi_9_64, &step1[18],
+ &step1[29]);
+ partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+ &step1[28]);
+
+ partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20],
+ &step1[27]);
+ partial_butterfly_ssse3(in[11], -cospi_21_64, cospi_11_64, &step1[21],
+ &step1[26]);
+
+ partial_butterfly_ssse3(in[13], cospi_19_64, cospi_13_64, &step1[22],
+ &step1[25]);
+ partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+ &step1[24]);
+
+ // stage 2
+ step2[16] = _mm_add_epi16(step1[16], step1[17]);
+ step2[17] = _mm_sub_epi16(step1[16], step1[17]);
+ step2[18] = _mm_sub_epi16(step1[19], step1[18]);
+ step2[19] = _mm_add_epi16(step1[19], step1[18]);
+ step2[20] = _mm_add_epi16(step1[20], step1[21]);
+ step2[21] = _mm_sub_epi16(step1[20], step1[21]);
+ step2[22] = _mm_sub_epi16(step1[23], step1[22]);
+ step2[23] = _mm_add_epi16(step1[23], step1[22]);
+
+ step2[24] = _mm_add_epi16(step1[24], step1[25]);
+ step2[25] = _mm_sub_epi16(step1[24], step1[25]);
+ step2[26] = _mm_sub_epi16(step1[27], step1[26]);
+ step2[27] = _mm_add_epi16(step1[27], step1[26]);
+ step2[28] = _mm_add_epi16(step1[28], step1[29]);
+ step2[29] = _mm_sub_epi16(step1[28], step1[29]);
+ step2[30] = _mm_sub_epi16(step1[31], step1[30]);
+ step2[31] = _mm_add_epi16(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+void idct32_135_8x32_ssse3(const __m128i *const in /*in[32]*/,
+ __m128i *const out /*out[32]*/) {
+ __m128i temp[32];
+ idct32_135_8x32_quarter_1_2(in, temp);
+ idct32_135_8x32_quarter_3_4(in, temp);
+ // final stage
+ add_sub_butterfly(temp, out, 32);
+}
+
+void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i col[2][32], io[32];
+ int i;
+
+ // rows
+ for (i = 0; i < 2; i++) {
+ load_transpose_16bit_8x8(&input[0], 32, &io[0]);
+ load_transpose_16bit_8x8(&input[8], 32, &io[8]);
+ idct32_135_8x32_ssse3(io, col[i]);
+ input += 32 << 3;
+ }
+
+ // columns
+ for (i = 0; i < 32; i += 8) {
+ transpose_16bit_8x8(col[0] + i, io);
+ transpose_16bit_8x8(col[1] + i, io + 8);
+ idct32_135_8x32_ssse3(io, io);
+ store_buffer_8x32(io, dest, stride);
+ dest += 8;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h
new file mode 100644
index 0000000000..e9f0f69033
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
+#define VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) {
+ const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
+ const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
+ const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
+ const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64));
+ const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64));
+ const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64));
+ const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64));
+ const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64));
+ const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64));
+ const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64));
+ __m128i step1[8], step2[8], tmp[4];
+
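+  // dual_set_epi16(a, b) places a in the low four lanes and b in the high
+  // four, so a single _mm_mulhrs_epi16 produces two butterfly outputs at
+  // once (e.g. "step1 4&7": step1[4] in the low 64 bits, step1[7] in the
+  // high 64 bits).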
+ // pass 1
+
+ transpose_16bit_4x4(io, io);
+ // io[0]: 00 10 20 30 01 11 21 31
+ // io[1]: 02 12 22 32 03 13 23 33
+
+ // stage 1
+ tmp[0] = _mm_unpacklo_epi64(io[0], io[0]);
+ tmp[1] = _mm_unpackhi_epi64(io[0], io[0]);
+ tmp[2] = _mm_unpacklo_epi64(io[1], io[1]);
+ tmp[3] = _mm_unpackhi_epi64(io[1], io[1]);
+ step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7
+ step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6
+
+ // stage 2
+ step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1
+ step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2
+ step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
+ step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6
+
+ // stage 3
+ tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
+ step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6
+ tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
+ tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
+ step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
+ step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
+
+ // stage 4
+ tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
+ tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
+ tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
+ tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
+
+ // pass 2
+
+ idct8x8_12_transpose_16bit_4x8(tmp, io);
+
+ // stage 1
+ step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d);
+ step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d);
+ step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d);
+ step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d);
+
+ // stage 2
+ step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0]
+ step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d);
+ step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+
+ // stage 4
+ io[0] = _mm_add_epi16(step1[0], step2[7]);
+ io[1] = _mm_add_epi16(step1[1], step1[6]);
+ io[2] = _mm_add_epi16(step1[2], step1[5]);
+ io[3] = _mm_add_epi16(step1[3], step2[4]);
+ io[4] = _mm_sub_epi16(step1[3], step2[4]);
+ io[5] = _mm_sub_epi16(step1[2], step1[5]);
+ io[6] = _mm_sub_epi16(step1[1], step1[6]);
+ io[7] = _mm_sub_epi16(step1[0], step2[7]);
+}
+
+void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out);
+
+#endif // VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
new file mode 100644
index 0000000000..bcf1a6ef98
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
@@ -0,0 +1,103 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
+
+SECTION .text
+
+%macro REORDER_INPUTS 0
+  ; reorder m0-m3 from a c d b to a b c d
+ SWAP 1, 3, 2
+%endmacro
+
+%macro TRANSFORM_COLS 0
+ ; input:
+ ; m0 a
+ ; m1 b
+ ; m2 c
+ ; m3 d
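+  ; scalar equivalent (as in vpx_iwht4x4_16_add_c):
+  ;   a += c; d -= b; e = (a - d) >> 1; b = e - b; c = e - c; a -= b; d += c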
+ paddw m0, m2
+ psubw m3, m1
+
+ ; wide subtract
+ punpcklwd m4, m0
+ punpcklwd m5, m3
+ psrad m4, 16
+ psrad m5, 16
+ psubd m4, m5
+ psrad m4, 1
+ packssdw m4, m4 ; e
+
+ psubw m5, m4, m1 ; b
+ psubw m4, m2 ; c
+ psubw m0, m5
+ paddw m3, m4
+ ; m0 a
+ SWAP 1, 5 ; m1 b
+ SWAP 2, 4 ; m2 c
+ ; m3 d
+%endmacro
+
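+; transpose the 4x4 int16 matrix held in the bottom halves of m0-m3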
+%macro TRANSPOSE_4X4 0
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ mova m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+ mova m3, m0
+ punpcklwd m0, m1
+ punpckhwd m3, m1
+ mova m2, m0
+ punpcklwd m0, m3
+ punpckhwd m2, m3
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero
+ movd m%3, [outputq]
+ movd m%4, [outputq + strideq]
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+ paddw m%1, m%3
+ paddw m%2, m%4
+ packuswb m%1, m%5
+ packuswb m%2, m%5
+ movd [outputq], m%1
+ movd [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+ LOAD_TRAN_LOW 0, inputq, 0
+ LOAD_TRAN_LOW 1, inputq, 8
+ psraw m0, 2
+ psraw m1, 2
+
+ TRANSPOSE_4X4_WIDE
+ REORDER_INPUTS
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ REORDER_INPUTS
+ TRANSFORM_COLS
+
+ pxor m4, m4
+ ADD_STORE_4P_2X 0, 1, 5, 6, 4
+ lea outputq, [outputq + 2 * strideq]
+ ADD_STORE_4P_2X 2, 3, 5, 6, 4
+
+ RET
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c
new file mode 100644
index 0000000000..a58fb65539
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c
@@ -0,0 +1,913 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));
+ const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));
+
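+  // Each qXpX register holds the 8 pixels of row pX in its low 64 bits and
+  // the 8 pixels of row qX in its high 64 bits, so both sides of the edge
+  // are filtered with a single 128-bit operation.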
+ q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
+ q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
+ q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
+ q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8((int8_t)0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+ /* Filter1 >> 3 */
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
+ filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ flat = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
+
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
+ _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));
+
+ q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
+ _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
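+      // The wide (flat2) filter output for each pixel is a weighted sum of
+      // the surrounding samples with total weight 16, i.e.
+      // ROUND_POWER_OF_TWO(sum, 4); the narrow (flat) filter uses sums of
+      // total weight 8 shifted by 3. p- and q-side results are computed
+      // together and packed back to 8 bits at the end.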
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
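+    // Shuffle control 68 (0b01000100) replicates the low 64 bits of each mask,
+    // so flat/flat2 select on both the p half and the q half of each register.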
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
+ _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
+ _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
+ _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
+ _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
+ _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
+ _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
+ _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
+ }
+}
+
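+// pshufb control used with _mm256_shuffle_epi8: byte indices interleaved with
+// 0x80 (which makes pshufb write zero) widen bytes 0-7 in the low lane and
+// bytes 8-15 in the high lane to 16 bits, so a row duplicated into both lanes
+// becomes sixteen 16-bit pixels.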
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+ 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+ 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
+
+void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
+ p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));
+ const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));
+
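+  // Each row is loaded with _mm256_broadcast_pd, duplicating its 16 bytes into
+  // both lanes: the low lane is used directly as 8-bit data, and the duplicate
+  // is widened per lane via filt_loopfilter_avx2 for the flat filters below.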
+ p256_4 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 5 * pitch)));
+ p256_3 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 4 * pitch)));
+ p256_2 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 3 * pitch)));
+ p256_1 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 2 * pitch)));
+ p256_0 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 1 * pitch)));
+ q256_0 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 0 * pitch)));
+ q256_1 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 1 * pitch)));
+ q256_2 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 2 * pitch)));
+ q256_3 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 3 * pitch)));
+ q256_4 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 4 * pitch)));
+
+ p4 = _mm256_castsi256_si128(p256_4);
+ p3 = _mm256_castsi256_si128(p256_3);
+ p2 = _mm256_castsi256_si128(p256_2);
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+ q3 = _mm256_castsi256_si128(q256_3);
+ q4 = _mm256_castsi256_si128(q256_4);
+
+ {
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+ __m128i work;
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+ flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
+ flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ /* Filter1 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ /* Filter2 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ p256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 6 * pitch)));
+ q256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 5 * pitch)));
+ p5 = _mm256_castsi256_si128(p256_5);
+ q5 = _mm256_castsi256_si128(q256_5);
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+ _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ p256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 7 * pitch)));
+ q256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 6 * pitch)));
+ p6 = _mm256_castsi256_si128(p256_6);
+ q6 = _mm256_castsi256_si128(q256_6);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+ _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+
+ p256_7 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 8 * pitch)));
+ q256_7 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 7 * pitch)));
+ p7 = _mm256_castsi256_si128(p256_7);
+ q7 = _mm256_castsi256_si128(q256_7);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+ _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
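+      // Widen every duplicated row to 16 bits per lane; the largest 15-tap sum
+      // plus the rounding constant stays well inside the 16-bit range.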
+ p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+ p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+ p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+ p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+ q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+ q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+ q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+ q256_7 = _mm256_shuffle_epi8(q256_7, filter);
+
+ pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+ _mm256_add_epi16(p256_4, p256_3));
+ pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+ _mm256_add_epi16(q256_4, q256_3));
+
+ pixetFilter_p2p1p0 =
+ _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 =
+ _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+
+ pixelFilter_p = _mm256_add_epi16(
+ eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(
+ four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);
+
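+      // packus packs each lane separately; permute4x64 with 0xA8 (168) gathers
+      // qwords 0 and 2 into the low 128 bits, which castsi256_si128 then
+      // extracts as the 16 result bytes.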
+ flat2_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);
+
+ flat2_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(p256_3, p256_0)),
+ 3);
+
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(q256_3, q256_0)),
+ 3);
+
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+ flat2_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+ flat2_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_1)),
+ 3);
+
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_1)),
+ 3);
+
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+ flat2_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+ flat2_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_2)),
+ 3);
+
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_2)),
+ 3);
+
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+ flat2_p3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+ flat2_q3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+ flat2_p4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+ flat2_q4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+ flat2_p5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+ flat2_q5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+ flat2_p6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+ flat2_q6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+
+ p6 = _mm_andnot_si128(flat2, p6);
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ p6 = _mm_or_si128(flat2_p6, p6);
+ _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ p5 = _mm_or_si128(flat2_p5, p5);
+ _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ p4 = _mm_or_si128(flat2_p4, p4);
+ _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ p3 = _mm_or_si128(flat2_p3, p3);
+ _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ p2 = _mm_or_si128(flat2_p2, p2);
+ _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ p1 = _mm_or_si128(flat2_p1, p1);
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ p0 = _mm_or_si128(flat2_p0, p0);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+
+ q0 = _mm_andnot_si128(flat2, q0);
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ q0 = _mm_or_si128(flat2_q0, q0);
+ _mm_storeu_si128((__m128i *)(s - 0 * pitch), q0);
+
+ q1 = _mm_andnot_si128(flat2, q1);
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ q1 = _mm_or_si128(flat2_q1, q1);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
+
+ q2 = _mm_andnot_si128(flat2, q2);
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ q2 = _mm_or_si128(flat2_q2, q2);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
+
+ q3 = _mm_andnot_si128(flat2, q3);
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ q3 = _mm_or_si128(flat2_q3, q3);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
+
+ q4 = _mm_andnot_si128(flat2, q4);
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ q4 = _mm_or_si128(flat2_q4, q4);
+ _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
+
+ q5 = _mm_andnot_si128(flat2, q5);
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ q5 = _mm_or_si128(flat2_q5, q5);
+ _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
+
+ q6 = _mm_andnot_si128(flat2, q6);
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ q6 = _mm_or_si128(flat2_q6, q6);
+ _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c
new file mode 100644
index 0000000000..6ea34cdd16
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c
@@ -0,0 +1,1779 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
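+// Per-byte |a - b| computed with saturating subtraction in both directions.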
+static INLINE __m128i abs_diff(__m128i a, __m128i b) {
+ return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
+// filter_mask and hev_mask
+#define FILTER_HEV_MASK \
+ do { \
+    /* (abs(q1 - q0), abs(p1 - p0)) */                                      \
+ __m128i flat = abs_diff(q1p1, q0p0); \
+ /* abs(p1 - q1), abs(p0 - q0) */ \
+ const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
+ __m128i abs_p0q0, abs_p1q1, work; \
+ \
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
+ hev = \
+ _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
+ hev = _mm_cmpgt_epi16(hev, thresh_v); \
+ hev = _mm_packs_epi16(hev, hev); \
+ \
+ /* const int8_t mask = filter_mask(*limit, *blimit, */ \
+ /* p3, p2, p1, p0, q0, q1, q2, q3); */ \
+ abs_p0q0 = \
+ _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \
+ abs_p1q1 = \
+ _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
+ /* abs(p3 - p2), abs(p2 - p1) */ \
+ work = abs_diff(p3p2, p2p1); \
+ flat = _mm_max_epu8(work, flat); \
+ /* abs(q3 - q2), abs(q2 - q1) */ \
+ work = abs_diff(q3q2, q2q1); \
+ flat = _mm_max_epu8(work, flat); \
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
+ mask = _mm_unpacklo_epi64(mask, flat); \
+ mask = _mm_subs_epu8(mask, limit_v); \
+ mask = _mm_cmpeq_epi8(mask, zero); \
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
+ } while (0)
+
+#define FILTER4 \
+ do { \
+ const __m128i t3t4 = \
+ _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80); \
+ __m128i filter, filter2filter1, work; \
+ \
+ ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \
+ qs1qs0 = _mm_xor_si128(q1q0, t80); \
+ \
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \
+ work = _mm_subs_epi8(ps1ps0, qs1qs0); \
+ filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \
+ filter = _mm_subs_epi8(filter, work); \
+ filter = _mm_subs_epi8(filter, work); \
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \
+ filter = _mm_and_si128(filter, mask); /* & mask */ \
+ filter = _mm_unpacklo_epi64(filter, filter); \
+ \
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \
+ filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \
+ filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \
+ filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \
+ \
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \
+ filter = _mm_unpacklo_epi8(filter, filter); \
+ filter = _mm_srai_epi16(filter, 9); /* round */ \
+ filter = _mm_packs_epi16(filter, filter); \
+ filter = _mm_andnot_si128(hev, filter); \
+ \
+ hev = _mm_unpackhi_epi64(filter2filter1, filter); \
+ filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \
+ \
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
+ qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
+ ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \
+ qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \
+ ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \
+ } while (0)
+
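+// 4-tap filter over 8 columns; rows are packed two per register, e.g. q1p1
+// holds the p1 row in its low 64 bits and the q1 row in its high 64 bits.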
+void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i limit_v =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
+ _mm_loadl_epi64((const __m128i *)limit));
+ const __m128i thresh_v =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+ __m128i mask, hev;
+
+ p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s - 4 * pitch)));
+ q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
+ q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 0 * pitch)));
+ q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+ q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+ q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+
+ FILTER_HEV_MASK;
+ FILTER4;
+
+ _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0)); // *op1
+ _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0); // *op0
+ _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0); // *oq0
+ _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0)); // *oq1
+}
+
+void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i limit_v =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
+ _mm_loadl_epi64((const __m128i *)limit));
+ const __m128i thresh_v =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i x0, x1, x2, x3;
+ __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+ __m128i mask, hev;
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4)));
+
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4)));
+
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4)));
+
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4)));
+
+ // Transpose 8x8
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ p1p0 = _mm_unpacklo_epi16(q1q0, x1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ x0 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ p3p2 = _mm_unpacklo_epi32(p1p0, x0);
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ p1p0 = _mm_unpackhi_epi32(p1p0, x0);
+ p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high
+ p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high
+
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ q1q0 = _mm_unpackhi_epi16(q1q0, x1);
+ // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ x2 = _mm_unpackhi_epi16(x2, x3);
+ // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ q3q2 = _mm_unpackhi_epi32(q1q0, x2);
+ // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ q1q0 = _mm_unpacklo_epi32(q1q0, x2);
+
+ q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
+ q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+ q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+
+ FILTER_HEV_MASK;
+ FILTER4;
+
+ // Transpose 8x4 to 4x8
+  // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
+ // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37
+ x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
+ // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27
+ ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
+
+ storeu_int32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+ storeu_int32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+ storeu_int32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+ storeu_int32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+
+ storeu_int32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+ storeu_int32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+ storeu_int32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+ storeu_int32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+}
+
+void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
+ __m128i mask, hev, flat, flat2;
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
+ q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
+ q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
+ q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
+ q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
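+  // q0p0/q1p1 keep the p row in the low half and the q row in the high half;
+  // shuffle control 78 (0b01001110) swaps the halves to form p0q0/p1q1.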
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8((int8_t)0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = abs_diff(q0p0, p0q0);
+ abs_p1q1 = abs_diff(q1p1, p1q1);
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+    // ps0 += Filter2 (low half), qs0 -= Filter1 (high half); both already >> 3
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
+ filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
+ flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
+
+ q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
+ work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
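+      // Widen the p/q rows to 16 bits; the largest 15-tap sum plus rounding
+      // fits comfortably in 16-bit arithmetic.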
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
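+      // Wide (flat2) output for p0/q0: (p7 + 2*p0 + p1..p6 + q0..q6 + 8) >> 4
+      // and its mirrored q-side counterpart, packed back to 8 bits below.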
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
+ _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
+ _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
+ _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
+ _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
+ _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
+ _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
+ _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
+ }
+}
+
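+// Slide the running filter sum: add the two taps entering the window and
+// subtract the two that leave it.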
+static INLINE __m128i filter_add2_sub2(const __m128i *const total,
+ const __m128i *const a1,
+ const __m128i *const a2,
+ const __m128i *const s1,
+ const __m128i *const s2) {
+ __m128i x = _mm_add_epi16(*a1, *total);
+ x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
+ return x;
+}
+
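+// filter8_mask/filter16_mask blend the 7-tap (>>3) or 15-tap (>>4) smoothed
+// value with the previously filtered pixel, selected per byte by flat/flat2.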
+static INLINE __m128i filter8_mask(const __m128i *const flat,
+ const __m128i *const other_filt,
+ const __m128i *const f8_lo,
+ const __m128i *const f8_hi) {
+ const __m128i f8 =
+ _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
+ const __m128i result = _mm_and_si128(*flat, f8);
+ return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
+static INLINE __m128i filter16_mask(const __m128i *const flat,
+ const __m128i *const other_filt,
+ const __m128i *const f_lo,
+ const __m128i *const f_hi) {
+ const __m128i f =
+ _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
+ const __m128i result = _mm_and_si128(*flat, f);
+ return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
+void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
+ __m128i mask, hev, flat, flat2;
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+
+ __m128i op2, op1, op0, oq0, oq1, oq2;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch));
+ p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch));
+ p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch));
+ p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch));
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+ q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+ q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+ q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ __m128i work;
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+ flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+ work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // Filter1 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ // filt >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+ // loopfilter done
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter8
+ {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+ const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+ const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+ __m128i f8_lo, f8_hi;
+
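+      // 7-tap sum for op2: 3*p3 + 2*p2 + p1 + p0 + q0 + 4; later outputs reuse
+      // it through filter_add2_sub2.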
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+ _mm_add_epi16(p3_lo, p2_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+ _mm_add_epi16(p3_hi, p2_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+ op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+ oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
+ const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
+ const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
+ const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
+ const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+ const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+ const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
+ const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
+ const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
+ const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
+
+ const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
+ const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
+ const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
+ const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
+ const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+ const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
+ const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
+ const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
+ const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
+
+ __m128i f_lo;
+ __m128i f_hi;
+
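+ // Running 15-tap sum: seed with 7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 +
+ // p0 + q0 + 8 (the p6 output numerator), then slide the window with
+ // filter_add2_sub2() for each subsequent output.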
+ f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7
+ f_lo =
+ _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
+ f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
+ f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
+
+ f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7
+ f_hi =
+ _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
+ f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
+ f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
+
+ p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
+
+ f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
+ p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
+
+ f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
+ p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
+
+ f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
+ p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
+
+ f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
+ op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2);
+
+ f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
+ op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
+ op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
+ oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
+ oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
+ oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
+ q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
+ q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
+ q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
+
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
+ q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ }
+}
+
+void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
+ __m128i mask, hev, flat;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
+
+ q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
+ q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 2 * pitch)));
+ q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
+ q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s - 0 * pitch)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
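+ // Each register pairs a p row (low 8 bytes) with the mirrored q row (high
+ // 8 bytes); shuffle constant 78 (0b01001110) swaps the two 64-bit halves to
+ // form the p1q1/p0q0 views.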
+
+ {
+ // filter_mask and hev_mask
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+ abs_p0q0 = abs_diff(q0p0, p0q0);
+ abs_p1q1 = abs_diff(q1p1, p1q1);
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // flat_mask4
+
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+
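+ // filter8: build the six output sums in 16-bit precision (eight taps plus
+ // the rounding constant, shifted right by 3) and stash the packed results
+ // in the flat_op*/flat_oq* scratch rows; they are blended in under the
+ // 'flat' mask in the lp filter section below.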
+ {
+ const __m128i four = _mm_set1_epi16(4);
+ unsigned char *src = s;
+ {
+ __m128i workp_a, workp_b, workp_shft;
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
+ zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
+ zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
+ zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
+ zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
+ zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
+ zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
+ zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
+ zero);
+
+ workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op2[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op1[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op0[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq0[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq1[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq2[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+ }
+ }
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i ps1 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80);
+ const __m128i ps0 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80);
+ const __m128i qs0 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80);
+ const __m128i qs1 =
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
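+ // Emulate an arithmetic >> 3 of the signed bytes: interleave each byte into
+ // the high half of a 16-bit lane, shift right arithmetically by 11 (8 + 3),
+ // then pack back down to bytes.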
+ // Filter1 >> 3
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 11);
+ filter1 = _mm_packs_epi16(filter1, filter1);
+
+ // Filter2 >> 3
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 11);
+ filter2 = _mm_packs_epi16(filter2, zero);
+
+ // filt = (filter1 + 1) >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ filt = _mm_unpacklo_epi8(zero, filt);
+ filt = _mm_srai_epi16(filt, 9);
+ filt = _mm_packs_epi16(filt, zero);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ p0 = _mm_loadl_epi64((__m128i *)flat_op0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ p1 = _mm_loadl_epi64((__m128i *)flat_op1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p2 = _mm_loadl_epi64((__m128i *)flat_op2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+
+ _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2);
+ _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1);
+ _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1);
+ _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2);
+ }
+}
+
+void vpx_lpf_horizontal_8_dual_sse2(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
+ _mm_load_si128((const __m128i *)blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
+ _mm_load_si128((const __m128i *)limit1));
+ const __m128i thresh =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
+ _mm_load_si128((const __m128i *)thresh1));
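+ // Two adjacent 8-pixel edges are filtered in one pass: block 0's parameters
+ // occupy the low 64 bits and block 1's the high 64 bits of each register.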
+
+ __m128i mask, hev, flat;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ {
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+ __m128i work;
+
+ // filter_mask and hev_mask
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // flat_mask4
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+ {
+ const __m128i four = _mm_set1_epi16(4);
+ unsigned char *src = s;
+ int i = 0;
+
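+ // Process each 8-pixel half in 16-bit precision and write its packed
+ // filter8 outputs into the matching half of the flat_* scratch buffers.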
+ do {
+ __m128i workp_a, workp_b, workp_shft;
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
+ zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
+ zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
+ zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
+ zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
+ zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
+ zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
+ zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
+ zero);
+
+ workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ src += 8;
+ } while (++i < 2);
+ }
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ const __m128i ps1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
+ const __m128i ps0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
+ const __m128i qs0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
+ const __m128i qs1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
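+ // Emulate a signed byte >> 3 without an 8-bit shift: do a 16-bit logical
+ // shift, clear the bits that crossed byte boundaries (t1f), and restore
+ // the sign bits (te0) for lanes that were negative.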
+ // Filter1 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+
+ // filt = (filter1 + 1) >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ q0 = _mm_load_si128((__m128i *)flat_oq0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ q1 = _mm_load_si128((__m128i *)flat_oq1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q2 = _mm_load_si128((__m128i *)flat_oq2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ p0 = _mm_load_si128((__m128i *)flat_op0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ p1 = _mm_load_si128((__m128i *)flat_op1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p2 = _mm_load_si128((__m128i *)flat_op2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+
+ _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
+ }
+}
+
+void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit0,
+ const unsigned char *limit0,
+ const unsigned char *thresh0,
+ const unsigned char *blimit1,
+ const unsigned char *limit1,
+ const unsigned char *thresh1) {
+ const __m128i blimit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
+ _mm_load_si128((const __m128i *)blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
+ _mm_load_si128((const __m128i *)limit1));
+ const __m128i thresh =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
+ _mm_load_si128((const __m128i *)thresh1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ __m128i mask, hev, flat;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+
+ // filter_mask and hev_mask
+ {
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+ __m128i work;
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ const __m128i ps1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
+ const __m128i ps0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
+ const __m128i qs0 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
+ const __m128i qs1 =
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (vpx_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // Filter1 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+
+ // filt = (filter1 + 1) >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
+ }
+}
+
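+// Transpose a 16x8 block of bytes: rows 0-7 come from in0 and rows 8-15 from
+// in1 (each 8 bytes wide); the result is written as 8 rows of 16 bytes at out.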
+static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
+ int in_p, unsigned char *out, int out_p) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i x8, x9, x10, x11, x12, x13, x14, x15;
+
+ // 2-way interleave w/hoisting of unpacks
+ x0 = _mm_loadl_epi64((__m128i *)in0); // 1
+ x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3
+ x0 = _mm_unpacklo_epi8(x0, x1); // 1
+
+ x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5
+ x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); // 7
+ x1 = _mm_unpacklo_epi8(x2, x3); // 2
+
+ x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); // 9
+ x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); // 11
+ x2 = _mm_unpacklo_epi8(x4, x5); // 3
+
+ x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); // 13
+ x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); // 15
+ x3 = _mm_unpacklo_epi8(x6, x7); // 4
+ x4 = _mm_unpacklo_epi16(x0, x1); // 9
+
+ x8 = _mm_loadl_epi64((__m128i *)in1); // 2
+ x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4
+ x8 = _mm_unpacklo_epi8(x8, x9); // 5
+ x5 = _mm_unpacklo_epi16(x2, x3); // 10
+
+ x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6
+ x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); // 8
+ x9 = _mm_unpacklo_epi8(x10, x11); // 6
+
+ x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); // 10
+ x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); // 12
+ x10 = _mm_unpacklo_epi8(x12, x13); // 7
+ x12 = _mm_unpacklo_epi16(x8, x9); // 11
+
+ x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); // 14
+ x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); // 16
+ x11 = _mm_unpacklo_epi8(x14, x15); // 8
+ x13 = _mm_unpacklo_epi16(x10, x11); // 12
+
+ x6 = _mm_unpacklo_epi32(x4, x5); // 13
+ x7 = _mm_unpackhi_epi32(x4, x5); // 14
+ x14 = _mm_unpacklo_epi32(x12, x13); // 15
+ x15 = _mm_unpackhi_epi32(x12, x13); // 16
+
+ // Store first 4-line result
+ _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
+ _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ x12 = _mm_unpackhi_epi16(x8, x9);
+ x13 = _mm_unpackhi_epi16(x10, x11);
+
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ x14 = _mm_unpacklo_epi32(x12, x13);
+ x15 = _mm_unpackhi_epi32(x12, x13);
+
+ // Store second 4-line result
+ _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
+ _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+}
+
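+// Transpose num_8x8_to_transpose independent 8x8 byte blocks; src[i] and
+// dst[i] give the top-left corner of each block.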
+static INLINE void transpose(unsigned char *src[], int in_p,
+ unsigned char *dst[], int out_p,
+ int num_8x8_to_transpose) {
+ int idx8x8 = 0;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ do {
+ unsigned char *in = src[idx8x8];
+ unsigned char *out = dst[idx8x8];
+
+ x0 =
+ _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
+ x1 =
+ _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ x0 = _mm_unpacklo_epi8(x0, x1);
+
+ x2 =
+ _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
+ x3 =
+ _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ x1 = _mm_unpacklo_epi8(x2, x3);
+
+ x4 =
+ _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
+ x5 =
+ _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ x2 = _mm_unpacklo_epi8(x4, x5);
+
+ x6 =
+ _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
+ x7 =
+ _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ x3 = _mm_unpacklo_epi8(x6, x7);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ x4 = _mm_unpacklo_epi16(x0, x1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ x5 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70
+ mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72
+ mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73
+
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74
+ mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi32(x4, x5);
+
+ mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76
+ mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77
+ } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+ unsigned char *src[2];
+ unsigned char *dst[2];
+
+ // Transpose 8x16
+ transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ vpx_lpf_horizontal_4_dual(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+ dst[0] = s - 4;
+ dst[1] = s - 4 + pitch * 8;
+
+ // Transpose back
+ transpose(src, 16, dst, pitch, 2);
+}
+
+void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
+ unsigned char *src[1];
+ unsigned char *dst[1];
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ transpose(src, pitch, dst, 8, 1);
+
+ // Loop filtering
+ vpx_lpf_horizontal_8(t_dst + 4 * 8, 8, blimit, limit, thresh);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ transpose(src, 8, dst, pitch, 1);
+}
+
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+ unsigned char *src[2];
+ unsigned char *dst[2];
+
+ // Transpose 8x16
+ transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ vpx_lpf_horizontal_8_dual(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+
+ dst[0] = s - 4;
+ dst[1] = s - 4 + pitch * 8;
+
+ // Transpose back
+ transpose(src, 16, dst, pitch, 2);
+}
+
+void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
+ DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
+ unsigned char *src[2];
+ unsigned char *dst[2];
+
+ src[0] = s - 8;
+ src[1] = s;
+ dst[0] = t_dst;
+ dst[1] = t_dst + 8 * 8;
+
+ // Transpose 16x8
+ transpose(src, pitch, dst, 8, 2);
+
+ // Loop filtering
+ vpx_lpf_horizontal_16(t_dst + 8 * 8, 8, blimit, limit, thresh);
+
+ src[0] = t_dst;
+ src[1] = t_dst + 8 * 8;
+ dst[0] = s - 8;
+ dst[1] = s;
+
+ // Transpose back
+ transpose(src, 8, dst, pitch, 2);
+}
+
+void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
+
+ // Transpose 16x16
+ transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+ transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
+
+ // Loop filtering
+ vpx_lpf_horizontal_16_dual(t_dst + 8 * 16, 16, blimit, limit, thresh);
+
+ // Transpose back
+ transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+ transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h
new file mode 100644
index 0000000000..031f361a41
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_MEM_SSE2_H_
+#define VPX_VPX_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+#include <string.h>
+
+#include "./vpx_config.h"
+
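+// The scalar helpers below go through memcpy so that unaligned, type-punned
+// accesses stay well defined; compilers typically lower these fixed-size
+// copies to single mov instructions.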
+static INLINE void storeu_int32(void *dst, int32_t v) {
+ memcpy(dst, &v, sizeof(v));
+}
+
+static INLINE int32_t loadu_int32(const void *src) {
+ int32_t v;
+ memcpy(&v, src, sizeof(v));
+ return v;
+}
+
+static INLINE __m128i load_unaligned_u32(const void *a) {
+ int val;
+ memcpy(&val, a, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+static INLINE void store_unaligned_u32(void *const a, const __m128i v) {
+ const int val = _mm_cvtsi128_si32(v);
+ memcpy(a, &val, sizeof(val));
+}
+
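+// Copy the low / high 8 bytes of an __m128i to an unaligned destination.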
+#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
+#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)
+
+static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) {
+ return _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+
+static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
+ d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
+ d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
+ d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ load_8bit_4x4(s + 0 * stride, stride, &d[0]);
+ load_8bit_4x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ load_8bit_8x4(s + 0 * stride, stride, &d[0]);
+ load_8bit_8x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_16x8(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
+ d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
+ d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
+ d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
+ d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
+}
+
+static INLINE void loadu_8bit_16x4(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void loadu_8bit_16x8(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
+ loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
+ _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
+}
+
+static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
+ *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
+ *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
+ *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
+}
+
+static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ __m128i ss[4];
+
+ ss[0] = s;
+ ss[1] = _mm_srli_si128(s, 4);
+ ss[2] = _mm_srli_si128(s, 8);
+ ss[3] = _mm_srli_si128(s, 12);
+ store_8bit_4x4(ss, d, stride);
+}
+
+static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
+ uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
+ _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
+ _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
+}
+
+static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
+ _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
+ _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
+ _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
+ _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
+ _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
+ _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
+}
+
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
+ _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
+ _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
+}
+
+#endif // VPX_VPX_DSP_X86_MEM_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c
new file mode 100644
index 0000000000..119fa7cd1a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+extern const int16_t vpx_rv[];
+
+void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
+ int cols, int flimit) {
+ int col;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i f = _mm_set1_epi32(flimit);
+ DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);
+
+ // 8 columns are processed at a time.
+ // If rows is less than 8 the bottom border extension fails.
+ assert(cols % 8 == 0);
+ assert(rows >= 8);
+
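+ // For each group of 8 columns, keep a running 16-bit sum and 32-bit sum of
+ // squares over a sliding vertical window. A pixel is replaced with the
+ // dithered average (sum + this_row + vpx_rv[row & 127]) >> 4 only where the
+ // activity measure 15 * sumsq - sum * sum stays below flimit. above_context
+ // is a ring buffer of the rows about to leave the window, and rows past the
+ // bottom are clamped by reusing the last loaded below_context.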
+ for (col = 0; col < cols; col += 8) {
+ int row, i;
+ __m128i s = _mm_loadl_epi64((__m128i *)dst);
+ __m128i sum, sumsq_0, sumsq_1;
+ __m128i tmp_0, tmp_1;
+ __m128i below_context = _mm_setzero_si128();
+
+ s = _mm_unpacklo_epi8(s, zero);
+
+ for (i = 0; i < 8; ++i) {
+ _mm_store_si128((__m128i *)above_context + i, s);
+ }
+
+ // sum = s * 9
+ sum = _mm_slli_epi16(s, 3);
+ sum = _mm_add_epi16(s, sum);
+
+ // sumsq = 9 * s^2 == (s * 9) * s
+ tmp_0 = _mm_mullo_epi16(sum, s);
+ tmp_1 = _mm_mulhi_epi16(sum, s);
+
+ sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
+ sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);
+
+ // Prime sum/sumsq
+ for (i = 1; i <= 6; ++i) {
+ __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
+ a = _mm_unpacklo_epi8(a, zero);
+ sum = _mm_add_epi16(sum, a);
+ a = _mm_mullo_epi16(a, a);
+ sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
+ sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
+ }
+
+ for (row = 0; row < rows + 8; row++) {
+ const __m128i above =
+ _mm_load_si128((__m128i *)above_context + (row & 7));
+ __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch));
+ __m128i above_sq, below_sq;
+ __m128i mask_0, mask_1;
+ __m128i multmp_0, multmp_1;
+ __m128i rv;
+ __m128i out;
+
+ this_row = _mm_unpacklo_epi8(this_row, zero);
+
+ if (row + 7 < rows) {
+ // Instead of copying the end context we just stop loading when we get
+ // to the last one.
+ below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
+ below_context = _mm_unpacklo_epi8(below_context, zero);
+ }
+
+ sum = _mm_sub_epi16(sum, above);
+ sum = _mm_add_epi16(sum, below_context);
+
+ // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
+ // extend. Unfortunately we can't do below_sq - above_sq in 16 bits
+ // because x86 does not have unpack with sign extension.
+ above_sq = _mm_mullo_epi16(above, above);
+ sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
+ sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));
+
+ below_sq = _mm_mullo_epi16(below_context, below_context);
+ sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
+ sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));
+
+ // sumsq * 16 - sumsq == sumsq * 15
+ mask_0 = _mm_slli_epi32(sumsq_0, 4);
+ mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
+ mask_1 = _mm_slli_epi32(sumsq_1, 4);
+ mask_1 = _mm_sub_epi32(mask_1, sumsq_1);
+
+ multmp_0 = _mm_mullo_epi16(sum, sum);
+ multmp_1 = _mm_mulhi_epi16(sum, sum);
+
+ mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
+ mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));
+
+ // mask - f gives a negative value when mask < f
+ mask_0 = _mm_sub_epi32(mask_0, f);
+ mask_1 = _mm_sub_epi32(mask_1, f);
+
+ // Shift the sign bit down to create a mask
+ mask_0 = _mm_srai_epi32(mask_0, 31);
+ mask_1 = _mm_srai_epi32(mask_1, 31);
+
+ mask_0 = _mm_packs_epi32(mask_0, mask_1);
+
+ rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));
+
+ mask_1 = _mm_add_epi16(rv, sum);
+ mask_1 = _mm_add_epi16(mask_1, this_row);
+ mask_1 = _mm_srai_epi16(mask_1, 4);
+
+ mask_1 = _mm_and_si128(mask_0, mask_1);
+ mask_0 = _mm_andnot_si128(mask_0, this_row);
+ out = _mm_or_si128(mask_1, mask_0);
+
+ _mm_storel_epi64((__m128i *)(dst + row * pitch),
+ _mm_packus_epi16(out, zero));
+
+ _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row);
+ }
+
+ dst += 8;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c
new file mode 100644
index 0000000000..6837a5cf28
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m256i big_zero = _mm256_setzero_si256();
+ int index;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ (void)scan;
+
+ *eob_ptr = 0;
+
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
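+ // The zbin/round/quant/shift/dequant tables store the DC parameter in lane
+ // 0 and the AC parameter in the remaining lanes; the DC lane is consumed by
+ // the first group of 16, after which _mm_unpackhi_epi64() switches each
+ // register to all-AC values.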
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
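+ // If no coefficient in this group exceeds its zbin threshold, zero the
+ // whole group and skip the quantize/dequantize work.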
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (n_coeffs == 16) return;
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < n_coeffs; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ continue;
+ }
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
+
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *scan_order) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m256i big_zero = _mm256_setzero_si256();
+ int index;
+ const int16_t *iscan = scan_order->iscan;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+ &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC.
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < 32 * 32; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ continue;
+ }
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
+ dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c
new file mode 100644
index 0000000000..3d97b3fdae
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
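+// Load the 16-bit quantizer tables (DC parameter in element 0, AC in the
+// rest). The 0x54 permute keeps the loaded 128 bits in the low half and fills
+// the high half with AC-only copies of the upper 64 bits, which is intended
+// to let the caller drop the DC lane after the first group of 16.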
+static VPX_FORCE_INLINE void load_b_values_avx2(
+ const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
+ __m256i *round, const int16_t *quant_ptr, __m256i *quant,
+ const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr,
+ __m256i *shift, int log_scale) {
+ *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
+ *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *zbin = _mm256_add_epi16(*zbin, rnd);
+ *zbin = _mm256_srai_epi16(*zbin, log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+ // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16)
+ *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+
+ *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round = _mm256_permute4x64_epi64(*round, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *round = _mm256_add_epi16(*round, rnd);
+ *round = _mm256_srai_epi16(*round, log_scale);
+ }
+
+ *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+ *dequant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+ *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+ *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+ *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static VPX_FORCE_INLINE __m256i
+load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // typedef int32_t tran_low_t;
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8));
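+  // Note: _mm256_packs_epi32() packs within 128-bit lanes, so the 16-bit
+  // coefficients are ordered [0-3, 8-11, 4-7, 12-15]; get_max_lane_eob()
+  // applies the matching permutation to iscan.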
+ return _mm256_packs_epi32(coeff1, coeff2);
+#else
+ // typedef int16_t tran_low_t;
+ return _mm256_loadu_si256((const __m256i *)coeff_ptr);
+#endif
+}
+
+static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals,
+ tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // typedef int32_t tran_low_t;
+ __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+ __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+ __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+ _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo);
+ _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+#else
+ // typedef int16_t tran_low_t;
+ _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals);
+#endif
+}
+
+static VPX_FORCE_INLINE __m256i
+quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+ __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return _mm256_setzero_si256();
+ }
+ {
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
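+    // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+    //                quant_shift_ptr[rc != 0]) >> 16);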
+ const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift);
+ const __m256i v_nz_mask =
+ _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+ const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant);
+
+ const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high);
+ const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high);
+#else
+ const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+#endif
+
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+ store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
+#endif
+ return v_nz_mask;
+ }
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+ __m256i v_eobmax,
+ __m256i v_mask) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i v_iscan = _mm256_permute4x64_epi64(
+ _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
+#else
+ const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+#endif
+ const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
+ return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob256);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
+ __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
+ __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
+
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask;
+ __m256i v_eobmax = _mm256_setzero_si256();
+ intptr_t count;
+ (void)scan;
+
+ load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
+ &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
+ &v_quant_shift, 0);
+ // Do DC and first 15 AC.
+ v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+
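+  // Broadcast the AC values into every 64-bit group so the remaining blocks
+  // of 16 coefficients all use the AC quantizer parameters.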
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (count = n_coeffs - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
+
+static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant,
+ __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin,
+ __m256i *v_quant_shift, __m256i *v_eobmax) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif
+ return *v_eobmax;
+ }
+ {
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+ // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ // quant_shift_ptr[rc != 0]) >> 15);
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
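+    // _mm256_mulhi_epi16() computes (a * b) >> 16; shift that result left by
+    // one and OR in bit 15 of the low product to form the >> 15 above.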
+ const __m256i v_tmp32_hi =
+ _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1);
+ const __m256i v_tmp32_lo =
+ _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15);
+ const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo);
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+ const __m256i v_sign_lo =
+ _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff);
+ const __m256i v_sign_hi =
+ _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff);
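+    // dqcoeff = sign(coeff) * ((abs(qcoeff) * dequant) >> 1); shifting the
+    // magnitude before reapplying the sign matches the truncating divide by
+    // two in the 32x32 C path.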
+ const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant);
+ const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant);
+ const __m256i v_dqcoeff_lo = _mm256_sign_epi32(
+ _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo);
+ const __m256i v_dqcoeff_hi = _mm256_sign_epi32(
+ _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi);
+ const __m256i v_nz_mask =
+ _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+ store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi),
+ dqcoeff_ptr);
+#endif
+
+ return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask);
+ }
+}
+
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *scan_order) {
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
+ __m256i v_eobmax = _mm256_setzero_si256();
+ intptr_t count;
+ const int16_t *iscan = scan_order->iscan;
+
+ load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round,
+ mb_plane->quant, &v_quant, dequant_ptr, &v_dequant,
+ mb_plane->quant_shift, &v_quant_shift, 1);
+
+ // Do DC and first 15 AC.
+ v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+ &v_quant, &v_dequant, &v_round, &v_zbin,
+ &v_quant_shift, &v_eobmax);
+
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (count = (32 * 32) - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+ &v_quant, &v_dequant, &v_round, &v_zbin,
+ &v_quant_shift, &v_eobmax);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c
new file mode 100644
index 0000000000..9533e7916d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+
+void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
+ (void)scan;
+
+  // Set up global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h
new file mode 100644
index 0000000000..fe42fee018
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_
+#define VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_block.h"
+
+static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
+ const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr, __m128i *dequant,
+ const int16_t *shift_ptr, __m128i *shift) {
+ *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ *shift = _mm_load_si128((const __m128i *)shift_ptr);
+}
+
+static INLINE void load_b_values32x32(
+ const struct macroblock_plane *const mb_plane, __m128i *zbin,
+ __m128i *round, __m128i *quant, const int16_t *dequant_ptr,
+ __m128i *dequant, __m128i *shift) {
+ const __m128i one = _mm_set1_epi16(1);
+  // The 32x32 quantizer halves zbin and round.
+ *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);
+ // Shift with rounding.
+ *zbin = _mm_add_epi16(*zbin, one);
+ *zbin = _mm_srli_epi16(*zbin, 1);
+ // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+ // it is a strict "greater" comparison.
+ *zbin = _mm_sub_epi16(*zbin, one);
+
+ *round = _mm_load_si128((const __m128i *)mb_plane->round);
+ *round = _mm_add_epi16(*round, one);
+ *round = _mm_srli_epi16(*round, 1);
+
+ *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
+  // Note: this left shift is not strictly safe because quant_shift can be as
+  // large as 1 << 16, and shifting it up again can overflow int16_t. The
+  // existing tests do not exercise that range, and the behavior has been this
+  // way for a long time.
+}
+
+static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr,
+ __m128i *dequant) {
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+}
+
+// With SSSE3 and later, _mm_abs_epi16() and _mm_sign_epi16() are preferred.
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
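+  // (a ^ sign) - sign negates a when sign is all ones and leaves it
+  // unchanged when sign is zero.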
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi16(a, sign);
+}
+
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
+ const __m128i quant, const __m128i shift) {
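+  // *coeff = (((((abs_coeff + round) * quant) >> 16) + (abs_coeff + round)) *
+  //           shift) >> 16, using a saturating add for the rounding step.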
+ __m128i tmp, qcoeff;
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ *coeff = _mm_mulhi_epi16(qcoeff, shift);
+}
+
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+ tran_low_t *dqcoeff) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i low = _mm_mullo_epi16(qcoeff, dequant);
+ const __m128i high = _mm_mulhi_epi16(qcoeff, dequant);
+
+ const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+ const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+ _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+#else
+ const __m128i dqcoeff16 = _mm_mullo_epi16(qcoeff, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+// Scan 16 coefficients for the end-of-block (eob) position using their
+// scan-order indices in scan.
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
+ const int16_t *scan, const int index,
+ const __m128i zero) {
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
+ __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index));
+ __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8));
+ __m128i eob0, eob1;
+ eob0 = _mm_andnot_si128(zero_coeff0, scan0);
+ eob1 = _mm_andnot_si128(zero_coeff1, scan1);
+ return _mm_max_epi16(eob0, eob1);
+}
+
+static INLINE int16_t accumulate_eob(__m128i eob) {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
+
+#endif // VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c
new file mode 100644
index 0000000000..641f23298b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
+ (void)scan;
+
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
+
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *scan_order) {
+ const __m128i zero = _mm_setzero_si128();
+ int index;
+ const int16_t *iscan = scan_order->iscan;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+ &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC.
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < 32 * 32; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ continue;
+ }
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8 + index);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h
new file mode 100644
index 0000000000..e8d2a05771
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_
+#define VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+
+static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff,
+ const __m128i dequant,
+ const __m128i zero,
+ tran_low_t *dqcoeff) {
+  // Work on the absolute value so the shift rounds toward zero, as in C.
+ const __m128i coeff = _mm_abs_epi16(qcoeff);
+
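+  // Interleaving with zero places each 16-bit qcoeff in the high half of a
+  // 32-bit lane, so its sign can be applied to the 32-bit product below with
+  // _mm_sign_epi32().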
+ const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
+ const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
+
+ const __m128i low = _mm_mullo_epi16(coeff, dequant);
+ const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+ __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+ __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+ // "Divide" by 2.
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 1);
+ dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 1);
+
+ dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
+ dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+ _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+#else
+ _mm_store_si128((__m128i *)(dqcoeff),
+ _mm_packs_epi32(dqcoeff32_0, dqcoeff32_1));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+#endif // VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c
new file mode 100644
index 0000000000..cf7111983b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h> // AVX2
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+// Note: declaring the parameter as const __m256i sums[4] makes some versions
+// of Visual Studio fail with a parameter-alignment error, though the two
+// forms should be equivalent:
+// error C2719: 'sums': formal parameter with requested alignment of 32 won't
+// be aligned
+static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+ uint32_t sad_array[4]) {
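+  // Each sums[i] holds four partial SADs, one per 64-bit element (the upper
+  // 32 bits are zero).  Two rounds of horizontal adds plus a cross-lane add
+  // reduce them to one 32-bit total per reference.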
+ const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
+ const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
+ const __m256i t2 = _mm256_hadd_epi32(t0, t1);
+ const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
+ _mm256_extractf128_si256(t2, 1));
+ _mm_storeu_si128((__m128i *)sad_array, sum);
+}
+
+static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, int h,
+ uint32_t sad_array[4]) {
+ int i;
+ const uint8_t *refs[4];
+ __m256i sums[4];
+
+ refs[0] = ref_array[0];
+ refs[1] = ref_array[1];
+ refs[2] = ref_array[2];
+ refs[3] = ref_array[3];
+ sums[0] = _mm256_setzero_si256();
+ sums[1] = _mm256_setzero_si256();
+ sums[2] = _mm256_setzero_si256();
+ sums[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < h; i++) {
+ __m256i r[4];
+
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+
+ // sum of the absolute differences between every ref[] to src
+ r[0] = _mm256_sad_epu8(r[0], s);
+ r[1] = _mm256_sad_epu8(r[1], s);
+ r[2] = _mm256_sad_epu8(r[2], s);
+ r[3] = _mm256_sad_epu8(r[3], s);
+
+ // sum every ref[]
+ sums[0] = _mm256_add_epi32(sums[0], r[0]);
+ sums[1] = _mm256_add_epi32(sums[1], r[1]);
+ sums[2] = _mm256_add_epi32(sums[2], r[2]);
+ sums[3] = _mm256_add_epi32(sums[3], r[3]);
+
+ src_ptr += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+
+ calc_final_4(sums, sad_array);
+}
+
+static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, int h,
+ uint32_t sad_array[4]) {
+ __m256i sums[4];
+ int i;
+ const uint8_t *refs[4];
+
+ refs[0] = ref_array[0];
+ refs[1] = ref_array[1];
+ refs[2] = ref_array[2];
+ refs[3] = ref_array[3];
+ sums[0] = _mm256_setzero_si256();
+ sums[1] = _mm256_setzero_si256();
+ sums[2] = _mm256_setzero_si256();
+ sums[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < h; i++) {
+ __m256i r_lo[4], r_hi[4];
+ // load 64 bytes from src and all ref[]
+ const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
+ const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));
+ r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
+ r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32));
+ r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32));
+ r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+ r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));
+
+ // sum of the absolute differences between every ref[] to src
+ r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
+ r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
+ r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
+ r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo);
+ r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi);
+ r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi);
+ r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
+ r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);
+
+ // sum every ref[]
+ sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
+ sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
+ sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
+ sums[3] = _mm256_add_epi32(sums[3], r_lo[3]);
+ sums[0] = _mm256_add_epi32(sums[0], r_hi[0]);
+ sums[1] = _mm256_add_epi32(sums[1], r_hi[1]);
+ sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
+ sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);
+
+ src_ptr += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+
+ calc_final_4(sums, sad_array);
+}
+
+#define SAD64_H(h) \
+ void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
+ }
+
+#define SAD32_H(h) \
+ void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
+ }
+
+SAD64_H(64)
+SAD32_H(32)
+
+#define SADS64_H(h) \
+ void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ ((h) >> 1), sad_array); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+#define SADS32_H(h) \
+ void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ ((h) >> 1), sad_array); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+SADS64_H(64)
+SADS64_H(32)
+
+SADS32_H(64)
+SADS32_H(32)
+SADS32_H(16)
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c
new file mode 100644
index 0000000000..cfd23fedd9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h> // AVX512
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
+ uint32_t sad_array[4]) {
+ __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+ __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ __m512i sum_mlow, sum_mhigh;
+ int i;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+ ref0 = ref_array[0];
+ ref1 = ref_array[1];
+ ref2 = ref_array[2];
+ ref3 = ref_array[3];
+ sum_ref0 = _mm512_set1_epi16(0);
+ sum_ref1 = _mm512_set1_epi16(0);
+ sum_ref2 = _mm512_set1_epi16(0);
+ sum_ref3 = _mm512_set1_epi16(0);
+ for (i = 0; i < 64; i++) {
+ // load src and all ref[]
+ src_reg = _mm512_loadu_si512((const __m512i *)src_ptr);
+ ref0_reg = _mm512_loadu_si512((const __m512i *)ref0);
+ ref1_reg = _mm512_loadu_si512((const __m512i *)ref1);
+ ref2_reg = _mm512_loadu_si512((const __m512i *)ref2);
+ ref3_reg = _mm512_loadu_si512((const __m512i *)ref3);
+ // sum of the absolute differences between every ref[] to src
+ ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg);
+ // sum every ref[]
+ sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg);
+
+ src_ptr += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+ {
+ __m256i sum256;
+ __m128i sum128;
+    // In each 64-bit element of sum_ref[] the SAD is held in the low 4 bytes
+    // and the high 4 bytes are zero, so sum_ref1 and sum_ref3 can be shifted
+    // left by 4 bytes and merged with sum_ref0 and sum_ref2.
+ sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4);
+ sum_ref3 = _mm512_bslli_epi128(sum_ref3, 4);
+
+ // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+ sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1);
+ sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3);
+
+ // merge every 64 bit from each sum_ref[]
+ sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2);
+ sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2);
+
+ // add the low 64 bit to the high 64 bit
+ sum_mlow = _mm512_add_epi32(sum_mlow, sum_mhigh);
+
+ // add the low 128 bit to the high 128 bit
+ sum256 = _mm256_add_epi32(_mm512_castsi512_si256(sum_mlow),
+ _mm512_extracti32x8_epi32(sum_mlow, 1));
+ sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256),
+ _mm256_extractf128_si256(sum256, 1));
+
+ _mm_storeu_si128((__m128i *)(sad_array), sum128);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm
new file mode 100644
index 0000000000..ed4ea3ef9b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm
@@ -0,0 +1,278 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_4x2x4 5-6 0
+ movd m0, [srcq +%2]
+%if %1 == 1
+ movd m6, [ref1q+%3]
+ movd m4, [ref2q+%3]
+ movd m7, [ref3q+%3]
+ movd m5, [ref4q+%3]
+ movd m1, [srcq +%4]
+ movd m2, [ref1q+%5]
+ punpckldq m0, m1
+ punpckldq m6, m2
+ movd m1, [ref2q+%5]
+ movd m2, [ref3q+%5]
+ movd m3, [ref4q+%5]
+ punpckldq m4, m1
+ punpckldq m7, m2
+ punpckldq m5, m3
+ movlhps m0, m0
+ movlhps m6, m4
+ movlhps m7, m5
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movd m1, [ref1q+%3]
+ movd m5, [ref1q+%5]
+ movd m2, [ref2q+%3]
+ movd m4, [ref2q+%5]
+ punpckldq m1, m5
+ punpckldq m2, m4
+ movd m3, [ref3q+%3]
+ movd m5, [ref3q+%5]
+ punpckldq m3, m5
+ movd m4, [ref4q+%3]
+ movd m5, [ref4q+%5]
+ punpckldq m4, m5
+ movd m5, [srcq +%4]
+ punpckldq m0, m5
+ movlhps m0, m0
+ movlhps m1, m2
+ movlhps m3, m4
+ psadbw m1, m0
+ psadbw m3, m0
+ paddd m6, m1
+ paddd m7, m3
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_8x2x4 5-6 0
+ movh m0, [srcq +%2]
+%if %1 == 1
+ movh m4, [ref1q+%3]
+ movh m5, [ref2q+%3]
+ movh m6, [ref3q+%3]
+ movh m7, [ref4q+%3]
+ movhps m0, [srcq +%4]
+ movhps m4, [ref1q+%5]
+ movhps m5, [ref2q+%5]
+ movhps m6, [ref3q+%5]
+ movhps m7, [ref4q+%5]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movh m1, [ref1q+%3]
+ movh m2, [ref2q+%3]
+ movh m3, [ref3q+%3]
+ movhps m0, [srcq +%4]
+ movhps m1, [ref1q+%5]
+ movhps m2, [ref2q+%5]
+ movhps m3, [ref3q+%5]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movh m1, [ref4q+%3]
+ movhps m1, [ref4q+%5]
+ paddd m5, m2
+ paddd m6, m3
+ psadbw m1, m0
+ paddd m7, m1
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_16x2x4 5-6 0
+ ; 1st 16 px
+ mova m0, [srcq +%2]
+%if %1 == 1
+ movu m4, [ref1q+%3]
+ movu m5, [ref2q+%3]
+ movu m6, [ref3q+%3]
+ movu m7, [ref4q+%3]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movu m1, [ref1q+%3]
+ movu m2, [ref2q+%3]
+ movu m3, [ref3q+%3]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movu m1, [ref4q+%3]
+ paddd m5, m2
+ paddd m6, m3
+ psadbw m1, m0
+ paddd m7, m1
+%endif
+
+ ; 2nd 16 px
+ mova m0, [srcq +%4]
+ movu m1, [ref1q+%5]
+ movu m2, [ref2q+%5]
+ movu m3, [ref3q+%5]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movu m1, [ref4q+%5]
+ paddd m5, m2
+ paddd m6, m3
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+ psadbw m1, m0
+ paddd m7, m1
+%endmacro
+
+; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_32x2x4 5-6 0
+ PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
+ PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6
+%endmacro
+
+; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_64x2x4 5-6 0
+ PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
+ PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6
+%endmacro
+
+; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8, 8x16,
+;             8x8, 8x4, 4x8 and 4x4
+%macro SADNXN4D 2-3 0
+%if %3 == 1 ; skip rows
+%if UNIX64
+cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+%else ; normal sad
+%if UNIX64
+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+%endif
+%if %3 == 1
+ lea src_strided, [2*src_strided]
+ lea ref_strided, [2*ref_strided]
+%endif
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+ PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%if %3 == 1 ; downsample number of rows by 2
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
+ PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+%undef num_rep
+ PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+
+%if %1 > 4
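+  ; Combine the two psadbw halves of each reference (m4-m7 accumulate
+  ; ref1-ref4) into four 32-bit totals before storing them to res.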
+ pslldq m5, 4
+ pslldq m7, 4
+ por m4, m5
+ por m6, m7
+ mova m5, m4
+ mova m7, m6
+ punpcklqdq m4, m6
+ punpckhqdq m5, m7
+ movifnidn r4, r4mp
+ paddd m4, m5
+%if %3 == 1
+ pslld m4, 1
+%endif
+ movu [r4], m4
+ RET
+%else
+ movifnidn r4, r4mp
+ pshufd m6, m6, 0x08
+ pshufd m7, m7, 0x08
+%if %3 == 1
+ pslld m6, 1
+ pslld m7, 1
+%endif
+ movq [r4+0], m6
+ movq [r4+8], m7
+ RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+SADNXN4D 64, 64
+SADNXN4D 64, 32
+SADNXN4D 32, 64
+SADNXN4D 32, 32
+SADNXN4D 32, 16
+SADNXN4D 16, 32
+SADNXN4D 16, 16
+SADNXN4D 16, 8
+SADNXN4D 8, 16
+SADNXN4D 8, 8
+SADNXN4D 8, 4
+SADNXN4D 4, 8
+SADNXN4D 4, 4
+
+SADNXN4D 64, 64, 1
+SADNXN4D 64, 32, 1
+SADNXN4D 32, 64, 1
+SADNXN4D 32, 32, 1
+SADNXN4D 32, 16, 1
+SADNXN4D 16, 32, 1
+SADNXN4D 16, 16, 1
+SADNXN4D 16, 8, 1
+SADNXN4D 8, 16, 1
+SADNXN4D 8, 8, 1
+SADNXN4D 4, 8, 1
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c
new file mode 100644
index 0000000000..e00494d766
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ int i, res;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ for (i = 0; i < h; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref_stride;
+ src_ptr += src_stride;
+ }
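+  // Reduce: add the upper 64-bit half of each 128-bit lane, then add the two
+  // lanes and return the low 32 bits.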
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ res = _mm_cvtsi128_si32(sum_sad128);
+ return res;
+}
+
+static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ int i, res;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ const int ref2_stride = ref_stride << 1;
+ const int src2_stride = src_stride << 1;
+ const int max = h >> 1;
+ for (i = 0; i < max; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref2_stride;
+ src_ptr += src2_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ res = _mm_cvtsi128_si32(sum_sad128);
+ return res;
+}
+
+#define FSAD64_H(h) \
+ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+#define FSADS64_H(h) \
+ unsigned int vpx_sad_skip_64x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
+ }
+
+#define FSAD32_H(h) \
+ unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+#define FSADS32_H(h) \
+ unsigned int vpx_sad_skip_32x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
+ }
+
+#define FSAD64 \
+ FSAD64_H(64) \
+ FSAD64_H(32) \
+ FSADS64_H(64) \
+ FSADS64_H(32)
+
+#define FSAD32 \
+ FSAD32_H(64) \
+ FSAD32_H(32) \
+ FSAD32_H(16) \
+ FSADS32_H(64) \
+ FSADS32_H(32) \
+ FSADS32_H(16)
+
+FSAD64
+FSAD32
+
+#undef FSAD64
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+#undef FSADS64_H
+#undef FSADS32_H
+
+#define FSADAVG64_H(h) \
+ unsigned int vpx_sad64x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ for (i = 0; i < h; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref_stride; \
+ src_ptr += src_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
+ }
+
+#define FSADAVG32_H(h) \
+ unsigned int vpx_sad32x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ int ref2_stride = ref_stride << 1; \
+ int src2_stride = src_stride << 1; \
+ int max = h >> 1; \
+ for (i = 0; i < max; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, \
+ _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref2_stride; \
+ src_ptr += src2_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
+ }
+
+#define FSADAVG64 \
+ FSADAVG64_H(64) \
+ FSADAVG64_H(32)
+
+#define FSADAVG32 \
+ FSADAVG32_H(64) \
+ FSADAVG32_H(32) \
+ FSADAVG32_H(16)
+
+FSADAVG64
+FSADAVG32
+
+#undef FSADAVG64
+#undef FSADAVG32
+#undef FSADAVG64_H
+#undef FSADAVG32_H
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm
new file mode 100644
index 0000000000..627e463bf8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm
@@ -0,0 +1,332 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
+%macro SAD_FN 4
+%if %4 == 0 ; normal sad
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%elif %4 == 2 ; skip
+%if %3 == 5
+cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%else
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if VPX_ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2; skip rows so double the stride
+lea src_strided, [src_strided*2]
+lea ref_strided, [ref_strided*2]
+%endif ; %4 skip
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0
+ SAD_FN 64, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+ paddd m1, m2
+ paddd m3, m4
+ add refq, ref_strideq
+ paddd m0, m1
+ add srcq, src_strideq
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
+SAD64XN 64, 2 ; sad64x64_skip_sse2
+SAD64XN 32, 2 ; sad64x32_skip_sse2
+
+; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD32XN 1-2 0
+ SAD_FN 32, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/4
+%else
+ mov n_rowsd, %1/2
+%endif
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq]
+ movu m4, [refq+ref_strideq+16]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+src_strideq]
+ psadbw m4, [srcq+src_strideq+16]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
+SAD32XN 64, 2 ; sad32x64_skip_sse2
+SAD32XN 32, 2 ; sad32x32_skip_sse2
+SAD32XN 16, 2 ; sad32x16_skip_sse2
+
+; unsigned int vpx_sad16x{8, 16, 32}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD16XN 1-2 0
+ SAD_FN 16, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+src_strideq]
+ psadbw m3, [srcq+src_strideq*2]
+ psadbw m4, [srcq+src_stride3q]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD16XN 32 ; sad16x32_sse2
+SAD16XN 16 ; sad16x16_sse2
+SAD16XN 8 ; sad16x8_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN 8, 1 ; sad16x8_avg_sse2
+SAD16XN 32, 2 ; sad16x32_skip_sse2
+SAD16XN 16, 2 ; sad16x16_skip_sse2
+SAD16XN 8, 2 ; sad16x8_skip_sse2
+
+; unsigned int vpx_sad8x{4, 8, 16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD8XN 1-2 0
+ SAD_FN 8, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movh m1, [refq]
+ movhps m1, [refq+ref_strideq]
+ movh m2, [refq+ref_strideq*2]
+ movhps m2, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ lea second_predq, [second_predq+mmsize*2]
+%endif
+ movh m3, [srcq]
+ movhps m3, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq*2]
+ movhps m4, [srcq+src_stride3q]
+ psadbw m1, m3
+ psadbw m2, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m2
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD8XN 16 ; sad8x16_sse2
+SAD8XN 8 ; sad8x8_sse2
+SAD8XN 4 ; sad8x4_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN 8, 1 ; sad8x8_avg_sse2
+SAD8XN 4, 1 ; sad8x4_avg_sse2
+SAD8XN 16, 2 ; sad8x16_skip_sse2
+SAD8XN 8, 2 ; sad8x8_skip_sse2
+
+; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD4XN 1-2 0
+ SAD_FN 4, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movd m1, [refq]
+ movd m2, [refq+ref_strideq]
+ movd m3, [refq+ref_strideq*2]
+ movd m4, [refq+ref_stride3q]
+ punpckldq m1, m2
+ punpckldq m3, m4
+ movlhps m1, m3
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ lea second_predq, [second_predq+mmsize*1]
+%endif
+ movd m2, [srcq]
+ movd m5, [srcq+src_strideq]
+ movd m4, [srcq+src_strideq*2]
+ movd m3, [srcq+src_stride3q]
+ punpckldq m2, m5
+ punpckldq m4, m3
+ movlhps m2, m4
+ psadbw m1, m2
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD4XN 8 ; sad4x8_sse2
+SAD4XN 4 ; sad4x4_sse2
+SAD4XN 8, 1 ; sad4x8_avg_sse2
+SAD4XN 4, 1 ; sad4x4_avg_sse2
+SAD4XN 8, 2 ; sad4x8_skip_sse2
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
new file mode 100644
index 0000000000..41ffbb07e6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
@@ -0,0 +1,219 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with w (16-bit) words
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+
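In scalar terms, the routines in this file accumulate the five per-block statistics named in the register comments below; the 16x16 and 8x8 entry points differ only in block size (and in loading 16 versus 8 pixels per row). A rough C equivalent (a sketch, not the upstream C fallback):

#include <stdint.h>

/* Scalar sketch of what vpx_ssim_parms_{16x16,8x8}_sse2 accumulate. */
static void ssim_parms_ref(const uint8_t *s, int sp, const uint8_t *r, int rp,
                           int size, uint32_t *sum_s, uint32_t *sum_r,
                           uint32_t *sum_sq_s, uint32_t *sum_sq_r,
                           uint32_t *sum_sxr) {
  for (int y = 0; y < size; ++y) {
    for (int x = 0; x < size; ++x) {
      const uint32_t a = s[y * sp + x];
      const uint32_t b = r[y * rp + x];
      *sum_s += a;        /* paddusw xmm15 */
      *sum_r += b;        /* paddusw xmm14 */
      *sum_sq_s += a * a; /* pmaddwd + paddd xmm13 */
      *sum_sq_r += b * b; /* pmaddwd + paddd xmm12 */
      *sum_sxr += a * b;  /* pmaddwd + paddd xmm11 */
    }
  }
}
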
+SECTION .text
+
+;void vpx_ssim_parms_16x16_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; uint32_t *sum_s,
+; uint32_t *sum_r,
+; uint32_t *sum_sq_s,
+; uint32_t *sum_sq_r,
+; uint32_t *sum_sxr);
+;
+; TODO: Use parm passing through structure; probably don't need the pxors
+; (calling app will initialize to 0). Could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb. At this point this is just meant to be a first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+globalsym(vpx_ssim_parms_16x16_sse2)
+sym(vpx_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_ssim_parms_8x8_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; uint32_t *sum_s,
+; uint32_t *sum_r,
+; uint32_t *sum_sq_s,
+; uint32_t *sum_sq_r,
+; uint32_t *sum_sxr);
+;
+; TODO: Use parm passing through structure; probably don't need the pxors
+; (calling app will initialize to 0). Could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb. At this point this is just meant to be a first pass for calculating
+; all the parms needed for 8x8 ssim so we can play with dssim as distortion
+; in mode selection code.
+globalsym(vpx_ssim_parms_8x8_sse2)
+sym(vpx_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm
new file mode 100644
index 0000000000..d1d8d3460e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -0,0 +1,1467 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+bilin_filter_m_ssse3: times 8 db 16, 0
+ times 8 db 14, 2
+ times 8 db 12, 4
+ times 8 db 10, 6
+ times 16 db 8
+ times 8 db 6, 10
+ times 8 db 4, 12
+ times 8 db 2, 14
+
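Each row i of these tables holds the eighth-pel bilinear pair (16 - 2*i, 2*i): separate words for the SSE2 pmullw path, interleaved byte pairs for the SSSE3 pmaddubsw path (hence the different filter_idx_shift values chosen below). Per pixel the tap works out to the following sketch, assuming the pw_8 rounding constant and the >>4 used throughout this file:

#include <stdint.h>

/* One bilinear tap at eighth-pel offset 0..7 (sketch). Offsets 0 and 4 reduce
 * to a plain copy and a pavgb-style average, which the code below handles on
 * dedicated paths. */
static uint8_t bilin_tap(uint8_t a, uint8_t b, int offset) {
  const int f_a = 16 - 2 * offset;
  const int f_b = 2 * offset;
  return (uint8_t)((a * f_a + b * f_b + 8) >> 4);
}
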
+SECTION .text
+
+; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *ref, ptrdiff_t ref_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the SE and stores SSE in the given pointer.
+
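As the comment above says, these kernels return the signed sum of differences (SE) and store the sum of squared differences through *sse; the calling variance wrappers combine the two. The combination is the usual var = SSE - SE^2/N identity, sketched here rather than copied from any particular wrapper in this patch:

#include <stdint.h>

/* Combine the returned sum and the stored sse into a variance for a w*h block. */
static unsigned int variance_from_se_sse(int se, unsigned int sse, int w,
                                         int h) {
  return sse - (unsigned int)(((int64_t)se * se) / (w * h));
}
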
+%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ paddw %5, %3
+ pmaddwd %3, %3
+ paddw %5, %1
+ pmaddwd %1, %1
+ paddd %6, %3
+ paddd %6, %1
+%endmacro
+
+%macro STORE_AND_RET 1
+%if %1 > 4
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+ ; and outputting to a dword.
+ pcmpgtw m5, m6 ; mask for 0 > x
+ movhlps m3, m7
+ punpcklwd m4, m6, m5
+ punpckhwd m6, m5 ; sign-extend m6 word->dword
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ pshufd m4, m6, 0x1
+ movd [r1], m7 ; store sse
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%else ; 4xh
+ pshuflw m4, m6, 0xe
+ pshuflw m3, m7, 0xe
+ paddw m6, m4
+ paddd m7, m3
+ pcmpgtw m5, m6 ; mask for 0 > x
+ mov r1, ssem ; r1 = unsigned int *sse
+ punpcklwd m6, m5 ; sign-extend m6 word->dword
+ movd [r1], m7 ; store sse
+ pshuflw m4, m6, 0xe
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%if cpuflag(ssse3)
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+
+%if VPX_ARCH_X86_64
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, ref, ref_stride, \
+ second_pred, second_stride, height, sse
+ %define second_str second_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+ x_offset, y_offset, ref, ref_stride, \
+ height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, ref, ref_stride, \
+ second_pred, second_stride, height, sse
+ %define block_height dword heightm
+ %define second_str second_stridemp
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, ref, ref_stride, \
+ height, sse
+ %define block_height heightd
+ %endif
+
+ ; reuse argument stack space
+ %define g_bilin_filterm x_offsetm
+ %define g_pw_8m y_offsetm
+
+ ; Store bilin_filter and pw_8 locations on the stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ ref, ref_stride, second_pred, second_stride, \
+ height, sse
+ %define block_height dword heightm
+ %define second_str second_stridemp
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, ref, ref_stride, \
+ height, sse
+ %define block_height heightd
+ %endif
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+%if %1 == 4
+ %define movx movd
+%else
+ %define movx movh
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+ ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+ ; could perhaps use it for something more productive then
+ pxor m5, m5 ; dedicated zero register
+%if %1 < 16
+ sar block_height, 1
+%if %2 == 1 ; avg
+ shl second_str, 1
+%endif
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ mova m1, [refq]
+%if %2 == 1 ; avg
+ pavgb m0, [second_predq]
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+
+%if %2 == 0 ; !avg
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+%endif
+
+ movx m1, [refq]
+ movx m3, [refq+ref_strideq]
+
+%if %2 == 1 ; avg
+%if %1 > 4
+ pavgb m0, [second_predq]
+%else
+ movh m2, [second_predq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [refq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [second_predq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq*2]
+%else ; 4xh
+ movx m1, [srcq+src_strideq*2]
+ punpckldq m2, m1
+%endif
+ movx m1, [refq]
+%if %1 > 4
+ movlhps m0, m2
+%else ; 4xh
+ punpckldq m0, m2
+%endif
+ movx m3, [refq+ref_strideq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpcklbw m3, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m4, [second_predq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq*2]
+ movx m1, [refq]
+ pavgb m0, m2
+ movx m3, [refq+ref_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [refq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq*2]
+ movx m3, [refq+ref_strideq]
+%if cpuflag(ssse3)
+ movx m1, [refq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [refq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [second_predq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonzero:
+ cmp x_offsetd, 4
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [refq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [second_predq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m4, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+ movhps m4, [srcq+src_strideq+1]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+ movx m2, [srcq+src_strideq+1]
+ punpckldq m4, m2
+%endif
+ movx m1, [refq]
+ movx m3, [refq+ref_strideq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [second_predq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+ movx m1, [refq]
+ pavgb m0, m4
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [refq+ref_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [refq]
+ pavgb m4, m3
+ punpckhbw m3, m1, m5
+ pavgb m0, m4
+%if %2 == 1 ; avg
+ punpcklbw m1, m5
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movx m2, [srcq]
+ movx m3, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq]
+ movhps m3, [srcq+src_strideq+1]
+%else
+ movx m1, [srcq+src_strideq]
+ punpckldq m2, m1
+ movx m1, [srcq+src_strideq+1]
+ punpckldq m3, m1
+%endif
+ pavgb m2, m3
+%if %1 > 4
+ movlhps m0, m2
+ movhlps m4, m2
+%else ; 4xh
+ punpckldq m0, m2
+ pshuflw m4, m2, 0xe
+%endif
+ movx m1, [refq]
+ pavgb m0, m2
+ movx m3, [refq+ref_strideq]
+%if %1 > 4
+ pavgb m0, [second_predq]
+%else
+ movh m2, [second_predq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq]
+ movx m1, [srcq+src_strideq+1]
+ pavgb m2, m3
+ pavgb m4, m1
+ pavgb m0, m2
+ pavgb m2, m4
+ movx m1, [refq]
+ movx m3, [refq+ref_strideq]
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_other_loop:
+ movu m4, [srcq]
+ movu m2, [srcq+1]
+ mova m1, [refq]
+ pavgb m4, m2
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ punpcklbw m0, m5
+ paddw m2, m3
+ punpcklbw m3, m4, m5
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+%endif
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+%if notcpuflag(ssse3)
+ punpcklbw m0, m5
+%endif
+.x_half_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+ pavgb m2, m1
+ pavgb m4, m3
+ movx m3, [refq+ref_strideq]
+%if cpuflag(ssse3)
+ movx m1, [refq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ paddw m0, m1
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m2, m1
+ movx m1, [refq]
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [second_predq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [refq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [refq+ref_strideq]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ movx m1, [refq]
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_x_a
+ pmaddubsw m2, filter_x_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [refq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [second_predq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2
+.x_other_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+%if cpuflag(ssse3)
+ mova m1, [refq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%else
+ punpckhbw m2, m4, m5
+ punpckhbw m1, m3, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ paddw m4, m3
+ paddw m2, m1
+ mova m1, [refq]
+ psraw m4, 4
+ psraw m2, 4
+ punpckhbw m3, m1, m5
+ ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
+ ; have a 1-register shortage to be able to store the backup of the bilin-
+ ; filtered second line as words, as a cache for the next line. Packing into
+ ; a byte costs 1 pack and 2 unpacks, but saves a register.
+ packuswb m4, m2
+ punpcklbw m1, m5
+ pavgb m0, m4
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ pavgb m0, [second_predq]
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ add srcq, src_strideq
+ psraw m0, 4
+.x_other_y_half_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m1, [refq]
+ movx m3, [refq+ref_strideq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ movx m1, [refq]
+ paddw m4, m3
+ movx m3, [refq+ref_strideq]
+%endif
+ psraw m2, 4
+ psraw m4, 4
+ pavgw m0, m2
+ pavgw m2, m4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline - also consider going to bytes here
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [second_predq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonhalf:
+%if VPX_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+ shl y_offsetd, filter_idx_shift
+%if VPX_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m11, [bilin_filter+y_offsetq+16]
+%endif
+ mova m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register, so the src_stride register is
+; used. Later, src_stride has to be reloaded from the stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+ packuswb m0, m2
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [refq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ punpckhbw m3, m1, m5
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ punpcklbw m1, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+ psraw m0, 4
+%else
+ movu m3, [srcq]
+ movu m4, [srcq+1]
+ punpckhbw m1, m3, m5
+ punpckhbw m2, m4, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m3, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m3, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m1, filter_rnd
+ paddw m3, m4
+ paddw m1, m2
+ psraw m3, 4
+ psraw m1, 4
+ packuswb m4, m3, m1
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ pmullw m2, filter_y_a
+ pmullw m1, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, m1
+ mova m1, [refq]
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ add refq, ref_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ psraw m0, 4
+%if cpuflag(ssse3)
+ packuswb m0, m0
+%endif
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+
+ INC_SRC_BY_SRC_STRIDE
+ movx m4, [srcq]
+ movx m3, [srcq+1]
+
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m3, [refq+ref_strideq]
+ movx m1, [refq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m2, m2
+ packuswb m4, m4
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m1, m5
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ paddw m4, m3
+ psraw m2, 4
+ psraw m4, 4
+ pmullw m0, filter_y_a
+ pmullw m3, m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m0, m3
+ movx m3, [refq+ref_strideq]
+ paddw m2, m1
+ movx m1, [refq]
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [second_predq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [second_predq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ lea refq, [refq+ref_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add second_predq, second_str
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+%undef movx
+ STORE_AND_RET %1
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
+; between the ssse3 and non-ssse3 version. It may make sense to merge their
+; code in the sense that the ssse3 version would jump to the appropriate
+; location in the sse/2 version, rather than duplicating that code in the
+; binary.
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 4
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 4, 1
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4, 1
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c
new file mode 100644
index 0000000000..4849581ed4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr,
+ const uint8_t *src_ptr,
+ const uint8_t *pred_ptr) {
+ const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+ const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+ const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+ const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+static VPX_FORCE_INLINE void subtract_block_16xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr);
+ const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr);
+ const __m256i s_0 = _mm256_cvtepu8_epi16(s);
+ const __m256i p_0 = _mm256_cvtepu8_epi16(p);
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void subtract_block_32xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void subtract_block_64xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ switch (cols) {
+ case 16:
+ subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 32:
+ subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 64:
+ subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ default:
+ vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride,
+ const uint8_t *src8_ptr,
+ ptrdiff_t src_stride,
+ const uint8_t *pred8_ptr,
+ ptrdiff_t pred_stride, int bd) {
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr);
+ (void)bd;
+ if (cols == 64) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+ const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32));
+ const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+ const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32));
+ const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ const __m256i d2 = _mm256_sub_epi16(s2, p2);
+ const __m256i d3 = _mm256_sub_epi16(s3, p3);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ } while (--j != 0);
+ } else if (cols == 32) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ } while (--j != 0);
+ } else if (cols == 16) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 =
+ _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 =
+ _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ } else if (cols == 8) {
+ int j = rows;
+ do {
+ const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr);
+ const __m128i s1 =
+ _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride));
+ const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr);
+ const __m128i p1 =
+ _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride));
+ const __m128i d0 = _mm_sub_epi16(s0, p0);
+ const __m128i d1 = _mm_sub_epi16(s1, p1);
+ _mm_storeu_si128((__m128i *)diff_ptr, d0);
+ _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ } else {
+ int j = rows;
+ assert(cols == 4);
+ do {
+ const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ const __m128i s1 =
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr);
+ const __m128i p1 =
+ _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride));
+ const __m128i d0 = _mm_sub_epi16(s0, p0);
+ const __m128i d1 = _mm_sub_epi16(s1, p1);
+ _mm_storel_epi64((__m128i *)diff_ptr, d0);
+ _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm
new file mode 100644
index 0000000000..4273efb854
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm
@@ -0,0 +1,127 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void vpx_subtract_block(int rows, int cols,
+; int16_t *diff, ptrdiff_t diff_stride,
+; const uint8_t *src, ptrdiff_t src_stride,
+; const uint8_t *pred, ptrdiff_t pred_stride)
+
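The prototype above is the plain residual computation; the SSE2 code below only vectorizes it per block width. A scalar sketch (equivalent in effect to the generic C path, assuming the strides are in elements as in the prototype):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of vpx_subtract_block: diff = src - pred, widened to 16 bit. */
static void subtract_block_ref(int rows, int cols, int16_t *diff,
                               ptrdiff_t diff_stride, const uint8_t *src,
                               ptrdiff_t src_stride, const uint8_t *pred,
                               ptrdiff_t pred_stride) {
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      diff[r * diff_stride + c] =
          (int16_t)(src[r * src_stride + c] - pred[r * pred_stride + c]);
}
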
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+ rows, cols, diff, diff_stride, src, src_stride, \
+ pred, pred_stride
+%define pred_str colsq
+ pxor m7, m7 ; dedicated zero register
+ cmp colsd, 4
+ je .case_4
+ cmp colsd, 8
+ je .case_8
+ cmp colsd, 16
+ je .case_16
+ cmp colsd, 32
+ je .case_32
+
+%macro loop16 6
+ mova m0, [srcq+%1]
+ mova m4, [srcq+%2]
+ mova m1, [predq+%3]
+ mova m5, [predq+%4]
+ punpckhbw m2, m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ psubw m2, m3
+ psubw m0, m1
+ punpckhbw m1, m4, m7
+ punpckhbw m3, m5, m7
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+ psubw m1, m3
+ psubw m4, m5
+ mova [diffq+mmsize*0+%5], m0
+ mova [diffq+mmsize*1+%5], m2
+ mova [diffq+mmsize*0+%6], m4
+ mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+ mov pred_str, pred_stridemp
+.loop_64:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_64
+ RET
+
+.case_32:
+ mov pred_str, pred_stridemp
+.loop_32:
+ loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_32
+ RET
+
+.case_16:
+ mov pred_str, pred_stridemp
+.loop_16:
+ loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+ lea diffq, [diffq+diff_strideq*4]
+ lea predq, [predq+pred_str*2]
+ lea srcq, [srcq+src_strideq*2]
+ sub rowsd, 2
+ jg .loop_16
+ RET
+
+%macro loop_h 0
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m1, [predq]
+ movh m3, [predq+pred_str]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ psubw m0, m1
+ psubw m2, m3
+ mova [diffq], m0
+ mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+ mov pred_str, pred_stridemp
+.loop_8:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_8
+ RET
+
+INIT_MMX
+.case_4:
+ mov pred_str, pred_stridemp
+.loop_4:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_4
+ RET
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c
new file mode 100644
index 0000000000..df6514b2c4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
+ // Over 75% of all calls are with size == 4.
+ if (size == 4) {
+ __m128i s[2], sq[2], ss;
+
+ s[0] = _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
+ s[0] = loadh_epi64(s[0], src + 1 * stride);
+ s[1] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
+ s[1] = loadh_epi64(s[1], src + 3 * stride);
+ sq[0] = _mm_madd_epi16(s[0], s[0]);
+ sq[1] = _mm_madd_epi16(s[1], s[1]);
+ sq[0] = _mm_add_epi32(sq[0], sq[1]);
+ ss = _mm_add_epi32(sq[0], _mm_srli_si128(sq[0], 8));
+ ss = _mm_add_epi32(ss, _mm_srli_epi64(ss, 32));
+
+ return (uint64_t)_mm_cvtsi128_si32(ss);
+ } else {
+ // Generic case
+ int r = size;
+ const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1);
+ __m128i v_acc_q = _mm_setzero_si128();
+
+ assert(size % 8 == 0);
+
+ do {
+ int c = 0;
+ __m128i v_acc_d = _mm_setzero_si128();
+
+ do {
+ const int16_t *const b = src + c;
+ const __m128i v_val_0_w =
+ _mm_load_si128((const __m128i *)(b + 0 * stride));
+ const __m128i v_val_1_w =
+ _mm_load_si128((const __m128i *)(b + 1 * stride));
+ const __m128i v_val_2_w =
+ _mm_load_si128((const __m128i *)(b + 2 * stride));
+ const __m128i v_val_3_w =
+ _mm_load_si128((const __m128i *)(b + 3 * stride));
+ const __m128i v_val_4_w =
+ _mm_load_si128((const __m128i *)(b + 4 * stride));
+ const __m128i v_val_5_w =
+ _mm_load_si128((const __m128i *)(b + 5 * stride));
+ const __m128i v_val_6_w =
+ _mm_load_si128((const __m128i *)(b + 6 * stride));
+ const __m128i v_val_7_w =
+ _mm_load_si128((const __m128i *)(b + 7 * stride));
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+ const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+ const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+ const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+ const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+ const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
+ c += 8;
+ } while (c < size);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
+
+ src += 8 * stride;
+ r -= 8;
+ } while (r);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if VPX_ARCH_X86_64
+ return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+ {
+ uint64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
+ return tmp;
+ }
+#endif
+ }
+}
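
Both the 4x4 fast path and the 8-row generic path above reduce to the same scalar computation, sketched here for reference (helper name illustrative):

#include <stdint.h>

/* Scalar sketch of vpx_sum_squares_2d_i16: sum of squared samples over a
 * size x size block of int16_t values. */
static uint64_t sum_squares_2d_i16_ref(const int16_t *src, int stride,
                                       int size) {
  uint64_t ss = 0;
  for (int r = 0; r < size; ++r) {
    for (int c = 0; c < size; ++c) {
      const int v = src[r * stride + c];
      ss += (uint64_t)(v * v);
    }
  }
  return ss;
}
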
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h
new file mode 100644
index 0000000000..b4f1190d74
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
+#define VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+ // Unpack 16 bit elements resulting in:
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ return _mm_unpacklo_epi16(a0, a1);
+}
+
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+ // Unpack 16 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+ // Unpack 32 bit elements resulting in:
+ // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
+ const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
+ const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
+ const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(c0, c0);
+ out[1] = _mm_unpackhi_epi64(c0, c0);
+ out[2] = _mm_unpacklo_epi64(c1, c1);
+ out[3] = _mm_unpackhi_epi64(c1, c1);
+ out[4] = _mm_unpacklo_epi64(c2, c2);
+ out[5] = _mm_unpackhi_epi64(c2, c2);
+ out[6] = _mm_unpacklo_epi64(c3, c3);
+ out[7] = _mm_unpackhi_epi64(c3, c3);
+}
+
+static INLINE void transpose_16bit_4x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // out[0]: 00 10 20 30 01 11 21 31
+ // out[1]: 02 12 22 32 03 13 23 33
+ out[0] = _mm_unpacklo_epi32(a0, a1);
+ out[1] = _mm_unpackhi_epi32(a0, a1);
+}
+
+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // in[4]: 40 41 42 43 XX XX XX XX
+ // in[5]: 50 51 52 53 XX XX XX XX
+ // in[6]: 60 61 62 63 XX XX XX XX
+ // in[7]: 70 71 72 73 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 02 12 22 32 03 13 23 33
+ // b3: 42 52 62 72 43 53 63 73
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b2, b3);
+ out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+static INLINE void transpose_16bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ // a6: 44 54 45 55 46 56 47 57
+ // a7: 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 04 14 24 34 05 15 25 35
+ // b3: 44 54 64 74 45 55 65 75
+ // b4: 02 12 22 32 03 13 23 33
+ // b5: 42 52 62 72 43 53 63 73
+ // b6: 06 16 26 36 07 17 27 37
+ // b7: 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+ const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b4, b5);
+ out[3] = _mm_unpackhi_epi64(b4, b5);
+ out[4] = _mm_unpacklo_epi64(b2, b3);
+ out[5] = _mm_unpackhi_epi64(b2, b3);
+ out[6] = _mm_unpacklo_epi64(b6, b7);
+ out[7] = _mm_unpackhi_epi64(b6, b7);
+}
+
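+// A 16x16 transpose is built from four 8x8 transposes: the two diagonal 8x8
+// blocks are transposed in place, while the two off-diagonal blocks are
+// transposed and swapped, with tbuf holding one of them temporarily.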
+// Transpose in-place
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+ __m128i *const right) {
+ __m128i tbuf[8];
+ transpose_16bit_8x8(left, left);
+ transpose_16bit_8x8(right, tbuf);
+ transpose_16bit_8x8(left + 8, right);
+ transpose_16bit_8x8(right + 8, right + 8);
+
+ left[8] = tbuf[0];
+ left[9] = tbuf[1];
+ left[10] = tbuf[2];
+ left[11] = tbuf[3];
+ left[12] = tbuf[4];
+ left[13] = tbuf[5];
+ left[14] = tbuf[6];
+ left[15] = tbuf[7];
+}
+
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+}
+
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // in[4]: 04 05 06 07
+ // in[5]: 14 15 16 17
+ // in[6]: 24 25 26 27
+ // in[7]: 34 35 36 37
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+static INLINE void transpose_32bit_8x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 04 05 06 07
+ // in[2]: 10 11 12 13
+ // in[3]: 14 15 16 17
+ // in[4]: 20 21 22 23
+ // in[5]: 24 25 26 27
+ // in[6]: 30 31 32 33
+ // in[7]: 34 35 36 37
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+#endif // VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h
new file mode 100644
index 0000000000..de5ce43b00
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_
+#define VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_
+
+#include <emmintrin.h>
+#include "vpx/vpx_integer.h"
+
+#define pair_set_epi16(a, b) \
+ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+#define pair_set_epi32(a, b) \
+ _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
+
+#define dual_set_epi16(a, b) \
+ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
+ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
+
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
+#endif // VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c
new file mode 100644
index 0000000000..8305b9f20f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c
@@ -0,0 +1,872 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "./vpx_dsp_rtcd.h"
+
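+// Each 32-byte row holds the two bilinear filter taps for one of the eight
+// subpel offsets (the taps sum to 16), repeated across the row. A row is
+// selected with bilinear_filters_avx2 + (offset << 5).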
+/* clang-format off */
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+};
+
+DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = {
+ 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1,
+ 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1
+};
+/* clang-format on */
+
+static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
+ __m256i *const sse,
+ __m256i *const sum) {
+ const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2);
+
+ // unpack into pairs of source and reference values
+ const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
+ const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
+
+ // subtract adjacent elements using src*1 + ref*-1
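+  // (maddubs treats the interleaved pixels as unsigned and the +1/-1 weights
+  // as signed, so each 16-bit result is the signed difference src - ref)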
+ const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
+ const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
+ const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
+
+ // add to the running totals
+ *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
+ *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
+}
+
+static INLINE void variance_final_from_32bit_sum_avx2(__m256i vsse,
+ __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+  // add the high 128-bit lane to the low lane
+ const __m128i sse_reg_128 = _mm_add_epi32(_mm256_castsi256_si128(vsse),
+ _mm256_extractf128_si256(vsse, 1));
+
+ // unpack sse and sum registers and add
+ const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
+ const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
+ const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
+
+ // perform the final summation and extract the results
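+  // (even lanes hold sse partials and odd lanes hold sum partials, so lane 0
+  // of the reduction is the total sse and lane 1 is the total sum)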
+ const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
+ *((int *)sse) = _mm_cvtsi128_si32(res);
+ *((int *)sum) = _mm_extract_epi32(res, 1);
+}
+
+static INLINE void variance_final_from_16bit_sum_avx2(__m256i vsse,
+ __m256i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+  // add the high 128-bit lane to the low lane
+ const __m128i sum_reg_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum),
+ _mm256_extractf128_si256(vsum, 1));
+ const __m128i sum_reg_64 =
+ _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8));
+ const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64);
+
+ variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse, sum);
+}
+
+static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
+ const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
+ const __m256i sum_hi =
+ _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
+ return _mm256_add_epi32(sum_lo, sum_hi);
+}
+
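+// Process two 8-pixel rows per call: both rows are packed into one xmm
+// register, widened to 16 bits in a ymm register, and differenced directly.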
+static INLINE void variance8_kernel_avx2(
+ const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+ const int ref_stride, __m256i *const sse, __m256i *const sum) {
+ __m128i src0, src1, ref0, ref1;
+ __m256i ss, rr, diff;
+
+ // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00
+ src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride));
+
+ // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10
+ src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride));
+
+ // s17 s16...s11 s10 s07 s06...s01 s00 (8bit)
+ src0 = _mm_unpacklo_epi64(src0, src1);
+
+ // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit)
+ ss = _mm256_cvtepu8_epi16(src0);
+
+ // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00
+ ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride));
+
+  // 0 0 0.... 0 r17 r16 r15 r14 r13 r12 r11 r10
+ ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride));
+
+ // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit)
+ ref0 = _mm_unpacklo_epi64(ref0, ref1);
+
+ // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit)
+ rr = _mm256_cvtepu8_epi16(ref0);
+
+ diff = _mm256_sub_epi16(ss, rr);
+ *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff));
+ *sum = _mm256_add_epi16(*sum, diff);
+}
+
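+// Process two 16-pixel rows per call, one row per 128-bit lane.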
+static INLINE void variance16_kernel_avx2(
+ const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+ const int ref_stride, __m256i *const sse, __m256i *const sum) {
+ const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
+ const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
+ const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
+ const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
+ variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance32_kernel_avx2(const uint8_t *const src,
+ const uint8_t *const ref,
+ __m256i *const sse,
+ __m256i *const sum) {
+ const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
+ const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
+ variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance8_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ int i;
+ *vsum = _mm256_setzero_si256();
+ *vsse = _mm256_setzero_si256();
+
+ for (i = 0; i < h; i += 2) {
+ variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ }
+}
+
+static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ int i;
+ *vsum = _mm256_setzero_si256();
+ *vsse = _mm256_setzero_si256();
+
+ for (i = 0; i < h; i += 2) {
+ variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ }
+}
+
+static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ int i;
+ *vsum = _mm256_setzero_si256();
+ *vsse = _mm256_setzero_si256();
+
+ for (i = 0; i < h; i++) {
+ variance32_kernel_avx2(src, ref, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ int i;
+ *vsum = _mm256_setzero_si256();
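+  // Don't initialize vsse here since it's an accumulation across calls.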
+
+ for (i = 0; i < h; i++) {
+ variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+ variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ __m256i vsse, vsum;
+ variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, sum);
+}
+
+#define FILTER_SRC(filter) \
+ /* filter the source */ \
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+ \
+ /* add 8 to source */ \
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+ \
+ /* divide source by 16 */ \
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+ /* expand each byte to 2 bytes */ \
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+ /* source - dest */ \
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */                                                   \
+ *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_lo); \
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+ *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_hi); \
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+ /* calculate sse */ \
+ *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_lo); \
+ *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_hi);
+
+// final reduction of the accumulated sum and sse to scalars
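+// The 16-bit sums are sign-extended to 32 bits via the cmpgt mask, then both
+// sum and sse are reduced horizontally across lanes; the result is stored to
+// *sse and left in the local variable sum.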
+#define CALC_SUM_AND_SSE \
+ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src);
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ second_pred += second_stride;
+ } else {
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+// (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction.
+static INLINE void spv32_half_zero(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred,
+ int second_stride, int do_sec, int height,
+ __m256i *sum_reg, __m256i *sse_reg,
+ int sstep) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep));
+ const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ second_pred += second_stride;
+ } else {
+ exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg) {
+ spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, sum_reg, sse_reg, src_stride);
+}
+
+static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg) {
+ spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, sum_reg, sse_reg, 1);
+}
+
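+// x_offset == 4 and y_offset == 4: half-pel in both directions, computed as a
+// horizontal pavgb followed by a vertical pavgb with the previous row's
+// horizontal average.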
+static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+ __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b);
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ src += src_stride;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)(src));
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+ const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
+ const __m256i current_avg = _mm256_avg_epu8(prev_src_avg, src_avg);
+ prev_src_avg = src_avg;
+
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ second_pred += second_stride;
+ } else {
+ exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg);
+ }
+    // accumulate the sum and sse for this row
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+// (x == 0, y == bil) or (x == 4, y == bil). sstep determines the direction.
+static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred,
+ int second_stride, int do_sec, int height,
+ __m256i *sum_reg, __m256i *sse_reg,
+ int offset, int sstep) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i pw8 = _mm256_set1_epi16(8);
+ const __m256i filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (offset << 5)));
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep));
+ exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
+ exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
+
+ FILTER_SRC(filter)
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg);
+ second_pred += second_stride;
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg, int y_offset) {
+ spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, sum_reg, sse_reg, y_offset, src_stride);
+}
+
+static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg, int x_offset) {
+ spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, sum_reg, sse_reg, x_offset, 1);
+}
+
+static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg, int y_offset) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i pw8 = _mm256_set1_epi16(8);
+ const __m256i filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5)));
+ const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+ __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b);
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ int i;
+ src += src_stride;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+ const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
+ exp_src_lo = _mm256_unpacklo_epi8(prev_src_avg, src_avg);
+ exp_src_hi = _mm256_unpackhi_epi8(prev_src_avg, src_avg);
+ prev_src_avg = src_avg;
+
+ FILTER_SRC(filter)
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ second_pred += second_stride;
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg, int x_offset) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i pw8 = _mm256_set1_epi16(8);
+ const __m256i filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5)));
+ const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i src_reg, src_pack;
+ int i;
+ exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b);
+ exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b);
+ FILTER_SRC(filter)
+  // pack the filtered 16-bit values back to 8 bits within each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+
+ src += src_stride;
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+ exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
+ exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
+
+ FILTER_SRC(filter)
+
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+    // average the previous packed row with the current one
+ src_pack = _mm256_avg_epu8(src_pack, src_reg);
+
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg);
+ second_pred += second_stride;
+ } else {
+ exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg);
+ }
+ CALC_SUM_SSE_INSIDE_LOOP
+ src_pack = src_reg;
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
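+// Arbitrary x and y offsets: each row is filtered horizontally, then the
+// previous and current filtered rows are filtered vertically.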
+static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, __m256i *sum_reg,
+ __m256i *sse_reg, int x_offset, int y_offset) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i pw8 = _mm256_set1_epi16(8);
+ const __m256i xfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5)));
+ const __m256i yfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5)));
+ const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+ __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i prev_src_pack, src_pack;
+ int i;
+ exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b);
+ exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b);
+ FILTER_SRC(xfilter)
+  // pack the filtered 16-bit values back to 8 bits within each 128-bit lane
+ prev_src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ src += src_stride;
+
+ for (i = 0; i < height; i++) {
+ const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+ const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+ const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+ exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
+ exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
+
+ FILTER_SRC(xfilter)
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+
+    // interleave the previous packed row with the current one
+ exp_src_lo = _mm256_unpacklo_epi8(prev_src_pack, src_pack);
+ exp_src_hi = _mm256_unpackhi_epi8(prev_src_pack, src_pack);
+
+ FILTER_SRC(yfilter)
+ if (do_sec) {
+ const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+ const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg);
+ exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+ second_pred += second_stride;
+ }
+
+ prev_src_pack = src_pack;
+
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred, int second_stride,
+ int do_sec, int height, unsigned int *sse) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ __m256i sum_reg = _mm256_setzero_si256();
+ __m256i sse_reg = _mm256_setzero_si256();
+ __m256i sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ int sum;
+ // x_offset = 0 and y_offset = 0
+ if (x_offset == 0) {
+ if (y_offset == 0) {
+ spv32_x0_y0(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg);
+ // x_offset = 0 and y_offset = 4
+ } else if (y_offset == 4) {
+ spv32_x0_y4(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg);
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ spv32_x0_yb(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg, y_offset);
+ }
+ // x_offset = 4 and y_offset = 0
+ } else if (x_offset == 4) {
+ if (y_offset == 0) {
+ spv32_x4_y0(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg);
+ // x_offset = 4 and y_offset = 4
+ } else if (y_offset == 4) {
+ spv32_x4_y4(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg);
+ // x_offset = 4 and y_offset = bilin interpolation
+ } else {
+ spv32_x4_yb(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg, y_offset);
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ spv32_xb_y0(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg, x_offset);
+ // x_offset = bilin interpolation and y_offset = 4
+ } else if (y_offset == 4) {
+ spv32_xb_y4(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg, x_offset);
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ } else {
+ spv32_xb_yb(src, src_stride, dst, dst_stride, second_pred, second_stride,
+ do_sec, height, &sum_reg, &sse_reg, x_offset, y_offset);
+ }
+ }
+ CALC_SUM_AND_SSE
+ return sum;
+}
+
+static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height, unsigned int *sse) {
+ return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
+ NULL, 0, 0, height, sse);
+}
+
+static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred,
+ int second_stride, int height,
+ unsigned int *sse) {
+ return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
+ second_pred, second_stride, 1, height, sse);
+}
+
+typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum);
+
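+// For the variance functions below: variance = sse - sum * sum / (w * h); the
+// shift in each return statement is log2(w * h).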
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse, vsum;
+ int sum;
+ variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse, vsum;
+ int sum;
+ variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 6);
+}
+
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse, vsum;
+ int sum;
+ variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
+}
+
+unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ __m128i vsum_128;
+ variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+ vsum_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum),
+ _mm256_extractf128_si256(vsum, 1));
+ vsum_128 = _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
+ _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
+ variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
+}
+
+unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ __m128i vsum_128;
+ variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, &vsse, &vsum);
+ vsum = sum_to_32bit_avx2(vsum);
+ vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum),
+ _mm256_extractf128_si256(vsum, 1));
+ variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse = _mm256_setzero_si256();
+ __m256i vsum = _mm256_setzero_si256();
+ __m128i vsum_128;
+ int sum;
+ variance64_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+ vsum = sum_to_32bit_avx2(vsum);
+ vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum),
+ _mm256_extractf128_si256(vsum, 1));
+ variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse = _mm256_setzero_si256();
+ __m256i vsum = _mm256_setzero_si256();
+ __m128i vsum_128;
+ int sum;
+ int i = 0;
+
+ for (i = 0; i < 2; i++) {
+ __m256i vsum16;
+ variance64_avx2(src_ptr + 32 * i * src_stride, src_stride,
+ ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse,
+ &vsum16);
+ vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16));
+ }
+ vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum),
+ _mm256_extractf128_si256(vsum, 1));
+ variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
+}
+
+unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse;
+}
+
+unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ __m256i vsse, vsum;
+ variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse;
+}
+
+unsigned int vpx_sub_pixel_variance64x64_avx2(
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {
+ unsigned int sse1;
+ const int se1 = sub_pixel_variance32xh_avx2(
+ src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 64, &sse1);
+ unsigned int sse2;
+ const int se2 =
+ sub_pixel_variance32xh_avx2(src_ptr + 32, src_stride, x_offset, y_offset,
+ ref_ptr + 32, ref_stride, 64, &sse2);
+ const int se = se1 + se2;
+ *sse = sse1 + sse2;
+ return *sse - (uint32_t)(((int64_t)se * se) >> 12);
+}
+
+unsigned int vpx_sub_pixel_variance32x32_avx2(
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {
+ const int se = sub_pixel_variance32xh_avx2(
+ src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 32, sse);
+ return *sse - (uint32_t)(((int64_t)se * se) >> 10);
+}
+
+unsigned int vpx_sub_pixel_avg_variance64x64_avx2(
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
+ const uint8_t *second_pred) {
+ unsigned int sse1;
+ const int se1 = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset,
+ y_offset, ref_ptr, ref_stride,
+ second_pred, 64, 64, &sse1);
+ unsigned int sse2;
+ const int se2 = sub_pixel_avg_variance32xh_avx2(
+ src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, ref_stride,
+ second_pred + 32, 64, 64, &sse2);
+ const int se = se1 + se2;
+
+ *sse = sse1 + sse2;
+
+ return *sse - (uint32_t)(((int64_t)se * se) >> 12);
+}
+
+unsigned int vpx_sub_pixel_avg_variance32x32_avx2(
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
+ const uint8_t *second_pred) {
+ // Process 32 elements in parallel.
+ const int se = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset,
+ y_offset, ref_ptr, ref_stride,
+ second_pred, 32, 32, sse);
+ return *sse - (uint32_t)(((int64_t)se * se) >> 10);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c
new file mode 100644
index 0000000000..d6eb12da1a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c
@@ -0,0 +1,565 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+static INLINE unsigned int add32x4_sse2(__m128i val) {
+ val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
+ val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
+ return (unsigned int)_mm_cvtsi128_si32(val);
+}
+
+unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
+ __m128i vsum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 32; ++i) {
+ const __m128i v = _mm_loadu_si128((const __m128i *)src_ptr);
+ vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+ src_ptr += 8;
+ }
+
+ return add32x4_sse2(vsum);
+}
+
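+// Load two 4-pixel rows, interleave them, and widen to 16 bits.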
+static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
+ const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride));
+ const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride));
+ const __m128i p01 = _mm_unpacklo_epi32(p0, p1);
+ return _mm_unpacklo_epi8(p01, _mm_setzero_si128());
+}
+
+static INLINE void variance_kernel_sse2(const __m128i src_ptr,
+ const __m128i ref_ptr,
+ __m128i *const sse,
+ __m128i *const sum) {
+ const __m128i diff = _mm_sub_epi16(src_ptr, ref_ptr);
+ *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
+ *sum = _mm_add_epi16(*sum, diff);
+}
+
+// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
+// Slightly faster than variance_final_256_pel_sse2()
+static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+}
+
+// Can handle 256 pixels' diff sum (such as 16x16)
+static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+ *sum += (int16_t)_mm_extract_epi16(vsum, 1);
+}
+
+// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
+static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
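+  // Sign-extend the remaining 16-bit sums to 32 bits (duplicate, then
+  // arithmetic shift) before the final 32-bit reduction.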
+ vsum = _mm_unpacklo_epi16(vsum, vsum);
+ vsum = _mm_srai_epi32(vsum, 16);
+ *sum = (int)add32x4_sse2(vsum);
+}
+
+static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
+ const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
+ const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
+ return _mm_add_epi32(sum_lo, sum_hi);
+}
+
+// Can handle 1024 pixels' diff sum (such as 32x32)
+static INLINE int sum_final_sse2(const __m128i sum) {
+ const __m128i t = sum_to_32bit_sse2(sum);
+ return (int)add32x4_sse2(t);
+}
+
+static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride,
+ const uint8_t *ref_ptr, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ int i;
+
+ assert(h <= 256); // May overflow for larger height.
+ *sse = _mm_setzero_si128();
+ *sum = _mm_setzero_si128();
+
+ for (i = 0; i < h; i += 2) {
+ const __m128i s = load4x2_sse2(src_ptr, src_stride);
+ const __m128i r = load4x2_sse2(ref_ptr, ref_stride);
+
+ variance_kernel_sse2(s, r, sse, sum);
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ }
+}
+
+static INLINE void variance8_sse2(const uint8_t *src_ptr, const int src_stride,
+ const uint8_t *ref_ptr, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ const __m128i zero = _mm_setzero_si128();
+ int i;
+
+ assert(h <= 128); // May overflow for larger height.
+ *sse = _mm_setzero_si128();
+ *sum = _mm_setzero_si128();
+
+ for (i = 0; i < h; i++) {
+ const __m128i s =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src_ptr), zero);
+ const __m128i r =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref_ptr), zero);
+
+ variance_kernel_sse2(s, r, sse, sum);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+}
+
+static INLINE void variance16_kernel_sse2(const uint8_t *const src_ptr,
+ const uint8_t *const ref_ptr,
+ __m128i *const sse,
+ __m128i *const sum) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i s = _mm_loadu_si128((const __m128i *)src_ptr);
+ const __m128i r = _mm_loadu_si128((const __m128i *)ref_ptr);
+ const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+ const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+ const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+
+ variance_kernel_sse2(src0, ref0, sse, sum);
+ variance_kernel_sse2(src1, ref1, sse, sum);
+}
+
+static INLINE void variance16_sse2(const uint8_t *src_ptr, const int src_stride,
+ const uint8_t *ref_ptr, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ int i;
+
+ assert(h <= 64); // May overflow for larger height.
+ *sse = _mm_setzero_si128();
+ *sum = _mm_setzero_si128();
+
+ for (i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+}
+
+static INLINE void variance32_sse2(const uint8_t *src_ptr, const int src_stride,
+ const uint8_t *ref_ptr, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ int i;
+
+ assert(h <= 32); // May overflow for larger height.
+ // Don't initialize sse here since it's an accumulation.
+ *sum = _mm_setzero_si128();
+
+ for (i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum);
+ variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+}
+
+static INLINE void variance64_sse2(const uint8_t *src_ptr, const int src_stride,
+ const uint8_t *ref_ptr, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ int i;
+
+ assert(h <= 16); // May overflow for larger height.
+ // Don't initialize sse here since it's an accumulation.
+ *sum = _mm_setzero_si128();
+
+ for (i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum);
+ variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum);
+ variance16_kernel_sse2(src_ptr + 32, ref_ptr + 32, sse, sum);
+ variance16_kernel_sse2(src_ptr + 48, ref_ptr + 48, sse, sum);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+}
+
+void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ __m128i vsse, vsum;
+ variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, sum);
+}
+
+void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ __m128i vsse, vsum;
+ variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_256_pel_sse2(vsse, vsum, sse, sum);
+}
+
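+// In each function below the final shift is log2(w * h), implementing
+// variance = sse - sum * sum / (w * h).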
+unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 4);
+}
+
+unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 6);
+}
+
+unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_256_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
+}
+
+unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse, vsum;
+ int sum;
+ variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+ variance_final_512_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse = _mm_setzero_si128();
+ __m128i vsum;
+ int sum;
+ variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_512_pel_sse2(vsse, vsum, sse, &sum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse = _mm_setzero_si128();
+ __m128i vsum;
+ int sum;
+ variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+ *sse = add32x4_sse2(vsse);
+ sum = sum_final_sse2(vsum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
+}
+
+unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ int sum;
+ int i = 0;
+
+ for (i = 0; i < 2; i++) {
+ __m128i vsum16;
+ variance32_sse2(src_ptr + 32 * i * src_stride, src_stride,
+ ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse,
+ &vsum16);
+ vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
+ }
+ *sse = add32x4_sse2(vsse);
+ sum = (int)add32x4_sse2(vsum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ int sum;
+ int i = 0;
+
+ for (i = 0; i < 2; i++) {
+ __m128i vsum16;
+ variance64_sse2(src_ptr + 16 * i * src_stride, src_stride,
+ ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse,
+ &vsum16);
+ vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
+ }
+ *sse = add32x4_sse2(vsse);
+ sum = (int)add32x4_sse2(vsum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m128i vsse = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ int sum;
+ int i = 0;
+
+ for (i = 0; i < 4; i++) {
+ __m128i vsum16;
+ variance64_sse2(src_ptr + 16 * i * src_stride, src_stride,
+ ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse,
+ &vsum16);
+ vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
+ }
+ *sse = add32x4_sse2(vsse);
+ sum = (int)add32x4_sse2(vsum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
+}
+
+unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ vpx_variance8x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ vpx_variance8x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ vpx_variance16x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ vpx_variance16x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
+ return *sse;
+}
+
+// The 2 unused parameters are placeholders for PIC-enabled builds.
+// These declarations are for functions defined in subpel_variance.asm.
+#define DECL(w, opt) \
+ int vpx_sub_pixel_variance##w##xh_##opt( \
+ const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \
+ int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, int height, \
+ unsigned int *sse, void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+ DECL(4, opt1); \
+ DECL(8, opt1); \
+ DECL(16, opt1)
+
+DECLS(sse2, sse2);
+DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
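+// FN() stitches blocks wider than the 16-pixel asm kernel out of 16-wide
+// columns, summing each column's se/sse contribution before forming the
+// variance as sse - ((se * se) >> (wlog2 + hlog2)). The cast_prod/cast
+// arguments widen the product where it can overflow 32 bits; e.g. for 64x64,
+// |se| can reach 64 * 64 * 255 = 1044480, whose square needs 64-bit storage
+// before the shift by 12.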
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { \
+ unsigned int sse_tmp; \
+ int se = vpx_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h, \
+ &sse_tmp, NULL, NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \
+ ref_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse_tmp += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \
+ ref_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse_tmp += sse2; \
+ se2 = vpx_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \
+ ref_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse_tmp += sse2; \
+ } \
+ } \
+ *sse = sse_tmp; \
+ return sse_tmp - \
+ (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+#define FNS(opt1, opt2) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
+
+FNS(sse2, sse2)
+FNS(ssse3, ssse3)
+
+#undef FNS
+#undef FN
+
+// The 2 unused parameters are placeholders for PIC-enabled builds.
+#define DECL(w, opt) \
+ int vpx_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \
+ int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, \
+ const uint8_t *second_pred, ptrdiff_t second_stride, int height, \
+ unsigned int *sse, void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+ DECL(4, opt1); \
+ DECL(8, opt1); \
+ DECL(16, opt1)
+
+DECLS(sse2, sse2);
+DECLS(ssse3, ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ unsigned int sse_tmp; \
+ int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, \
+ second_pred, w, h, &sse_tmp, NULL, NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \
+ ref_stride, second_pred + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse_tmp += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \
+ ref_stride, second_pred + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse_tmp += sse2; \
+ se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \
+ ref_stride, second_pred + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse_tmp += sse2; \
+ } \
+ } \
+ *sse = sse_tmp; \
+ return sse_tmp - \
+ (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+#define FNS(opt1, opt2) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
+
+FNS(sse2, sse)
+FNS(ssse3, ssse3)
+
+#undef FNS
+#undef FN
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
new file mode 100644
index 0000000000..3f444e2e6a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -0,0 +1,226 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro convolve_fn 1-2
+%ifidn %1, avg
+%define AUX_XMM_REGS 4
+%else
+%define AUX_XMM_REGS 0
+%endif
+%ifidn %2, highbd
+%define pavg pavgw
+cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
+ dst, dst_stride, \
+ f, fxo, fxs, fyo, fys, w, h, bd
+%else
+%define pavg pavgb
+cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
+ dst, dst_stride, \
+ f, fxo, fxs, fyo, fys, w, h
+%endif
+ mov r4d, dword wm
+%ifidn %2, highbd
+ shl r4d, 1
+ shl src_strideq, 1
+ shl dst_strideq, 1
+%else
+ cmp r4d, 4
+ je .w4
+%endif
+ cmp r4d, 8
+ je .w8
+ cmp r4d, 16
+ je .w16
+ cmp r4d, 32
+ je .w32
+%ifidn %2, highbd
+ cmp r4d, 64
+ je .w64
+
+ mov r4d, dword hm
+.loop128:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ add dstq, dst_strideq
+ dec r4d
+ jnz .loop128
+ RET
+%endif
+
+.w64:
+ mov r4d, dword hm
+.loop64:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ add dstq, dst_strideq
+ dec r4d
+ jnz .loop64
+ RET
+
+.w32:
+ mov r4d, dword hm
+.loop32:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+src_strideq]
+ movu m3, [srcq+src_strideq+16]
+ lea srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq +16]
+ pavg m2, [dstq+dst_strideq]
+ pavg m3, [dstq+dst_strideq+16]
+%endif
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+dst_strideq ], m2
+ mova [dstq+dst_strideq+16], m3
+ lea dstq, [dstq+dst_strideq*2]
+ sub r4d, 2
+ jnz .loop32
+ RET
+
+.w16:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop16:
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+dst_strideq]
+ pavg m2, [dstq+dst_strideq*2]
+ pavg m3, [dstq+r6q]
+%endif
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop16
+ RET
+
+.w8:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop8:
+ movh m0, [srcq]
+ movh m1, [srcq+src_strideq]
+ movh m2, [srcq+src_strideq*2]
+ movh m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ movh m4, [dstq]
+ movh m5, [dstq+dst_strideq]
+ movh m6, [dstq+dst_strideq*2]
+ movh m7, [dstq+r6q]
+ pavg m0, m4
+ pavg m1, m5
+ pavg m2, m6
+ pavg m3, m7
+%endif
+ movh [dstq ], m0
+ movh [dstq+dst_strideq ], m1
+ movh [dstq+dst_strideq*2], m2
+ movh [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop8
+ RET
+
+%ifnidn %2, highbd
+.w4:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop4:
+ movd m0, [srcq]
+ movd m1, [srcq+src_strideq]
+ movd m2, [srcq+src_strideq*2]
+ movd m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ movd m4, [dstq]
+ movd m5, [dstq+dst_strideq]
+ movd m6, [dstq+dst_strideq*2]
+ movd m7, [dstq+r6q]
+ pavg m0, m4
+ pavg m1, m5
+ pavg m2, m6
+ pavg m3, m7
+%endif
+ movd [dstq ], m0
+ movd [dstq+dst_strideq ], m1
+ movd [dstq+dst_strideq*2], m2
+ movd [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop4
+ RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+convolve_fn copy
+convolve_fn avg
+%if CONFIG_VP9_HIGHBITDEPTH
+convolve_fn copy, highbd
+convolve_fn avg, highbd
+%endif
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
new file mode 100644
index 0000000000..fc301fb39e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
@@ -0,0 +1,964 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after the other taps to
+;avoid overflow.
+
+%macro HIGH_GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+ punpcklwd xmm1, xmm7
+
+ movdqa k0k6, xmm0
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+ movdqa k1k7, xmm1
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bd
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
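+ ;For example, with bd = 10 the sequence above leaves (1 << 10) - 1 = 1023 in
+ ;every word of max and zero in every word of min.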
+
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+ punpcklwd xmm0, xmm6 ;two rows in one register
+ punpcklwd xmm1, xmm7
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+
+ pmaddwd xmm0, k0k6 ;multiply the filter factors
+ pmaddwd xmm1, k1k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm3, k3k4
+
+ paddd xmm0, xmm1 ;sum
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+
+ paddd xmm0, krd ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movq [rdi], xmm0
+%endm
+
+%macro HIGH_GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm6, xmm7
+ punpckhwd xmm2, xmm5
+ punpckhwd xmm3, xmm4
+
+ movdqa k0k1, xmm0 ;store filter factors on stack
+ movdqa k6k7, xmm6
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bd
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+%endm
+
+%macro LOAD_VERT_8 1
+ movdqu xmm0, [rsi + %1] ;0
+ movdqu xmm1, [rsi + rax + %1] ;1
+ movdqu xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movdqu xmm7, [rsi + rdx * 2 + %1] ;7
+ movdqu xmm2, [rsi + rax + %1] ;2
+ movdqu xmm3, [rsi + rax * 2 + %1] ;3
+ movdqu xmm4, [rsi + rdx + %1] ;4
+ movdqu xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro HIGH_APPLY_FILTER_8 2
+ movdqu temp, xmm4
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm1, xmm6
+ punpcklwd xmm6, xmm7
+ punpckhwd xmm1, xmm7
+ movdqa xmm7, xmm2
+ punpcklwd xmm2, xmm5
+ punpckhwd xmm7, xmm5
+
+ movdqu xmm5, temp
+ movdqu temp, xmm4
+ movdqa xmm4, xmm3
+ punpcklwd xmm3, xmm5
+ punpckhwd xmm4, xmm5
+ movdqu xmm5, temp
+
+ pmaddwd xmm0, k0k1
+ pmaddwd xmm5, k0k1
+ pmaddwd xmm6, k6k7
+ pmaddwd xmm1, k6k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm7, k2k5
+ pmaddwd xmm3, k3k4
+ pmaddwd xmm4, k3k4
+
+ paddd xmm0, xmm6
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+ paddd xmm5, xmm1
+ paddd xmm5, xmm7
+ paddd xmm5, xmm4
+
+ paddd xmm0, krd ;rounding
+ paddd xmm5, krd
+ psrad xmm0, 7 ;shift
+ psrad xmm5, 7
+ packssdw xmm0, xmm5 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movdqu xmm1, [rdi + %2]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi + %2], xmm0
+%endm
+
+SECTION .text
+
+;void vpx_highbd_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter,
+; int bd
+;)
+globalsym(vpx_highbd_filter_block1d4_v8_sse2)
+sym(vpx_highbd_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movq xmm0, [rsi] ;load src: row 0
+ movq xmm1, [rsi + rax] ;1
+ movq xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2] ;7
+ movq xmm2, [rsi + rax] ;2
+ movq xmm3, [rsi + rax * 2] ;3
+ movq xmm4, [rsi + rdx] ;4
+ movq xmm5, [rsi + rax * 4] ;5
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_highbd_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter,
+; int bd
+;)
+globalsym(vpx_highbd_filter_block1d8_v8_sse2)
+sym(vpx_highbd_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_highbd_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter,
+; int bd
+;)
+globalsym(vpx_highbd_filter_block1d16_v8_sse2)
+sym(vpx_highbd_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 16
+ HIGH_APPLY_FILTER_8 0, 16
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d4_v8_avg_sse2)
+sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movq xmm0, [rsi] ;load src: row 0
+ movq xmm1, [rsi + rax] ;1
+ movq xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2] ;7
+ movq xmm2, [rsi + rax] ;2
+ movq xmm3, [rsi + rax * 2] ;3
+ movq xmm4, [rsi + rdx] ;4
+ movq xmm5, [rsi + rax * 4] ;5
+
+ HIGH_APPLY_FILTER_4 1
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d8_v8_avg_sse2)
+sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 1, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d16_v8_avg_sse2)
+sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 1, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 16
+ HIGH_APPLY_FILTER_8 1, 16
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_highbd_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter,
+; int bd
+;)
+globalsym(vpx_highbd_filter_block1d4_h8_sse2)
+sym(vpx_highbd_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm4, [rsi + 2]
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+
+ psrldq xmm1, 2
+ psrldq xmm6, 4
+ psrldq xmm7, 6
+ psrldq xmm2, 4
+ psrldq xmm3, 6
+ psrldq xmm5, 2
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_highbd_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter,
+; int bd
+;)
+globalsym(vpx_highbd_filter_block1d8_h8_sse2)
+sym(vpx_highbd_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_highbd_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter,
+; int bd
+;)
+globalsym(vpx_highbd_filter_block1d16_h8_sse2)
+sym(vpx_highbd_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 10] ;load src
+ movdqu xmm1, [rsi + 12]
+ movdqu xmm2, [rsi + 14]
+ movdqu xmm3, [rsi + 16]
+ movdqu xmm4, [rsi + 18]
+ movdqu xmm5, [rsi + 20]
+ movdqu xmm6, [rsi + 22]
+ movdqu xmm7, [rsi + 24]
+
+ HIGH_APPLY_FILTER_8 0, 16
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d4_h8_avg_sse2)
+sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm4, [rsi + 2]
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+
+ psrldq xmm1, 2
+ psrldq xmm6, 4
+ psrldq xmm7, 6
+ psrldq xmm2, 4
+ psrldq xmm3, 6
+ psrldq xmm5, 2
+
+ HIGH_APPLY_FILTER_4 1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d8_h8_avg_sse2)
+sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 1, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d16_h8_avg_sse2)
+sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 1, 0
+
+ movdqu xmm0, [rsi + 10] ;load src
+ movdqu xmm1, [rsi + 12]
+ movdqu xmm2, [rsi + 14]
+ movdqu xmm3, [rsi + 16]
+ movdqu xmm4, [rsi + 18]
+ movdqu xmm5, [rsi + 20]
+ movdqu xmm6, [rsi + 22]
+ movdqu xmm7, [rsi + 24]
+
+ HIGH_APPLY_FILTER_8 1, 16
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000000..bd51c75bcb
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
@@ -0,0 +1,496 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro HIGH_GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklwd xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bd
+ movq xmm5, rdx
+ movq xmm2, rcx
+ pshufd xmm5, xmm5, 0b
+ movdqa xmm1, xmm5
+ psllw xmm5, xmm2
+ psubw xmm5, xmm1 ;max value (for clamping)
+ pxor xmm2, xmm2 ;min value (for clamping)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+
+ punpcklwd xmm0, xmm1 ;two rows in one register
+ pmaddwd xmm0, xmm4 ;multiply the filter factors
+
+ paddd xmm0, xmm3 ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, xmm5
+ pmaxsw xmm0, xmm2
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+
+ movq [rdi], xmm0
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+%if VPX_ARCH_X86_64
+%macro HIGH_GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm6, [rdx] ;load filters
+
+ pshuflw xmm7, xmm6, 11111111b ;k3
+ pshufhw xmm6, xmm6, 0b ;k4
+ psrldq xmm6, 8
+ punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bd
+ movq xmm8, rdx
+ movq xmm5, rcx
+ pshufd xmm8, xmm8, 0b
+ movdqa xmm1, xmm8
+ psllw xmm8, xmm5
+ psubw xmm8, xmm1 ;max value (for clamping)
+ pxor xmm5, xmm5 ;min value (for clamping)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_8 1
+ movdqa xmm6, xmm0
+ punpckhwd xmm6, xmm1
+ punpcklwd xmm0, xmm1
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+
+ paddd xmm6, xmm4 ;rounding
+ paddd xmm0, xmm4 ;rounding
+ psrad xmm6, 7 ;shift
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, xmm8
+ pmaxsw xmm0, xmm5
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+%macro HIGH_APPLY_FILTER_16 1
+ movdqa xmm9, xmm0
+ movdqa xmm6, xmm2
+ punpckhwd xmm9, xmm1
+ punpckhwd xmm6, xmm3
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+
+ pmaddwd xmm9, xmm7
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+ pmaddwd xmm2, xmm7
+
+ paddd xmm9, xmm4 ;rounding
+ paddd xmm6, xmm4
+ paddd xmm0, xmm4
+ paddd xmm2, xmm4
+
+ psrad xmm9, 7 ;shift
+ psrad xmm6, 7
+ psrad xmm0, 7
+ psrad xmm2, 7
+
+ packssdw xmm0, xmm9 ;pack back to word
+ packssdw xmm2, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, xmm8
+ pmaxsw xmm0, xmm5
+ pminsw xmm2, xmm8
+ pmaxsw xmm2, xmm5
+
+%if %1
+ movdqu xmm1, [rdi]
+ movdqu xmm3, [rdi + 16]
+ pavgw xmm0, xmm1
+ pavgw xmm2, xmm3
+%endif
+ movdqu [rdi], xmm0 ;store the result
+ movdqu [rdi + 16], xmm2 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+%endif
+
+SECTION .text
+
+globalsym(vpx_highbd_filter_block1d4_v2_sse2)
+sym(vpx_highbd_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movq xmm0, [rsi] ;load src
+ movq xmm1, [rsi + 2*rax]
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if VPX_ARCH_X86_64
+globalsym(vpx_highbd_filter_block1d8_v2_sse2)
+sym(vpx_highbd_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d16_v2_sse2)
+sym(vpx_highbd_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm1, [rsi + 2*rax] ;1
+ movdqu xmm3, [rsi + 2*rax + 16]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+globalsym(vpx_highbd_filter_block1d4_v2_avg_sse2)
+sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movq xmm0, [rsi] ;load src
+ movq xmm1, [rsi + 2*rax]
+
+ HIGH_APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if VPX_ARCH_X86_64
+globalsym(vpx_highbd_filter_block1d8_v2_avg_sse2)
+sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+
+ HIGH_APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d16_v2_avg_sse2)
+sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 2*rax + 16]
+
+ HIGH_APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+globalsym(vpx_highbd_filter_block1d4_h2_sse2)
+sym(vpx_highbd_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 2
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if VPX_ARCH_X86_64
+globalsym(vpx_highbd_filter_block1d8_h2_sse2)
+sym(vpx_highbd_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d16_h2_sse2)
+sym(vpx_highbd_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 18]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+globalsym(vpx_highbd_filter_block1d4_h2_avg_sse2)
+sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 2
+
+ HIGH_APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if VPX_ARCH_X86_64
+globalsym(vpx_highbd_filter_block1d8_h2_avg_sse2)
+sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+
+ HIGH_APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_highbd_filter_block1d16_h2_avg_sse2)
+sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 18]
+
+ HIGH_APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
new file mode 100644
index 0000000000..21a35ae3c3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -0,0 +1,1161 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
+#include "vpx_ports/mem.h"
+
+#define CONV8_ROUNDING_BITS (7)
+#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
+
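+// The 4-tap kernels below use only the middle coefficients k[2]..k[5] of the
+// 8-tap filter array. The kernel is pre-shifted right by 1 before use
+// (presumably to keep the 16-bit intermediate sums from saturating), and the
+// results are then rounded with a bias of 32 and a shift of 6, which together
+// amount to the usual divide by 1 << CONV8_ROUNDING_BITS.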
+static void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ int h;
+
+ __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+ __m128i dst_first, dst_second;
+ __m128i even, odd;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ for (h = height; h > 0; --h) {
+ // We will load multiple shifted versions of the row and shuffle them into
+ // 16-bit words of the form
+ // ... s[2] s[1] s[0] s[-1]
+ // ... s[4] s[3] s[2] s[1]
+ // Then we call multiply and add to get partial results
+ // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
+ // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
+ // The two results are then added together for the first half of even
+ // output.
+ // Repeat multiple times to get the whole output
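+ // In scalar terms each output pixel is approximately
+ // dst[i] = (s[i-1]*k[2] + s[i]*k[3] + s[i+1]*k[4] + s[i+2]*k[5] + 64) >> 7,
+ // up to the low bit lost when the kernel is halved above.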
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+ src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+ src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+ // Output 6 4 2 0
+ even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+ &kernel_reg_45);
+
+ // Output 7 5 3 1
+ odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Combine to get the first half of the dst
+ dst_first = mm_zip_epi32_sse2(&even, &odd);
+
+ // Do again to get the second half of dst
+ src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+ src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+ src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+ src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+ // Output 14 12 10 8
+ even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+ &kernel_reg_45);
+
+ // Output 15 13 11 9
+ odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Combine to get the second half of the dst
+ dst_second = mm_zip_epi32_sse2(&even, &odd);
+
+ // Round each result
+ dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+ dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);
+
+ // Finally combine to get the final dst
+ dst_first = _mm_packus_epi16(dst_first, dst_second);
+ _mm_store_si128((__m128i *)dst_ptr, dst_first);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+/* The macro used to generate functions shifts the src_ptr up by 3 rows
+ * already. */
+
+static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi;
+ __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi;
+ // Half of half of the interleaved rows
+ __m128i src_reg_m10_lo_1, src_reg_m10_lo_2, src_reg_m10_hi_1,
+ src_reg_m10_hi_2;
+ __m128i src_reg_01_lo_1, src_reg_01_lo_2, src_reg_01_hi_1, src_reg_01_hi_2;
+ __m128i src_reg_12_lo_1, src_reg_12_lo_2, src_reg_12_hi_1, src_reg_12_hi_2;
+ __m128i src_reg_23_lo_1, src_reg_23_lo_2, src_reg_23_hi_1, src_reg_23_hi_2;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+ __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+ __m128i res_reg_m1012, res_reg_0123;
+ __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;
+
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
+ // words, and shuffle the data into the form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+ // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+ // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+ // so that we can call multiply and add with the kernel to get 32-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
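+ // Concretely, output row r is computed per pixel as approximately
+ // (s[r-1]*k[2] + s[r]*k[3] + s[r+1]*k[4] + s[r+2]*k[5] + 64) >> 7; the loop
+ // below produces rows 0 and 1 together and then reuses the row 1/2 and 2/3
+ // interleaves as the next iteration's rows -1/0 and 0/1.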
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+ src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0);
+ src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
+ src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128());
+ src_reg_m10_hi_1 = _mm_unpacklo_epi8(src_reg_m10_hi, _mm_setzero_si128());
+ src_reg_m10_hi_2 = _mm_unpackhi_epi8(src_reg_m10_hi, _mm_setzero_si128());
+
+ // More shuffling
+ src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+ src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);
+ src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
+ src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128());
+ src_reg_01_hi_1 = _mm_unpacklo_epi8(src_reg_01_hi, _mm_setzero_si128());
+ src_reg_01_hi_2 = _mm_unpackhi_epi8(src_reg_01_hi, _mm_setzero_si128());
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+ src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+ src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);
+
+ // Partial output from first half
+ res_reg_m10_lo = mm_madd_packs_epi16_sse2(
+ &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);
+
+ res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2,
+ &kernel_reg_23);
+
+ src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
+ src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
+ res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2,
+ &kernel_reg_45);
+
+ src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
+ src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
+ res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2,
+ &kernel_reg_45);
+
+ // Add to get first half of the results
+ res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+ res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+ // Now repeat everything again for the second half
+ // Partial output for second half
+ res_reg_m10_hi = mm_madd_packs_epi16_sse2(
+ &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23);
+
+ res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1, &src_reg_01_hi_2,
+ &kernel_reg_23);
+
+ src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128());
+ src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128());
+ res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1, &src_reg_12_hi_2,
+ &kernel_reg_45);
+
+ src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128());
+ src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128());
+ res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1, &src_reg_23_hi_2,
+ &kernel_reg_45);
+
+ // Second half of the results
+ res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
+ res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);
+
+ // Round the words
+ res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+ res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+ res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
+ res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
+ res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi);
+
+ _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+ _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10_lo_1 = src_reg_12_lo_1;
+ src_reg_m10_lo_2 = src_reg_12_lo_2;
+ src_reg_m10_hi_1 = src_reg_12_hi_1;
+ src_reg_m10_hi_2 = src_reg_12_hi_2;
+ src_reg_01_lo_1 = src_reg_23_lo_1;
+ src_reg_01_lo_2 = src_reg_23_lo_2;
+ src_reg_01_hi_1 = src_reg_23_hi_1;
+ src_reg_01_hi_2 = src_reg_23_hi_2;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ int h;
+
+ __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+ __m128i dst_first;
+ __m128i even, odd;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ for (h = height; h > 0; --h) {
+ // We will load multiple shifted versions of the row and shuffle them into
+ // 16-bit words of the form
+ // ... s[2] s[1] s[0] s[-1]
+ // ... s[4] s[3] s[2] s[1]
+ // Then we call multiply and add to get partial results
+ // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
+ // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
+ // The two results are then added together to get the even output
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+ src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+ src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+ // Output 6 4 2 0
+ even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+ &kernel_reg_45);
+
+ // Output 7 5 3 1
+ odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Combine to get the first half of the dst
+ dst_first = mm_zip_epi32_sse2(&even, &odd);
+ dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+ // Saturate and convert to 8-bit words
+ dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+
+ _mm_storel_epi64((__m128i *)dst_ptr, dst_first);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10_lo, src_reg_01_lo;
+ __m128i src_reg_12_lo, src_reg_23_lo;
+ // Half of half of the interleaved rows
+ __m128i src_reg_m10_lo_1, src_reg_m10_lo_2;
+ __m128i src_reg_01_lo_1, src_reg_01_lo_2;
+ __m128i src_reg_12_lo_1, src_reg_12_lo_2;
+ __m128i src_reg_23_lo_1, src_reg_23_lo_2;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+ __m128i res_reg_m1012, res_reg_0123;
+ __m128i res_reg_m1012_lo, res_reg_0123_lo;
+
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
+ // words, and shuffle the data into the form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+ // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+ // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+ // so that we can call multiply and add with the kernel to get 32-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+ src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
+ src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128());
+
+ // More shuffling
+ src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+ src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
+ src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128());
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+
+ // Partial output
+ res_reg_m10_lo = mm_madd_packs_epi16_sse2(
+ &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);
+
+ res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2,
+ &kernel_reg_23);
+
+ src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
+ src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
+ res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2,
+ &kernel_reg_45);
+
+ src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
+ src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
+ res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2,
+ &kernel_reg_45);
+
+ // Add to get results
+ res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+ res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+ // Round the words
+ res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+ res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+
+ // Convert to 8-bit words
+ res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128());
+ res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, _mm_setzero_si128());
+
+ // Save only half of the register (8 bytes)
+ _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
+ _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10_lo_1 = src_reg_12_lo_1;
+ src_reg_m10_lo_2 = src_reg_12_lo_2;
+ src_reg_01_lo_1 = src_reg_23_lo_1;
+ src_reg_01_lo_2 = src_reg_23_lo_2;
+ src_reg_1 = src_reg_3;
+ }
+}
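+
+// For reference, the 4-tap vertical SSE2 paths compute, per column x and
+// output row y (clamp8 denotes the packus saturation to [0, 255], and the
+// kernel is halved exactly as in the horizontal paths):
+//   dst[y][x] = clamp8(((k[2] >> 1) * src[y - 1][x] +
+//                       (k[3] >> 1) * src[y][x] +
+//                       (k[4] >> 1) * src[y + 1][x] +
+//                       (k[5] >> 1) * src[y + 2][x] + 32) >> 6)
+// Rows y and y + 1 are produced per loop iteration, which is why the
+// interleaved row pairs are rotated by two at the bottom of the loop.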
+
+static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ int h;
+
+ __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+ __m128i dst_first;
+ __m128i tmp_0, tmp_1;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ for (h = height; h > 0; --h) {
+ // We will load multiple shifted versions of the row and shuffle them into
+ // 16-bit words of the form
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+ // Then we call multiply and add to get partial results
+ // s[1]k[3]+s[0]k[2] s[0]k[3]+s[-1]k[2]
+ // s[3]k[5]+s[2]k[4] s[2]k[5]+s[1]k[4]
+ // The two results are then added together to get the output
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+ src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+ src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+ // Convert to 16-bit words
+ src_reg = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128());
+ src_reg_shift_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128());
+ src_reg_shift_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128());
+ src_reg_shift_3 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128());
+
+ // Shuffle into the right format
+ tmp_0 = _mm_unpacklo_epi32(src_reg, src_reg_shift_1);
+ tmp_1 = _mm_unpacklo_epi32(src_reg_shift_2, src_reg_shift_3);
+
+ // Partial output
+ tmp_0 = _mm_madd_epi16(tmp_0, kernel_reg_23);
+ tmp_1 = _mm_madd_epi16(tmp_1, kernel_reg_45);
+
+ // Output
+ dst_first = _mm_add_epi32(tmp_0, tmp_1);
+ dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128());
+
+ dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+ // Saturate and convert to 8-bit words
+ dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10_lo, src_reg_01_lo;
+ __m128i src_reg_12_lo, src_reg_23_lo;
+ // Low halves of the interleaved rows, zero-extended to 16-bit words
+ __m128i src_reg_m10_lo_1;
+ __m128i src_reg_01_lo_1;
+ __m128i src_reg_12_lo_1;
+ __m128i src_reg_23_lo_1;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+ __m128i res_reg_m1012, res_reg_0123;
+ __m128i res_reg_m1012_lo, res_reg_0123_lo;
+
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ const __m128i reg_zero = _mm_setzero_si128();
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
+ // words, and shuffle the data into the form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+ // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+ // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+ // so that we can call multiply and add with the kernel to get 32-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+ src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
+
+ // More shuffling
+ src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+ src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+
+ // Partial output
+ res_reg_m10_lo =
+ mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, &reg_zero, &kernel_reg_23);
+
+ res_reg_01_lo =
+ mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &reg_zero, &kernel_reg_23);
+
+ src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
+ res_reg_12_lo =
+ mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &reg_zero, &kernel_reg_45);
+
+ src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
+ res_reg_23_lo =
+ mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &reg_zero, &kernel_reg_45);
+
+ // Add to get results
+ res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+ res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+ // Round the words
+ res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+ res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+
+ // Convert to 8-bit words
+ res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero);
+ res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero);
+
+ // Save only 4 bytes (one row of 4 pixels) from each register
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
+ *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10_lo_1 = src_reg_12_lo_1;
+ src_reg_01_lo_1 = src_reg_23_lo_1;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
+static void vpx_highbd_filter_block1d4_h4_sse2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will load multiple shifted versions of the row and shuffle them into
+ // 16-bit words of the form
+ // ... s[2] s[1] s[0] s[-1]
+ // ... s[4] s[3] s[2] s[1]
+ // Then we call multiply and add to get partial results
+ // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
+ // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
+ // The two results are then added together to get the even output
+
+ __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+ __m128i res_reg;
+ __m128i even, odd;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_round =
+ _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i reg_zero = _mm_setzero_si128();
+ int h;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ for (h = height; h > 0; --h) {
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shift_1 = _mm_srli_si128(src_reg, 2);
+ src_reg_shift_2 = _mm_srli_si128(src_reg, 4);
+ src_reg_shift_3 = _mm_srli_si128(src_reg, 6);
+
+ // Output 2 0
+ even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+ &kernel_reg_45);
+
+ // Output 3 1
+ odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Combine even and odd to get the whole dst row
+ res_reg = _mm_unpacklo_epi32(even, odd);
+ res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg = _mm_packs_epi32(res_reg, reg_zero);
+
+ // Saturate the result and save
+ res_reg = _mm_min_epi16(res_reg, reg_max);
+ res_reg = _mm_max_epi16(res_reg, reg_zero);
+ _mm_storel_epi64((__m128i *)dst_ptr, res_reg);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
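+
+// For reference, a minimal scalar sketch of the high bit depth 4-tap
+// arithmetic: the kernel is not halved here, the 32-bit sums are rounded with
+// CONV8_ROUNDING_NUM / CONV8_ROUNDING_BITS, and the result is clamped to
+// [0, (1 << bd) - 1]. The helper below is illustrative only and is not called
+// anywhere.
+#if 0
+static uint16_t highbd_filter4_scalar_sketch(const uint16_t *src, int x,
+ const int16_t *kernel, int bd) {
+ const int max = (1 << bd) - 1;
+ int k, sum = 0;
+ // Illustrative scalar model of the SIMD path above.
+ for (k = 2; k < 6; ++k) sum += kernel[k] * src[x + k - 3];
+ sum = (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS;
+ if (sum < 0) sum = 0;
+ if (sum > max) sum = max;
+ return (uint16_t)sum;
+}
+#endif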
+
+static void vpx_highbd_filter_block1d4_v4_sse2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will load two rows of pixels as 16-bit words, and shuffle them into the
+ // form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+ // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+ // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+ // so that we can call multiply and add with the kernel to get 32-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
+
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10, src_reg_01;
+ __m128i src_reg_12, src_reg_23;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
+ __m128i res_reg_m1012, res_reg_0123;
+
+ const __m128i reg_round =
+ _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i reg_zero = _mm_setzero_si128();
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
+
+ // More shuffling
+ src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
+
+ // Partial output
+ res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23);
+ res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23);
+ res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45);
+ res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45);
+
+ // Add to get results
+ res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12);
+ res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23);
+
+ // Round the words
+ res_reg_m1012 =
+ mm_round_epi32_sse2(&res_reg_m1012, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg_0123 =
+ mm_round_epi32_sse2(&res_reg_0123, &reg_round, CONV8_ROUNDING_BITS);
+
+ res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero);
+ res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero);
+
+ // Saturate according to bit depth
+ res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
+ res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
+ res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
+ res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);
+
+ // Save only half of the register (4 words)
+ _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
+ _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10 = src_reg_12;
+ src_reg_01 = src_reg_23;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_highbd_filter_block1d8_h4_sse2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will load multiple shifted versions of the row and shuffle them into
+ // 16-bit words of the form
+ // ... s[2] s[1] s[0] s[-1]
+ // ... s[4] s[3] s[2] s[1]
+ // Then we call multiply and add to get partial results
+ // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
+ // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
+ // The two results are then added together to get the even-indexed outputs.
+ // Repeat with the odd-sample shifts to get the whole output.
+
+ __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2,
+ src_reg_shift_3;
+ __m128i res_reg;
+ __m128i even, odd;
+ __m128i tmp_0, tmp_1;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_round =
+ _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i reg_zero = _mm_setzero_si128();
+ int h;
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ for (h = height; h > 0; --h) {
+ // Load two overlapping registers; after the unpacklo_epi64 splices below,
+ // the low half of each source register feeds outputs 0-3 and the high half
+ // feeds outputs 4-7
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+ // Output 6 4 2 0
+ tmp_0 = _mm_srli_si128(src_reg, 4);
+ tmp_1 = _mm_srli_si128(src_reg_next, 2);
+ src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+ even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+ &kernel_reg_45);
+
+ // Output 7 5 3 1
+ tmp_0 = _mm_srli_si128(src_reg, 2);
+ tmp_1 = src_reg_next;
+ src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+
+ tmp_0 = _mm_srli_si128(src_reg, 6);
+ tmp_1 = _mm_srli_si128(src_reg_next, 4);
+ src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+
+ odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
+ &kernel_reg_23, &kernel_reg_45);
+
+ // Combine even and odd to get the whole dst row
+ even = mm_round_epi32_sse2(&even, &reg_round, CONV8_ROUNDING_BITS);
+ odd = mm_round_epi32_sse2(&odd, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg = mm_zip_epi32_sse2(&even, &odd);
+
+ // Saturate the result and save
+ res_reg = _mm_min_epi16(res_reg, reg_max);
+ res_reg = _mm_max_epi16(res_reg, reg_zero);
+
+ _mm_store_si128((__m128i *)dst_ptr, res_reg);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void vpx_highbd_filter_block1d8_v4_sse2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ // We will load two rows of pixels as 16-bit words, and shuffle them into the
+ // form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+ // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+ // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+ // so that we can call multiply and add with the kernel to get 32-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
+
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi;
+ __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi;
+
+ // Result after multiply and add
+ __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+ __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+ __m128i res_reg_m1012, res_reg_0123;
+ __m128i res_reg_m1012_lo, res_reg_0123_lo;
+ __m128i res_reg_m1012_hi, res_reg_0123_hi;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ const __m128i reg_round =
+ _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding
+ const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i reg_zero = _mm_setzero_si128();
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
+ src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0);
+
+ // More shuffling
+ src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
+ src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
+ src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
+ src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3);
+
+ // Partial output for first half
+ res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23);
+ res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23);
+ res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45);
+ res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45);
+
+ // Add to get results
+ res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo);
+ res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo);
+
+ // Round the words
+ res_reg_m1012_lo =
+ mm_round_epi32_sse2(&res_reg_m1012_lo, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg_0123_lo =
+ mm_round_epi32_sse2(&res_reg_0123_lo, &reg_round, CONV8_ROUNDING_BITS);
+
+ // Partial output for second half
+ res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23);
+ res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23);
+ res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45);
+ res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45);
+
+ // Add to get results
+ res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi);
+ res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi);
+
+ // Round the words
+ res_reg_m1012_hi =
+ mm_round_epi32_sse2(&res_reg_m1012_hi, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg_0123_hi =
+ mm_round_epi32_sse2(&res_reg_0123_hi, &reg_round, CONV8_ROUNDING_BITS);
+
+ // Combine the two halves
+ res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi);
+ res_reg_0123 = _mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi);
+
+ // Saturate according to bit depth
+ res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
+ res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
+ res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
+ res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);
+
+ // Save the whole register (8 words)
+ _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+ _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10_lo = src_reg_12_lo;
+ src_reg_m10_hi = src_reg_12_hi;
+ src_reg_01_lo = src_reg_23_lo;
+ src_reg_01_hi = src_reg_23_hi;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h4_sse2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, kernel, bd);
+ vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
+ dst_stride, height, kernel, bd);
+}
+
+static void vpx_highbd_filter_block1d16_v4_sse2(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+ vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, kernel, bd);
+ vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
+ dst_stride, height, kernel, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
+
+// From vpx_subpixel_8t_sse2.asm.
+filter8_1dfunction vpx_filter_block1d16_v8_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_sse2;
+filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;
+
+// Use the [vh]8 avg version because there is no [vh]4 avg implementation.
+#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2
+#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2
+#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2
+#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2
+#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2
+#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2
+
+// From vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm.
+filter8_1dfunction vpx_filter_block1d16_v2_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_sse2;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
+
+// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
+ sse2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1)
+
+// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, sse2, 0)
+FUN_CONV_2D(avg_, sse2, 1)
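+
+// A typical call into the wrappers generated above, assuming the caller
+// supplies a 16-entry InterpKernel table of 8-tap filters and unit steps
+// (x_step_q4 == y_step_q4 == 16, i.e. no scaling); x0_q4 and y0_q4 select the
+// 1/16-pel phase within that table:
+//
+//   vpx_convolve8_sse2(src, src_stride, dst, dst_stride, filter,
+//                      x0_q4, 16, y0_q4, 16, w, h);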
+
+#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
+// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
+
+// Use the [vh]8 avg version because there is no [vh]4 avg implementation.
+#define vpx_highbd_filter_block1d16_v4_avg_sse2 \
+ vpx_highbd_filter_block1d16_v8_avg_sse2
+#define vpx_highbd_filter_block1d16_h4_avg_sse2 \
+ vpx_highbd_filter_block1d16_h8_avg_sse2
+#define vpx_highbd_filter_block1d8_v4_avg_sse2 \
+ vpx_highbd_filter_block1d8_v8_avg_sse2
+#define vpx_highbd_filter_block1d8_h4_avg_sse2 \
+ vpx_highbd_filter_block1d8_h8_avg_sse2
+#define vpx_highbd_filter_block1d4_v4_avg_sse2 \
+ vpx_highbd_filter_block1d4_v8_avg_sse2
+#define vpx_highbd_filter_block1d4_h4_avg_sse2 \
+ vpx_highbd_filter_block1d4_h8_avg_sse2
+
+// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
+
+// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), , sse2, 0)
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1)
+
+// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h, int bd);
+HIGH_FUN_CONV_2D(, sse2, 0)
+HIGH_FUN_CONV_2D(avg_, sse2, 1)
+#endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
new file mode 100644
index 0000000000..2498bba173
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -0,0 +1,1458 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_avx2.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
+#include "vpx_ports/mem.h"
+
+// filters for 16_h8
+DECLARE_ALIGNED(32, static const uint8_t,
+ filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+ 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
+ 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
+ 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 };
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2,
+ 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9,
+ 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+};
+
+#define CALC_CONVOLVE8_HORZ_ROW \
+ srcReg = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); \
+ s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \
+ s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \
+ s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \
+ s1[3] = _mm256_shuffle_epi8(srcReg, filt[3]); \
+ s1[0] = convolve8_16_avx2(s1, f1); \
+ s1[0] = _mm256_packus_epi16(s1[0], s1[0]); \
+ src_ptr += src_stride; \
+ _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(s1[0])); \
+ output_ptr += output_pitch; \
+ _mm_storel_epi64((__m128i *)&output_ptr[0], \
+ _mm256_extractf128_si256(s1[0], 1)); \
+ output_ptr += output_pitch;
+
+// 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+ // 0 0 0 0 0 0 0 0 | 0 0 0 0 lo3 lo2 lo1 lo0
+ __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
+
+ // 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0
+ a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
+ return a;
+}
+
+static INLINE void vpx_filter_block1d16_h8_x_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
+ const int avg) {
+ __m128i outReg1, outReg2;
+ __m256i outReg32b1, outReg32b2;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ __m256i f[4], filt[4], s[4];
+
+ shuffle_filter_avx2(filter, f);
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ // multiply the size of the source and destination strides by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ __m256i srcReg;
+
+ // load the 2 strides of source
+ srcReg =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
+ srcReg = _mm256_inserti128_si256(
+ srcReg,
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
+ 1);
+
+ // filter the source buffer
+ s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+ s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+ s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+ s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
+ outReg32b1 = convolve8_16_avx2(s, f);
+
+ // read the next 16 bytes of both rows
+ // (part of them was already read by the earlier loads)
+ srcReg =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
+ srcReg = _mm256_inserti128_si256(
+ srcReg,
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
+ 1);
+
+ // filter the source buffer
+ s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+ s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+ s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+ s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
+ outReg32b2 = convolve8_16_avx2(s, f);
+
+ // pack each 16-bit value down to 8 bits; the low and high 64 bits of each
+ // lane hold the first and second convolve results respectively
+ outReg32b1 = _mm256_packus_epi16(outReg32b1, outReg32b2);
+
+ src_ptr += src_stride;
+
+ // average if necessary
+ outReg1 = _mm256_castsi256_si128(outReg32b1);
+ outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ outReg2 = _mm_avg_epu8(
+ outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch)));
+ }
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
+
+ // save the next 16 bytes
+ _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2);
+
+ output_ptr += dst_stride;
+ }
+
+ // if the number of rows is odd,
+ // process only the remaining row of 16 bytes
+ if (i > 0) {
+ __m128i srcReg;
+
+ // load the first 16 bytes of the last row
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ s[0] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
+ s[1] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
+ s[2] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
+ s[3] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
+ outReg1 = convolve8_8_avx2(s, f);
+
+ // read the next 16 bytes
+ // (part of it was already read by the earlier load)
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+ // filter the source buffer
+ s[0] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
+ s[1] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
+ s[2] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
+ s[3] = _mm256_castsi128_si256(
+ _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
+ outReg2 = convolve8_8_avx2(s, f);
+
+ // pack each 16-bit value down to 8 bits; the low and high 64 bits hold the
+ // first and second convolve results respectively
+ outReg1 = _mm_packus_epi16(outReg1, outReg2);
+
+ // average if necessary
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ }
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
+ }
+}
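+
+// For reference, ignoring the intermediate 16-bit saturation, the 8-tap
+// horizontal path above computes per output pixel x (clamp8 denotes the
+// packus saturation to [0, 255]):
+//   dst[x] = clamp8((f[0]*s[x-3] + f[1]*s[x-2] + ... + f[7]*s[x+4] + 64) >> 7)
+// with the +64 / >> 7 rounding applied inside convolve8_16_avx2() and
+// convolve8_8_avx2().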
+
+static void vpx_filter_block1d16_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
+ ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
+ vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride,
+ output_height, filter, 0);
+}
+
+static void vpx_filter_block1d16_h8_avg_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
+ ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
+ vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride,
+ output_height, filter, 1);
+}
+
+static void vpx_filter_block1d8_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m256i filt[4], f1[4], s1[4], srcReg;
+ __m128i f[4], s[4];
+ int y = output_height;
+
+ // Multiply the size of the source stride by two
+ const ptrdiff_t src_stride = src_pitch << 1;
+
+ shuffle_filter_avx2(filter, f1);
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ // Process 4 rows at a time
+ while (y > 3) {
+ CALC_CONVOLVE8_HORZ_ROW
+ CALC_CONVOLVE8_HORZ_ROW
+ y -= 4;
+ }
+
+ // If remaining, then process 2 rows at a time
+ while (y > 1) {
+ CALC_CONVOLVE8_HORZ_ROW
+ y -= 2;
+ }
+
+ // Process the last row if the height is odd.
+ if (y > 0) {
+ const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ f[0] = _mm256_castsi256_si128(f1[0]);
+ f[1] = _mm256_castsi256_si128(f1[1]);
+ f[2] = _mm256_castsi256_si128(f1[2]);
+ f[3] = _mm256_castsi256_si128(f1[3]);
+
+ // filter the source buffer
+ s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0]));
+ s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1]));
+ s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2]));
+ s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3]));
+ s[0] = convolve8_8_ssse3(s, f);
+
+ // Saturate 16bit value to 8bit.
+ s[0] = _mm_packus_epi16(s[0], s[0]);
+
+ // Save only 8 bytes
+ _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]);
+ }
+}
+
+static INLINE void vpx_filter_block1d16_v8_x_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter,
+ const int avg) {
+ __m128i outReg1, outReg2;
+ __m256i srcRegHead1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ __m256i f[4], s1[4], s2[4];
+
+ shuffle_filter_avx2(filter, f);
+
+ // multiply the size of the source and destination strides by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ {
+ __m128i s[6];
+ __m256i s32b[6];
+
+ // load 16 bytes from each of the first 7 rows (stride of src_pitch)
+ s[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch));
+ srcRegHead1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch)));
+
+ // keep every two consecutive loads in the same 256-bit register
+ s32b[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1);
+ s32b[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1);
+ s32b[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1);
+ s32b[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1);
+ s32b[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1);
+ s32b[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]),
+ _mm256_castsi256_si128(srcRegHead1), 1);
+
+ // merge every two consecutive registers except the last one
+ // the first lanes contain values for filtering odd rows (1,3,5...) and
+ // the second lanes contain values for filtering even rows (2,4,6...)
+ s1[0] = _mm256_unpacklo_epi8(s32b[0], s32b[1]);
+ s2[0] = _mm256_unpackhi_epi8(s32b[0], s32b[1]);
+ s1[1] = _mm256_unpacklo_epi8(s32b[2], s32b[3]);
+ s2[1] = _mm256_unpackhi_epi8(s32b[2], s32b[3]);
+ s1[2] = _mm256_unpacklo_epi8(s32b[4], s32b[5]);
+ s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]);
+ }
+
+ // The output_height is always a multiple of two.
+ assert(!(output_height & 1));
+
+ for (i = output_height; i > 1; i -= 2) {
+ __m256i srcRegHead2, srcRegHead3;
+
+ // load the next two rows of 16 bytes and keep every two
+ // consecutive rows in the same 256-bit register
+ srcRegHead2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_pitch)));
+ srcRegHead1 = _mm256_inserti128_si256(
+ srcRegHead1, _mm256_castsi256_si128(srcRegHead2), 1);
+ srcRegHead3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_pitch)));
+ srcRegHead2 = _mm256_inserti128_si256(
+ srcRegHead2, _mm256_castsi256_si128(srcRegHead3), 1);
+
+ // merge the two new consecutive registers
+ // the first lane contains values for filtering odd rows (1,3,5...) and
+ // the second lane contains values for filtering even rows (2,4,6...)
+ s1[3] = _mm256_unpacklo_epi8(srcRegHead1, srcRegHead2);
+ s2[3] = _mm256_unpackhi_epi8(srcRegHead1, srcRegHead2);
+
+ s1[0] = convolve8_16_avx2(s1, f);
+ s2[0] = convolve8_16_avx2(s2, f);
+
+ // pack each 16-bit value down to 8 bits; the low and high 64 bits of each
+ // lane hold the first and second convolve results respectively
+ s1[0] = _mm256_packus_epi16(s1[0], s2[0]);
+
+ src_ptr += src_stride;
+
+ // average if necessary
+ outReg1 = _mm256_castsi256_si128(s1[0]);
+ outReg2 = _mm256_extractf128_si256(s1[0], 1);
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ outReg2 = _mm_avg_epu8(
+ outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch)));
+ }
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
+
+ // save the next 16 bytes
+ _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2);
+
+ output_ptr += dst_stride;
+
+ // shift down by two rows
+ s1[0] = s1[1];
+ s2[0] = s2[1];
+ s1[1] = s1[2];
+ s2[1] = s2[2];
+ s1[2] = s1[3];
+ s2[2] = s2[3];
+ srcRegHead1 = srcRegHead3;
+ }
+}
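+
+// For reference, the 8-tap vertical path above computes, per column x and
+// output row y (clamp8 denotes the packus saturation to [0, 255]):
+//   dst[y][x] = clamp8((f[0]*s[y-3][x] + f[1]*s[y-2][x] + ... +
+//                       f[7]*s[y+4][x] + 64) >> 7)
+// src_ptr already points three rows above the first output row, so the seven
+// preloaded rows plus the two rows loaded per iteration supply the nine source
+// rows needed for each pair of output rows.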
+
+static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *filter) {
+ vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, filter, 0);
+}
+
+static void vpx_filter_block1d16_v8_avg_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) {
+ vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, filter, 1);
+}
+
+static void vpx_filter_block1d16_h4_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into two registers in the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+ // Calling multiply and add gives us half of the sum for each pixel. Adding
+ // the two results gives us the first half of the output. Repeat to get the
+ // second half of the output, then shuffle again to combine the two halves.
+ // Since AVX2 gives us 256-bit registers, we can do this two rows at a time.
+
+ __m128i kernel_reg; // Kernel
+ __m256i kernel_reg_256, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+ int h;
+
+ __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+ __m256i dst_first, dst_second;
+ __m256i tmp_0, tmp_1;
+ __m256i idx_shift_0 =
+ _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m256i idx_shift_2 =
+ _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3,
+ 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg);
+ kernel_reg_23 =
+ _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u));
+ kernel_reg_45 =
+ _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u));
+
+ for (h = height; h >= 2; h -= 2) {
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Partial result for first half
+ tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_first = _mm256_adds_epi16(tmp_0, tmp_1);
+
+ // Do again to get the second half of dst
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr + 8, src_ptr + src_stride + 8);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Partial result for second half
+ tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_second = _mm256_adds_epi16(tmp_0, tmp_1);
+
+ // Round each result
+ dst_first = mm256_round_epi16(&dst_first, &reg_32, 6);
+ dst_second = mm256_round_epi16(&dst_second, &reg_32, 6);
+
+ // Finally combine to get the final dst
+ dst_first = _mm256_packus_epi16(dst_first, dst_second);
+ mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &dst_first);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ // Repeat for the last row if needed
+ if (h > 0) {
+ src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
+ // Reorder the 64-bit quarters into 2 1 1 0 (high to low)
+ src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);
+
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_first = _mm256_adds_epi16(tmp_0, tmp_1);
+
+ dst_first = mm256_round_epi16(&dst_first, &reg_32, 6);
+
+ dst_first = _mm256_packus_epi16(dst_first, dst_first);
+ dst_first = _mm256_permute4x64_epi64(dst_first, 0x8);
+
+ _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(dst_first));
+ }
+}
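+
+// For reference, once the halved kernel has been packed to signed bytes, each
+// _mm256_maddubs_epi16() above produces for 16-bit lane i
+//   s[i-1] * (k[2] >> 1) + s[i] * (k[3] >> 1)
+// from the idx_shift_0 shuffle, and
+//   s[i+1] * (k[4] >> 1) + s[i+2] * (k[5] >> 1)
+// from the idx_shift_2 shuffle; the saturating add combines them before the
+// 32/6 rounding and the final packus.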
+
+static void vpx_filter_block1d16_v4_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel to get the partial
+ // output. Then we can add the result from another row pair to get the output.
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
+
+ __m128i kernel_reg; // Kernel
+ __m256i kernel_reg_256, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m256i res_reg_m1001_lo, res_reg_1223_lo, res_reg_m1001_hi, res_reg_1223_hi;
+ __m256i res_reg, res_reg_lo, res_reg_hi;
+
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg);
+ kernel_reg_23 =
+ _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u));
+ kernel_reg_45 =
+ _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u));
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001_lo = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
+ src_reg_m1001_hi = _mm256_unpackhi_epi8(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223_lo = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
+ src_reg_1223_hi = _mm256_unpackhi_epi8(src_reg_12, src_reg_23);
+
+ // Output from first half
+ res_reg_m1001_lo = _mm256_maddubs_epi16(src_reg_m1001_lo, kernel_reg_23);
+ res_reg_1223_lo = _mm256_maddubs_epi16(src_reg_1223_lo, kernel_reg_45);
+ res_reg_lo = _mm256_adds_epi16(res_reg_m1001_lo, res_reg_1223_lo);
+
+ // Output from second half
+ res_reg_m1001_hi = _mm256_maddubs_epi16(src_reg_m1001_hi, kernel_reg_23);
+ res_reg_1223_hi = _mm256_maddubs_epi16(src_reg_1223_hi, kernel_reg_45);
+ res_reg_hi = _mm256_adds_epi16(res_reg_m1001_hi, res_reg_1223_hi);
+
+ // Round the words
+ res_reg_lo = mm256_round_epi16(&res_reg_lo, &reg_32, 6);
+ res_reg_hi = mm256_round_epi16(&res_reg_hi, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_hi);
+
+ // Save the result
+ mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001_lo = src_reg_1223_lo;
+ src_reg_m1001_hi = src_reg_1223_hi;
+ src_reg_1 = src_reg_3;
+ }
+}
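+
+// For reference, the two 128-bit lanes of each interleaved register above hold
+// consecutive output rows: the low lane interleaves rows (y-1, y) and
+// (y+1, y+2) for output row y, while the high lane interleaves rows (y, y+1)
+// and (y+2, y+3) for output row y + 1, so one pass through the
+// maddubs/adds/round/packus chain emits two rows at once.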
+
+static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into two registers in the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+ // Calling multiply and add gives us half of the sum for each pixel, and
+ // adding the two results gives us the full 8-pixel output. Since AVX2 gives
+ // us 256-bit registers, we can do this two rows at a time.
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+ int h;
+
+ __m256i idx_shift_0 =
+ _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m256i idx_shift_2 =
+ _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3,
+ 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+ kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u));
+
+ for (h = height; h >= 2; h -= 2) {
+ // Load the source
+ const __m256i src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+ __m256i dst_reg;
+ __m256i tmp_0, tmp_1;
+ const __m256i src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ const __m256i src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Get the output
+ tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_reg = _mm256_adds_epi16(tmp_0, tmp_1);
+
+ // Round the result
+ dst_reg = mm256_round_epi16(&dst_reg, &reg_32, 6);
+
+ // Finally combine to get the final dst
+ dst_reg = _mm256_packus_epi16(dst_reg, dst_reg);
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &dst_reg);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ // Repeat for the last row if needed
+ if (h > 0) {
+ const __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ __m128i dst_reg;
+ const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding
+ __m128i tmp_0, tmp_1;
+
+ __m128i src_reg_shift_0 =
+ _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_0));
+ __m128i src_reg_shift_2 =
+ _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_2));
+
+ tmp_0 = _mm_maddubs_epi16(src_reg_shift_0,
+ _mm256_castsi256_si128(kernel_reg_23));
+ tmp_1 = _mm_maddubs_epi16(src_reg_shift_2,
+ _mm256_castsi256_si128(kernel_reg_45));
+ dst_reg = _mm_adds_epi16(tmp_0, tmp_1);
+
+ dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32_128, 6);
+
+ dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128());
+
+ _mm_storel_epi64((__m128i *)dst_ptr, dst_reg);
+ }
+}
+
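+// Illustrative scalar reference, not part of the upstream libvpx sources. It
+// approximately mirrors what vpx_filter_block1d8_h4_avx2() computes for one
+// output pixel (ignoring the 16-bit saturation inherent in maddubs/adds):
+// only the middle taps k[2]..k[5] are applied, the taps are pre-halved to
+// match the _mm_srai_epi16(kernel_reg_128, 1) above, and the sum is rounded
+// with +32 and >> 6 before clamping to 8 bits. The name is hypothetical.
+static INLINE uint8_t filter4_h_scalar_sketch(const uint8_t *src,
+ const int16_t *kernel) {
+ // src points at the output pixel; the taps reach one pixel back and two
+ // pixels ahead.
+ int32_t sum = 0;
+ int k;
+ for (k = 0; k < 4; ++k) {
+ sum += (kernel[k + 2] >> 1) * (int32_t)src[k - 1];
+ }
+ sum = (sum + 32) >> 6;
+ if (sum < 0) sum = 0;
+ if (sum > 255) sum = 255;
+ return (uint8_t)sum;
+}
+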
+static void vpx_filter_block1d8_v4_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel to get a partial
+ // output. Then we can add the partial output computed from the next pair of
+ // rows to get the final output.
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source.
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001, src_reg_1223;
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m256i res_reg_m1001, res_reg_1223;
+ __m256i res_reg;
+
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+ kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u));
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
+
+ // Output
+ res_reg_m1001 = _mm256_maddubs_epi16(src_reg_m1001, kernel_reg_23);
+ res_reg_1223 = _mm256_maddubs_epi16(src_reg_1223, kernel_reg_45);
+ res_reg = _mm256_adds_epi16(res_reg_m1001, res_reg_1223);
+
+ // Round the words
+ res_reg = mm256_round_epi16(&res_reg, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi16(res_reg, res_reg);
+
+ // Save the result
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001 = src_reg_1223;
+ src_reg_1 = src_reg_3;
+ }
+}
+
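+// Illustrative sketch, not part of the upstream libvpx sources: how the byte
+// interleave pairs up with _mm_maddubs_epi16 in the vertical filters above.
+// With kernel_reg_23 holding the byte pair k[2] k[3] repeated, the interleave
+// puts r0[x] and r1[x] next to each other, so maddubs returns
+// k[2]*r0[x] + k[3]*r1[x] as one 16-bit word per pixel. The helper name is
+// hypothetical; the 128-bit version is shown for brevity.
+static INLINE __m128i vert_tap_pair_sketch(const uint8_t *row0,
+ const uint8_t *row1,
+ const __m128i kernel_reg_23) {
+ const __m128i r0 = _mm_loadl_epi64((const __m128i *)row0);
+ const __m128i r1 = _mm_loadl_epi64((const __m128i *)row1);
+ // ... r1[1] r0[1] r1[0] r0[0]
+ const __m128i interleaved = _mm_unpacklo_epi8(r0, r1);
+ return _mm_maddubs_epi16(interleaved, kernel_reg_23);
+}
+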
+static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into a single register in the form
+ // k[5:2] k[5:2] k[5:2] k[5:2]
+ // Then we shuffle the source into
+ // s[5:2] s[4:1] s[3:0] s[2:-1]
+ // Calling multiply and add gives us the partial sums of adjacent tap pairs
+ // next to each other. Calling horizontal add then gives us the output.
+ // Since AVX2 gives us 256-bit registers, we can do two rows at a time.
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg;
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+ int h;
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+
+ __m256i shuf_idx =
+ _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2,
+ 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+ kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u));
+
+ for (h = height; h > 1; h -= 2) {
+ // Load the source
+ const __m256i src_reg = mm256_loadu2_epi64(
+ (const __m128i *)src_ptr, (const __m128i *)(src_ptr + src_stride));
+ const __m256i src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx);
+
+ // Get the result
+ __m256i dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg);
+ dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256());
+
+ // Round result
+ dst = mm256_round_epi16(&dst, &reg_32, 6);
+
+ // Pack to 8-bits
+ dst = _mm256_packus_epi16(dst, _mm256_setzero_si256());
+
+ // Save
+ mm256_storeu2_epi32((__m128i *const)dst_ptr,
+ (__m128i *const)(dst_ptr + dst_stride), &dst);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ if (h > 0) {
+ // Load the source
+ const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding
+ __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr);
+ __m128i src_reg_shuf =
+ _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx));
+
+ // Get the result
+ __m128i dst =
+ _mm_maddubs_epi16(src_reg_shuf, _mm256_castsi256_si128(kernel_reg));
+ dst = _mm_hadds_epi16(dst, _mm_setzero_si128());
+
+ // Round result
+ dst = mm_round_epi16_sse2(&dst, &reg_32_128, 6);
+
+ // Pack to 8-bits
+ dst = _mm_packus_epi16(dst, _mm_setzero_si128());
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
+ }
+}
+
+static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[3,0] s[2,0] s[1,0] s[0,0] s[2,0] s[1,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel to get a partial
+ // output. Calling horizontal add then gives us the complete output.
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source.
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001, src_reg_1223, src_reg_m1012_1023;
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg;
+
+ // Result after multiply and add
+ __m256i res_reg;
+
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+ kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u));
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
+
+ // Combine all the rows
+ src_reg_m1012_1023 = _mm256_unpacklo_epi16(src_reg_m1001, src_reg_1223);
+
+ // Output
+ res_reg = _mm256_maddubs_epi16(src_reg_m1012_1023, kernel_reg);
+ res_reg = _mm256_hadds_epi16(res_reg, _mm256_setzero_si256());
+
+ // Round the words
+ res_reg = mm256_round_epi16(&res_reg, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi16(res_reg, res_reg);
+
+ // Save the result
+ mm256_storeu2_epi32((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001 = src_reg_1223;
+ src_reg_1 = src_reg_3;
+ }
+}
+
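+// Illustrative sketch, not part of the upstream libvpx sources: the 4-wide
+// paths above finish the 4-tap sum with a single maddubs followed by a
+// saturating horizontal add. With kernel_2345 repeating the bytes
+// k[2] k[3] k[4] k[5] and each pixel's four vertically adjacent source bytes
+// sitting next to each other, maddubs leaves k[2]*s0 + k[3]*s1 and
+// k[4]*s2 + k[5]*s3 in adjacent words, and hadds collapses each pair into the
+// full sum. The helper name is hypothetical.
+static INLINE __m128i four_tap_dot_sketch(const __m128i rows_interleaved,
+ const __m128i kernel_2345) {
+ const __m128i pairs = _mm_maddubs_epi16(rows_interleaved, kernel_2345);
+ // The upper half of the result is zero here.
+ return _mm_hadds_epi16(pairs, _mm_setzero_si128());
+}
+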
+static void vpx_filter_block1d8_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m256i f[4], ss[4];
+ __m256i r[8];
+ __m128i s[9];
+
+ unsigned int y = output_height;
+ // Multiply the size of the source stride by two
+ const ptrdiff_t src_stride = src_pitch << 1;
+
+ // The output_height is always a multiple of two.
+ assert(!(output_height & 1));
+
+ shuffle_filter_avx2(filter, f);
+ s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+ // merge the result together
+ // r[0]: 0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0
+ // r07 r06 r05 r04 r03 r02 r01 r00
+ r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1);
+
+ // r[1]: 0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0
+ // r17 r16 r15 r14 r13 r12 r11 r10
+ r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1);
+
+ // r[2]: 0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0
+ // r27 r26 r25 r24 r23 r22 r21 r20
+ r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1);
+
+ // r[3]: 0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0
+ // r37 r36 r35 r34 r33 r32 r31 r30
+ r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1);
+
+ // r[4]: 0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0
+ // r47 r46 r45 r44 r43 r42 r41 r40
+ r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1);
+
+ // r[5]: 0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0
+ // r57 r56 r55 r54 r53 r52 r51 r50
+ r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1);
+
+ // Merge together
+ // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11
+ // r01|r10 r00|
+ ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+
+ // ss[1]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31
+ // r21|r30 r20|
+ ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
+
+ // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51
+ // r41|r50 r40|
+ ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
+
+ // Process 2 rows at a time
+ do {
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+ s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+ // r[6]: 0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0
+ // 0 r67 r66 r65 r64 r63 r62 r61 r60
+ r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1);
+ // r[7]: 0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0
+ // 0 r77 r76 r75 r74 r73 r72 r71 r70
+ r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1);
+
+ // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72
+ // r62 | r71 r61|r70 r60|
+ ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+ ss[0] = convolve8_16_avx2(ss, f);
+ ss[0] = _mm256_packus_epi16(ss[0], ss[0]);
+ src_ptr += src_stride;
+
+ /* shift down two rows */
+ s[6] = s[8];
+ _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0]));
+ output_ptr += out_pitch;
+ _mm_storel_epi64((__m128i *)&output_ptr[0],
+ _mm256_extractf128_si256(ss[0], 1));
+ output_ptr += out_pitch;
+ ss[0] = ss[1];
+ ss[1] = ss[2];
+ ss[2] = ss[3];
+ y -= 2;
+ } while (y > 1);
+}
+
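+// Illustrative scalar reference, not part of the upstream libvpx sources. It
+// models one output pixel of the 8-tap vertical path above, assuming
+// convolve8_16_avx2() rounds with +64 and an arithmetic shift by 7, and
+// ignoring the intermediate 16-bit saturation of the SIMD code. src points at
+// the row holding tap 0, i.e. three rows above the output row. The name is
+// hypothetical.
+static INLINE uint8_t filter8_v_scalar_sketch(const uint8_t *src,
+ ptrdiff_t stride,
+ const int16_t *filter) {
+ int32_t sum = 0;
+ int k;
+ for (k = 0; k < 8; ++k) sum += filter[k] * (int32_t)src[k * stride];
+ sum = (sum + 64) >> 7; // the 8 taps sum to 128
+ if (sum < 0) sum = 0;
+ if (sum > 255) sum = 255;
+ return (uint8_t)sum;
+}
+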
+static void vpx_filter_block1d4_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg64_256bit;
+ unsigned int y = output_height;
+
+ assert(output_height > 1);
+
+ addFilterReg64_256bit = _mm256_set1_epi16(32);
+
+ // f7 f6 f5 f4 f3 f2 f1 f0 (16 bit)
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ // f7 f6 f5 f4 f3 f2 f1 f0 || f7 f6 f5 f4 f3 f2 f1 f0 (8 bit each)
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ {
+ ptrdiff_t src_stride;
+ __m256i filt1Reg, filt2Reg, firstFilters, secondFilters;
+ // have the same data in both lanes of a 256 bit register
+ // f7 f6 f5 f4 f3 f2 f1 f0 f7 f6 f5 f4 f3 f2 f1 f0 | f7 f6 f5 f4 f3 f2 f1 f0
+ // f7 f6 f5 f4 f3 f2 f1 f0 (8bit each)
+ const __m256i filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+
+ // duplicate only the first 32 bits
+ // f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0 | f3 f2 f1 f0|f3 f2 f1
+ // f0|f3 f2 f1 f0|f3 f2 f1 f0
+ firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
+ // duplicate only the second 32 bits
+ // f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4 | f7 f6 f5 f4|f7 f6 f5
+ // f4|f7 f6 f5 f4|f7 f6 f5 f4
+ secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
+
+ // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3
+ // s2 s4 s3 s2 s1 s3 s2 s1 s0
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
+
+ // s10 s9 s8 s7 s9 s8 s7 s6 s8 s7 s6 s5 s7 s6 s5 s4 | s10 s9 s8 s7 s9 s8 s7
+ // s6 s8 s7 s6 s5 s7 s6 s5 s4
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
+
+ // multiply the size of the source stride by two
+ src_stride = src_pitch << 1;
+
+ do {
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcReg32b1;
+ // load the 2 strides of source
+ // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07
+ // r06 r05 r04 r03 r02 r01 r00
+ srcReg32b1 = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3);
+
+ // filter the source buffer
+ // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06
+ // r05 r04 r03 r05 r04 r03 r02 r04 r03 r02 r01 r03 r02 r01 r00
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ // ...|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||...
+ // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ // filter the source buffer
+ // r110 r19 r18 r17|r19 r18 r17 r16|r18 r17 r16 r15|r17 r16 r15 r14||r010
+ // r09 r08 r07|r09 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ // r010 r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04||r010
+ // r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+ srcRegFilt32b1_1 =
+ _mm256_add_epi16(srcRegFilt32b1_1, addFilterReg64_256bit);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // 0 0 0 0 R13 R12 R11 R10 || 0 0 0 0 R03 R02 R01 R00 (16bit)
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
+
+ // 8zeros 0 0 0 0 R13 R12 R11 R10 || 8zeros 0 0 0 0 R03 R02 R01 R00 (8bit)
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+ // save first row 4 values
+ *((int *)&output_ptr[0]) =
+ _mm_cvtsi128_si32(_mm256_castsi256_si128(srcRegFilt32b1_1));
+ output_ptr += output_pitch;
+
+ // save second row 4 values
+ *((int *)&output_ptr[0]) =
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+ output_ptr += output_pitch;
+
+ y = y - 2;
+ } while (y > 1);
+
+ // For remaining height
+ if (y > 0) {
+ __m128i srcReg1, srcRegFilt1_1, addFilterReg64;
+ __m128i srcRegFilt2;
+
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1_1 =
+ _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
+ _mm256_castsi256_si128(firstFilters));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // shift by 7 bit each 16 bit
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+ }
+}
+
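+// A note on the rounding constant in vpx_filter_block1d4_h8_avx2() above,
+// with a small worked example (not part of the upstream libvpx sources):
+// addFilterReg64_256bit holds 32 per 16-bit lane even though the final shift
+// is by 7, because it is added before _mm256_hadds_epi16 sums adjacent lanes;
+// the horizontal add therefore doubles the bias to the expected 64. For one
+// output pixel with partial sums a and b:
+// (a + 32) + (b + 32) = a + b + 64, followed by (a + b + 64) >> 7.
+// In the single-row tail the bias is added after the horizontal add, so the
+// full 64 is used there instead.
+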
+static void vpx_filter_block1d4_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m256i f[4], ss[4];
+ __m256i r[8];
+ __m128i r1[10];
+ __m128i s[11];
+
+ unsigned int y = output_height;
+ // Multiply the size of the source stride by four
+ const ptrdiff_t src_stride = src_pitch << 2;
+ const ptrdiff_t out_stride = out_pitch << 2;
+
+ // The output_height is always a multiple of two.
+ assert(!(output_height & 0x01));
+
+ shuffle_filter_avx2(filter, f);
+
+ s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+ // R1-0 xxxx .. . . x| r13 r12 r11 r10 r03 r02 r01 r00
+ r1[0] = _mm_unpacklo_epi32(s[0], s[1]);
+
+ // R2-1 xxxx .. . . x| r23 r22 r21 r20 r13 r12 r11 r10
+ r1[1] = _mm_unpacklo_epi32(s[1], s[2]);
+
+ // R3-2 xxxx .. . . x| r33 r32 r31 r30 r23 r22 r21 r20
+ r1[2] = _mm_unpacklo_epi32(s[2], s[3]);
+
+ // R4-3 xxxx .. . . x| r43 r42 r41 r40 r33 r32 r31 r30
+ r1[3] = _mm_unpacklo_epi32(s[3], s[4]);
+
+ // R5-4 xxxx .. . . x| r53 r52 r51 r50 r43 r42 r41 r40
+ r1[4] = _mm_unpacklo_epi32(s[4], s[5]);
+
+ // R6-5 xxxx .. . . x| r63 r62 r61 r60 r53 r52 r51 r50
+ r1[5] = _mm_unpacklo_epi32(s[5], s[6]);
+
+ // 00000000 r33 r32 r31 r30|r23 r22 r21 r20||00000000|r13 r12 r11 r10|r03 r02
+ // r01 r00
+ r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[0]), r1[2], 1);
+
+ // 00000000 r43 r42 r41 r40|r33 r32 r31 r30||00000000|r23 r22 r21 r20|r13 r12
+ // r11 r10
+ r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[1]), r1[3], 1);
+
+ // 00000000 r53 r52 r51 r50|r43 r42 r41 r40||00000000|r33 r32 r31 r30|r23 r22
+ // r21 r20
+ r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[2]), r1[4], 1);
+
+ // 00000000 r63 r62 r61 r60|r53 r52 r51 r50||00000000|r43 r42 r41 r40|r33 r32
+ // r31 r30
+ r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[3]), r1[5], 1);
+
+ // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10
+ // r00|
+ ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+
+ // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30
+ // r20|
+ ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
+
+ // Process 4 rows at a time
+ while (y >= 4) {
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+ s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+ s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch));
+ s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch));
+
+ // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
+ r1[6] = _mm_unpacklo_epi32(s[6], s[7]);
+
+ // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
+ r1[7] = _mm_unpacklo_epi32(s[7], s[8]);
+
+ // R9-8 xxxx .. . . x| r93 r92 r91 r90 r83 r82 r81 r80
+ r1[8] = _mm_unpacklo_epi32(s[8], s[9]);
+
+ // R10-9 xxxx .. . . x| r10-3 r10-2 r10-1 r10-0 r93 r92 r91 r90
+ r1[9] = _mm_unpacklo_epi32(s[9], s[10]);
+
+ // 00000000 r73 r72 r71 r70|r63 r62 r61 r60||00000000|r53 r52 r51 r50|r43
+ // r42 r41 r40
+ r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[4]), r1[6], 1);
+
+ // 00000000 r83 r82 r81 r80|r73 r72 r71 r70||00000000|r63 r62 r61 r60|r53
+ // r52 r51 r50
+ r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[5]), r1[7], 1);
+
+ // 00000000 r93 r92 r91 r90|r83 r82 r81 r80||00000000|r73 r72 r71 r70|r63
+ // r62 r61 r60
+ r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[6]), r1[8], 1);
+
+ // 00000000 r10-3 r10-2 r10-1 r10-0|r93 r92 r91 r90||00000000|r83 r82 r81
+ // r80|r73 r72 r71 r70
+ r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[7]), r1[9], 1);
+
+ // r83 r73....r80 r70|r73 r63....r70 r60||r63 r53....r60 r50|r53 r43....r50
+ // r40|
+ ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
+
+ // r10-3 r10-3....r10-0 r10-0|r93 r83....r90 r80||r83 r73....r80 r70|r73
+ // r63....r70 r60|
+ ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+
+ ss[0] = convolve8_16_avx2(ss, f);
+
+ // r3 r2 r3 r2 r1 r0 r1 r0
+ ss[0] = _mm256_packus_epi16(ss[0], ss[0]);
+ src_ptr += src_stride;
+
+ mm256_storeu2_epi32((__m128i *const)output_ptr,
+ (__m128i *const)(output_ptr + (2 * out_pitch)), ss);
+
+ ss[0] = _mm256_srli_si256(ss[0], 4);
+
+ mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)),
+ (__m128i *const)(output_ptr + (3 * out_pitch)), ss);
+
+ output_ptr += out_stride;
+
+ ss[0] = ss[2];
+ ss[1] = ss[3];
+
+ s[6] = s[10];
+
+ r1[4] = r1[8];
+ r1[5] = r1[9];
+
+ y -= 4;
+ }
+
+ // Process 2 rows
+ if (y == 2) {
+ __m128i ss1[4], f1[4];
+
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+ s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+ f1[0] = _mm256_castsi256_si128(f[0]);
+ f1[1] = _mm256_castsi256_si128(f[1]);
+ f1[2] = _mm256_castsi256_si128(f[2]);
+ f1[3] = _mm256_castsi256_si128(f[3]);
+
+ // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
+ r1[6] = _mm_unpacklo_epi32(s[6], s[7]);
+
+ // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
+ r1[7] = _mm_unpacklo_epi32(s[7], s[8]);
+
+ // r23 r13....r20 r10|r13 r03....r10 r00
+ ss1[0] = _mm256_castsi256_si128(ss[0]);
+
+ // r43 r33....r40 r30|r33 r23....r30 r20
+ ss1[1] = _mm256_castsi256_si128(ss[1]);
+
+ // r63 r53....r60 r50|r53 r43....r50 r40
+ ss1[2] = _mm_unpacklo_epi8(r1[4], r1[5]);
+
+ // r83 r73....r80 r70|r73 r63....r70 r60
+ ss1[3] = _mm_unpacklo_epi8(r1[6], r1[7]);
+
+ ss1[0] = convolve8_8_ssse3(ss1, f1);
+
+ // r1 r0 r1 r0
+ ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]);
+
+ // Save first row 4 values
+ *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]);
+ output_ptr += out_pitch;
+
+ ss1[0] = _mm_srli_si128(ss1[0], 4);
+ // Save second row 4 values
+ *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]);
+ }
+}
+
+#if HAVE_AVX2 && HAVE_SSSE3
+#if VPX_ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+#else // VPX_ARCH_X86
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+#endif // VPX_ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3
+#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3
+#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3
+#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
+#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
+#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
+#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
+#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
+#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3
+#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3
+#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3
+#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3
+#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3
+#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3
+
+#define vpx_filter_block1d16_v4_avg_avx2 vpx_filter_block1d16_v8_avg_avx2
+#define vpx_filter_block1d16_h4_avg_avx2 vpx_filter_block1d16_h8_avg_avx2
+#define vpx_filter_block1d8_v4_avg_avx2 vpx_filter_block1d8_v8_avg_avx2
+#define vpx_filter_block1d8_h4_avg_avx2 vpx_filter_block1d8_h8_avg_avx2
+#define vpx_filter_block1d4_v4_avg_avx2 vpx_filter_block1d4_v8_avg_avx2
+#define vpx_filter_block1d4_h4_avg_avx2 vpx_filter_block1d4_h8_avg_avx2
+// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+ avx2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
+
+// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, avx2, 0)
+FUN_CONV_2D(avg_, avx2, 1)
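+
+// Hypothetical usage sketch, not part of the upstream libvpx sources: how a
+// caller might invoke the vpx_convolve8_horiz_avx2() generated above,
+// assuming the usual q4 convention where phases and steps are expressed in
+// 1/16-pel units (a step of 16 means no scaling). The wrapper name and the
+// fixed half-pel phase are illustrative choices only.
+static INLINE void convolve8_horiz_usage_sketch(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *filter, int w,
+ int h) {
+ vpx_convolve8_horiz_avx2(src, src_stride, dst, dst_stride, filter,
+ /*x0_q4=*/8, /*x_step_q4=*/16, /*y0_q4=*/0,
+ /*y_step_q4=*/16, w, h);
+}
+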
+#endif // HAVE_AVX2 && HAVE_SSSE3
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000000..4ea2752d38
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,1087 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h> // SSSE3
+
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
+ const __m128i *const s, const int16_t *const filter) {
+ __m128i f[4];
+ shuffle_filter_ssse3(filter, f);
+ return convolve8_8_ssse3(s, f);
+}
+
+// Used by the avx2 implementation.
+#if VPX_ARCH_X86_64
+// Use the intrinsics below
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
+#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
+#else // VPX_ARCH_X86
+// Use the assembly in vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm.
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+#endif
+
+#if VPX_ARCH_X86_64
+void vpx_filter_block1d4_h8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i firstFilters, secondFilters, shuffle1, shuffle2;
+ __m128i srcRegFilt1, srcRegFilt2;
+ __m128i addFilterReg64, filtersReg, srcReg;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter into the first lane
+ firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+ // duplicate only the third 16 bits in the filter into the first lane
+ secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+ // duplicate only the second 16 bits in the filter into the second lane
+ // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
+ firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+ // duplicate only the fourth 16 bits in the filter into the second lane
+ // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
+ secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+ // shuffle indices used to rearrange the source bytes
+ shuffle1 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6);
+ shuffle2 = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // sum the results together, saturating only on the final step
+ // the specific order of the additions prevents intermediate overflow
+ srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2);
+
+ // extract the higher half of the register
+ srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
+
+ // add the rounding offset early to avoid another saturated add
+ srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+
+ // shift by 7 bit each 16 bits
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+ src_ptr += src_pitch;
+
+ // save only 4 bytes
+ *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
+
+ output_ptr += output_pitch;
+ }
+}
+
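+// Illustrative scalar model, not part of the upstream libvpx sources, of the
+// addition ordering used in vpx_filter_block1d4_h8_intrin_ssse3() above: the
+// +64 rounding bias is folded in with a plain add and only the final
+// combination uses a saturating add, so an intermediate value that a later
+// partial sum would pull back into range is not clipped early. The model
+// assumes, as the comments above state, that the non-saturating steps never
+// wrap. The name is hypothetical; the caller shifts right by 7 afterwards.
+static INLINE int16_t add_partials_with_bias_sketch(int16_t partial_a,
+ int16_t partial_b) {
+ int32_t sum = (int32_t)partial_a + 64 + (int32_t)partial_b;
+ if (sum > 32767) sum = 32767; // mimic the final _mm_adds_epi16
+ if (sum < -32768) sum = -32768;
+ return (int16_t)sum;
+}
+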
+void vpx_filter_block1d8_h8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ unsigned int i;
+ __m128i f[4], filt[4], s[4];
+
+ shuffle_filter_ssse3(filter, f);
+ filt[0] = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ filt[1] = _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+ filt[2] = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12);
+ filt[3] =
+ _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14);
+
+ for (i = 0; i < output_height; i++) {
+ const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ s[0] = _mm_shuffle_epi8(srcReg, filt[0]);
+ s[1] = _mm_shuffle_epi8(srcReg, filt[1]);
+ s[2] = _mm_shuffle_epi8(srcReg, filt[2]);
+ s[3] = _mm_shuffle_epi8(srcReg, filt[3]);
+ s[0] = convolve8_8_ssse3(s, f);
+
+ // shrink to 8 bit each 16 bits
+ s[0] = _mm_packus_epi16(s[0], s[0]);
+
+ src_ptr += src_pitch;
+
+ // save only 8 bytes
+ _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]);
+
+ output_ptr += output_pitch;
+ }
+}
+
+void vpx_filter_block1d8_v8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ unsigned int i;
+ __m128i f[4], s[8], ss[4];
+
+ shuffle_filter_ssse3(filter, f);
+
+ // load the first 7 rows of 8 bytes
+ s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+ for (i = 0; i < output_height; i++) {
+ // load the last 8 bytes
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+
+ // merge the result together
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+
+ // merge the result together
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+
+ ss[0] = convolve8_8_ssse3(ss, f);
+ // shrink to 8 bit each 16 bits
+ ss[0] = _mm_packus_epi16(ss[0], ss[0]);
+
+ src_ptr += src_pitch;
+
+ // shift down a row
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+ s[3] = s[4];
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)&output_ptr[0], ss[0]);
+
+ output_ptr += out_pitch;
+ }
+}
+#endif // VPX_ARCH_X86_64
+
+static void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into two registers in the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+ // Calling multiply and add gives us half of the sum. Calling add gives us
+ // first half of the output. Repeat again to get the second half of the
+ // output. Finally we shuffle again to combine the two outputs.
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ int h;
+
+ __m128i src_reg, src_reg_shift_0, src_reg_shift_2;
+ __m128i dst_first, dst_second;
+ __m128i tmp_0, tmp_1;
+ __m128i idx_shift_0 =
+ _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m128i idx_shift_2 =
+ _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+ for (h = height; h > 0; --h) {
+ // Load the source
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Partial result for first half
+ tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_first = _mm_adds_epi16(tmp_0, tmp_1);
+
+ // Do again to get the second half of dst
+ // Load the source
+ src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+ src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Partial result for second half
+ tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_second = _mm_adds_epi16(tmp_0, tmp_1);
+
+ // Round each result
+ dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+ dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);
+
+ // Finally combine to get the final dst
+ dst_first = _mm_packus_epi16(dst_first, dst_second);
+ _mm_store_si128((__m128i *)dst_ptr, dst_first);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
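+// A back-of-the-envelope check, not part of the upstream libvpx sources, for
+// why the 4-tap paths halve the kernel before packing it to 8 bits
+// (presumably to keep _mm_maddubs_epi16 from saturating its pairwise sums):
+// 255 * 127 + 255 * 127 = 64770 > 32767 (full-size taps could saturate)
+// 255 * 64 + 255 * 64 = 32640 <= 32767 (halved taps always fit in int16)
+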
+static void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+ // so that we can call multiply and add with the kernel to get 16-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
+
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi;
+ __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+ __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+ __m128i res_reg_m1012, res_reg_0123;
+ __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;
+
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+ src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0);
+
+ // More shuffling
+ src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+ src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+ src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+ src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);
+
+ // Partial output from first half
+ res_reg_m10_lo = _mm_maddubs_epi16(src_reg_m10_lo, kernel_reg_23);
+ res_reg_01_lo = _mm_maddubs_epi16(src_reg_01_lo, kernel_reg_23);
+
+ res_reg_12_lo = _mm_maddubs_epi16(src_reg_12_lo, kernel_reg_45);
+ res_reg_23_lo = _mm_maddubs_epi16(src_reg_23_lo, kernel_reg_45);
+
+ // Add to get first half of the results
+ res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+ res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+ // Partial output for second half
+ res_reg_m10_hi = _mm_maddubs_epi16(src_reg_m10_hi, kernel_reg_23);
+ res_reg_01_hi = _mm_maddubs_epi16(src_reg_01_hi, kernel_reg_23);
+
+ res_reg_12_hi = _mm_maddubs_epi16(src_reg_12_hi, kernel_reg_45);
+ res_reg_23_hi = _mm_maddubs_epi16(src_reg_23_hi, kernel_reg_45);
+
+ // Second half of the results
+ res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
+ res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);
+
+ // Round the words
+ res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+ res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+ res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
+ res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
+ res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi);
+
+ _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+ _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10_lo = src_reg_12_lo;
+ src_reg_m10_hi = src_reg_12_hi;
+ src_reg_01_lo = src_reg_23_lo;
+ src_reg_01_hi = src_reg_23_hi;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_filter_block1d8_h4_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into two registers in the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+ // Calling multiply and add gives us half of the sum. Calling add gives us
+ // first half of the output. Repeat again to get the second half of the
+ // output. Finally we shuffle again to combine the two outputs.
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ int h;
+
+ __m128i src_reg, src_reg_shift_0, src_reg_shift_2;
+ __m128i dst_first;
+ __m128i tmp_0, tmp_1;
+ __m128i idx_shift_0 =
+ _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m128i idx_shift_2 =
+ _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+ for (h = height; h > 0; --h) {
+ // Load the source
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Get the result
+ tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_first = _mm_adds_epi16(tmp_0, tmp_1);
+
+ // Round the result
+ dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+ // Pack to 8-bits
+ dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+ _mm_storel_epi64((__m128i *)dst_ptr, dst_first);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel to get 16-bit words of
+ // the form
+ // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+ // Finally, we can add multiple rows together to get the desired output.
+
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source.
+ __m128i src_reg_m10, src_reg_01;
+ __m128i src_reg_12, src_reg_23;
+
+ __m128i kernel_reg; // Kernel
+ __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
+ __m128i res_reg_m1012, res_reg_0123;
+
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10 = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+
+ // More shuffling
+ src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01 = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+
+ src_reg_12 = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+
+ src_reg_23 = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+
+ // Partial output
+ res_reg_m10 = _mm_maddubs_epi16(src_reg_m10, kernel_reg_23);
+ res_reg_01 = _mm_maddubs_epi16(src_reg_01, kernel_reg_23);
+
+ res_reg_12 = _mm_maddubs_epi16(src_reg_12, kernel_reg_45);
+ res_reg_23 = _mm_maddubs_epi16(src_reg_23, kernel_reg_45);
+
+ // Add to get entire output
+ res_reg_m1012 = _mm_adds_epi16(res_reg_m10, res_reg_12);
+ res_reg_0123 = _mm_adds_epi16(res_reg_01, res_reg_23);
+
+ // Round the words
+ res_reg_m1012 = mm_round_epi16_sse2(&res_reg_m1012, &reg_32, 6);
+ res_reg_0123 = mm_round_epi16_sse2(&res_reg_0123, &reg_32, 6);
+
+ // Pack from 16-bit to 8-bit
+ res_reg_m1012 = _mm_packus_epi16(res_reg_m1012, _mm_setzero_si128());
+ res_reg_0123 = _mm_packus_epi16(res_reg_0123, _mm_setzero_si128());
+
+ _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
+ _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m10 = src_reg_12;
+ src_reg_01 = src_reg_23;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into a single register in the form
+ // k[5:2] k[5:2] k[5:2] k[5:2]
+ // Then we shuffle the source into
+ // s[5:2] s[4:1] s[3:0] s[2:-1]
+ // Calling multiply and add gives us the partial sums of adjacent tap pairs.
+ // Calling horizontal add then gives us the output.
+
+ __m128i kernel_reg; // Kernel
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ int h;
+
+ __m128i src_reg, src_reg_shuf;
+ __m128i dst_first;
+ __m128i shuf_idx =
+ _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u));
+
+ for (h = height; h > 0; --h) {
+ // Load the source
+ src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ src_reg_shuf = _mm_shuffle_epi8(src_reg, shuf_idx);
+
+ // Get the result
+ dst_first = _mm_maddubs_epi16(src_reg_shuf, kernel_reg);
+ dst_first = _mm_hadds_epi16(dst_first, _mm_setzero_si128());
+
+ // Round result
+ dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+ // Pack to 8-bits
+ dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[2,0] s[1,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel to get a partial
+ // output. Then we can call horizontal add to get the output.
+ // Finally, we can add multiple rows together to get the desired output.
+ // This is done two rows at a time.
+
+ // Register for source s[-1:3, :]
+ __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source.
+ __m128i src_reg_m10, src_reg_01;
+ __m128i src_reg_12, src_reg_23;
+ __m128i src_reg_m1001, src_reg_1223;
+ __m128i src_reg_m1012_1023_lo, src_reg_m1012_1023_hi;
+
+ __m128i kernel_reg; // Kernel
+
+ // Result after multiply and add
+ __m128i reg_0, reg_1;
+
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u));
+
+ // First shuffle the data
+ src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+ src_reg_m10 = _mm_unpacklo_epi32(src_reg_m1, src_reg_0);
+
+ // More shuffling
+ src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+ src_reg_01 = _mm_unpacklo_epi32(src_reg_0, src_reg_1);
+
+ // Put three rows next to each other
+ src_reg_m1001 = _mm_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+ src_reg_12 = _mm_unpacklo_epi32(src_reg_1, src_reg_2);
+
+ src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+ src_reg_23 = _mm_unpacklo_epi32(src_reg_2, src_reg_3);
+
+ // Put three rows next to each other
+ src_reg_1223 = _mm_unpacklo_epi8(src_reg_12, src_reg_23);
+
+ // Put all four rows next to each other
+ src_reg_m1012_1023_lo = _mm_unpacklo_epi16(src_reg_m1001, src_reg_1223);
+ src_reg_m1012_1023_hi = _mm_unpackhi_epi16(src_reg_m1001, src_reg_1223);
+
+ // Get the results
+ reg_0 = _mm_maddubs_epi16(src_reg_m1012_1023_lo, kernel_reg);
+ reg_1 = _mm_maddubs_epi16(src_reg_m1012_1023_hi, kernel_reg);
+ reg_0 = _mm_hadds_epi16(reg_0, _mm_setzero_si128());
+ reg_1 = _mm_hadds_epi16(reg_1, _mm_setzero_si128());
+
+ // Round the words
+ reg_0 = mm_round_epi16_sse2(&reg_0, &reg_32, 6);
+ reg_1 = mm_round_epi16_sse2(&reg_1, &reg_32, 6);
+
+ // Pack from 16-bit to 8-bit and put them in the right order
+ reg_0 = _mm_packus_epi16(reg_0, reg_0);
+ reg_1 = _mm_packus_epi16(reg_1, reg_1);
+
+ // Save the result
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
+ *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001 = src_reg_1223;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+// From vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+
+// Use the [vh]8 version because there is no [vh]4 implementation.
+#define vpx_filter_block1d16_v4_avg_ssse3 vpx_filter_block1d16_v8_avg_ssse3
+#define vpx_filter_block1d16_h4_avg_ssse3 vpx_filter_block1d16_h8_avg_ssse3
+#define vpx_filter_block1d8_v4_avg_ssse3 vpx_filter_block1d8_v8_avg_ssse3
+#define vpx_filter_block1d8_h4_avg_ssse3 vpx_filter_block1d8_h8_avg_ssse3
+#define vpx_filter_block1d4_v4_avg_ssse3 vpx_filter_block1d4_v8_avg_ssse3
+#define vpx_filter_block1d4_h4_avg_ssse3 vpx_filter_block1d4_h8_avg_ssse3
+
+// From vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+
+// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0)
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+ ssse3, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1)
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1)
+
+static void filter_horiz_w8_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const x_filter) {
+ __m128i s[8], ss[4], temp;
+
+ load_8bit_8x8(src, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ transpose_16bit_4x8(s, ss);
+ temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
+  // shrink each 16-bit value to 8 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only the 8-byte convolve result
+ _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void transpose8x8_to_dst(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m128i s[8];
+
+ load_8bit_8x8(src, src_stride, s);
+ transpose_8bit_8x8(s, s);
+ store_8bit_8x8(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w8(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4,
+ const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ // This function processes 8x8 areas. The intermediate height is not always
+ // a multiple of 8, so force it to be a multiple of 8 here.
+ y = h + (8 - (h & 0x7));
+
+ do {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; x += 8) {
+ // process 8 src_x steps
+ for (z = 0; z < 8; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
+ } else {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ temp[z * 8 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 8x8 filtered values back to dst
+ transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
+ }
+
+ src += src_stride * 8;
+ dst += dst_stride * 8;
+ } while (y -= 8);
+}
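
The z loop above walks the horizontal phase x_q4 in 1/16-pel steps: the integer part picks the source column (after the SUBPEL_TAPS / 2 - 1 pre-offset applied above) and the low SUBPEL_BITS bits pick one of the sixteen 8-tap kernels. A scalar sketch of that stepping, ignoring the 8x8 tiling and transpose, is shown below; scaled_horiz_ref is a hypothetical helper, and it assumes libvpx's usual SUBPEL_BITS == 4, sixteen phases, and (sum + 64) >> 7 rounding.

    #include <stdint.h>

    /* Illustrative scalar version of the phase stepping in
     * scaledconvolve_horiz_w8/w4; not part of libvpx. */
    static void scaled_horiz_ref(const uint8_t *src, uint8_t *dst, int w,
                                 const int16_t filters[16][8], int x0_q4,
                                 int x_step_q4) {
      int x, x_q4 = x0_q4;
      for (x = 0; x < w; ++x) {
        const uint8_t *src_x = &src[x_q4 >> 4];  /* integer pixel position */
        const int16_t *f = filters[x_q4 & 15];   /* sub-pixel phase kernel */
        int k, sum = 0;
        for (k = 0; k < 8; ++k) sum += f[k] * src_x[k];
        sum = (sum + 64) >> 7;
        dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
        x_q4 += x_step_q4;
      }
    }

The copy branch in the SIMD code corresponds to the phase-0 kernel, whose only non-zero tap is the centre one, which is why it simply reads the pixel at offset 3 instead of filtering.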
+
+static void filter_horiz_w4_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[4], ss[2];
+ __m128i temp;
+
+ load_8bit_8x4(src, src_stride, s);
+ transpose_16bit_4x4(s, ss);
+ // 00 01 10 11 20 21 30 31
+ s[0] = ss[0];
+ // 02 03 12 13 22 23 32 33
+ s[1] = _mm_srli_si128(ss[0], 8);
+ // 04 05 14 15 24 25 34 35
+ s[2] = ss[1];
+ // 06 07 16 17 26 27 36 37
+ s[3] = _mm_srli_si128(ss[1], 8);
+
+ temp = shuffle_filter_convolve8_8_ssse3(s, filter);
+  // shrink each 16-bit value to 8 bits
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 4 bytes
+ *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void transpose4x4_to_dst(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m128i s[4];
+
+ load_8bit_4x4(src, src_stride, s);
+ s[0] = transpose_8bit_4x4(s);
+ s[1] = _mm_srli_si128(s[0], 4);
+ s[2] = _mm_srli_si128(s[0], 8);
+ s[3] = _mm_srli_si128(s[0], 12);
+ store_8bit_4x4(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w4(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4,
+ const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; y += 4) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; x += 4) {
+ // process 4 src_x steps
+ for (z = 0; z < 4; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
+ } else {
+ int i;
+ for (i = 0; i < 4; ++i) {
+ temp[z * 4 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 4x4 filtered values back to dst
+ transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
+ }
+
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ }
+}
+
+static __m128i filter_vert_kernel(const __m128i *const s,
+ const int16_t *const filter) {
+ __m128i ss[4];
+ __m128i temp;
+
+ // 00 10 01 11 02 12 03 13
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ // 20 30 21 31 22 32 23 33
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ // 40 50 41 51 42 52 43 53
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ // 60 70 61 71 62 72 63 73
+ ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+
+ temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
+  // shrink each 16-bit value to 8 bits
+ return _mm_packus_epi16(temp, temp);
+}
+
+static void filter_vert_w4_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[8];
+ __m128i temp;
+
+ load_8bit_4x8(src, src_stride, s);
+ temp = filter_vert_kernel(s, filter);
+ // save only 4 bytes
+ *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void scaledconvolve_vert_w4(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+
+ y_q4 += y_step_q4;
+ }
+}
+
+static void filter_vert_w8_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[8], temp;
+
+ load_8bit_8x8(src, src_stride, s);
+ temp = filter_vert_kernel(s, filter);
+  // save only the 8-byte convolve result
+ _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void scaledconvolve_vert_w8(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+ y_q4 += y_step_q4;
+ }
+}
+
+static void filter_vert_w16_ssse3(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const filter, const int w) {
+ int i;
+ __m128i f[4];
+ shuffle_filter_ssse3(filter, f);
+
+ for (i = 0; i < w; i += 16) {
+ __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;
+
+ loadu_8bit_16x8(src, src_stride, s);
+
+    // interleave the rows pairwise
+ s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
+ s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
+ s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
+ temp_lo = convolve8_8_ssse3(s_lo, f);
+ temp_hi = convolve8_8_ssse3(s_hi, f);
+
+    // shrink each 16-bit value to 8 bits; the lower half holds the first
+    // convolve result and the upper half holds the second
+    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
+    src += 16;
+    // save the 16-byte convolve result
+ _mm_store_si128((__m128i *)&dst[i], temp_hi);
+ }
+}
+
+static void scaledconvolve_vert_w16(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
+ w);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+ y_q4 += y_step_q4;
+ }
+}
+
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When called from the frame scaling function, the smallest scaling factor
+  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16 in that case,
+  // the temp buffer is still big enough.
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ if (w >= 8) {
+ scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ } else {
+ scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ }
+
+ if (w >= 16) {
+ scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else if (w == 8) {
+ scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else {
+ scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ }
+}
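
As a quick sanity check of the buffer-size derivation in the comment above, plugging the worst-case normative values into the intermediate_height expression stays comfortably inside the (135 + 8)-row temp buffer. The helper below is illustrative only.

    #include <assert.h>

    /* Worst case from the comment above: h = 64, y_step_q4 = 32, y0_q4 <= 15,
     * SUBPEL_BITS = 4, SUBPEL_TAPS = 8. */
    static void check_temp_rows(void) {
      const int h = 64, y_step_q4 = 32, y0_q4 = 15;
      const int intermediate_height =
          (((h - 1) * y_step_q4 + y0_q4) >> 4) + 8;  /* == 134 */
      assert(intermediate_height <= 135 + 8);
    }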
+
+// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, ssse3, 0)
+FUN_CONV_2D(avg_, ssse3, 1)
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
new file mode 100644
index 0000000000..c8455e13a2
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
@@ -0,0 +1,989 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
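
The note above is about 16-bit saturation: each tap product and every partial sum in the macros below lives in a signed 16-bit lane and paddsw clips at +/-32767, so the two large centre taps are folded in last, after the smaller outer-tap products have partially cancelled. A scalar sketch of the ordering used by APPLY_FILTER_8 follows; the helpers are hypothetical and not part of the library.

    #include <stdint.h>

    static int16_t sat_add16(int32_t a, int32_t b) {   /* models paddsw */
      int32_t s = a + b;
      if (s > INT16_MAX) s = INT16_MAX;
      if (s < INT16_MIN) s = INT16_MIN;
      return (int16_t)s;
    }

    /* p[k] holds the 16-bit product of tap k and its pixel, as produced by
     * pmullw in APPLY_FILTER_8; the middle taps p[3] and p[4] go last. */
    static int16_t filter8_sum_ref(const int16_t p[8]) {
      int16_t sum = sat_add16(p[0], p[1]);
      sum = sat_add16(sum, p[6]);
      sum = sat_add16(sum, p[7]);
      sum = sat_add16(sum, p[2]);
      sum = sat_add16(sum, p[5]);
      sum = sat_add16(sum, p[3]);
      sum = sat_add16(sum, p[4]);
      return sum;
    }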
+
+%macro GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklqdq xmm0, xmm1
+ punpcklqdq xmm2, xmm3
+ punpcklqdq xmm5, xmm4
+ punpcklqdq xmm6, xmm7
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm2
+ movdqa k5k4, xmm5
+ movdqa k6k7, xmm6
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro APPLY_FILTER_4 1
+    punpckldq   xmm0, xmm1                  ;two rows in one register
+ punpckldq xmm6, xmm7
+ punpckldq xmm2, xmm3
+ punpckldq xmm5, xmm4
+
+ punpcklbw xmm0, zero ;unpack to word
+ punpcklbw xmm6, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+
+ pmullw xmm0, k0k1 ;multiply the filter factors
+ pmullw xmm6, k6k7
+ pmullw xmm2, k2k3
+ pmullw xmm5, k5k4
+
+ paddsw xmm0, xmm6 ;sum
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm2
+ psrldq xmm2, 8
+ paddsw xmm0, xmm5
+ psrldq xmm5, 8
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+%endm
+
+%macro GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm0
+ punpcklwd xmm1, xmm1
+ punpcklwd xmm2, xmm2
+ punpcklwd xmm3, xmm3
+ punpckhwd xmm4, xmm4
+ punpckhwd xmm5, xmm5
+ punpckhwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movdqa k0, xmm0 ;store filter factors on stack
+ movdqa k1, xmm1
+ movdqa k2, xmm2
+ movdqa k3, xmm3
+ movdqa k4, xmm4
+ movdqa k5, xmm5
+ movdqa k6, xmm6
+ movdqa k7, xmm7
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro LOAD_VERT_8 1
+ movq xmm0, [rsi + %1] ;0
+ movq xmm1, [rsi + rax + %1] ;1
+ movq xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2 + %1] ;7
+ movq xmm2, [rsi + rax + %1] ;2
+ movq xmm3, [rsi + rax * 2 + %1] ;3
+ movq xmm4, [rsi + rdx + %1] ;4
+ movq xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro APPLY_FILTER_8 2
+ punpcklbw xmm0, zero
+ punpcklbw xmm1, zero
+ punpcklbw xmm6, zero
+ punpcklbw xmm7, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+ punpcklbw xmm3, zero
+ punpcklbw xmm4, zero
+
+ pmullw xmm0, k0
+ pmullw xmm1, k1
+ pmullw xmm6, k6
+ pmullw xmm7, k7
+ pmullw xmm2, k2
+ pmullw xmm5, k5
+ pmullw xmm3, k3
+ pmullw xmm4, k4
+
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+ paddsw xmm0, xmm3
+ paddsw xmm0, xmm4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi + %2]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi + %2], xmm0
+%endm
+
+SECTION .text
+
+;void vpx_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_filter_block1d4_v8_sse2)
+sym(vpx_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_filter_block1d8_v8_sse2)
+sym(vpx_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_filter_block1d16_v8_sse2)
+sym(vpx_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 0, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_v8_avg_sse2)
+sym(vpx_filter_block1d4_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 1
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_v8_avg_sse2)
+sym(vpx_filter_block1d8_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 1, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_v8_avg_sse2)
+sym(vpx_filter_block1d16_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 1, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 1, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_filter_block1d4_h8_sse2)
+sym(vpx_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_filter_block1d8_h8_sse2)
+sym(vpx_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(vpx_filter_block1d16_h8_sse2)
+sym(vpx_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_h8_avg_sse2)
+sym(vpx_filter_block1d4_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_h8_avg_sse2)
+sym(vpx_filter_block1d8_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_h8_avg_sse2)
+sym(vpx_filter_block1d16_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
new file mode 100644
index 0000000000..fe617f1207
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -0,0 +1,803 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_64: times 8 dw 64
+
+; %define USE_PMULHRSW
+; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
+; when using this instruction.
+;
+; The add order below (based on ffvp9) must be followed to prevent
+; out-of-range intermediate sums.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
+
+SECTION .text
+%define LOCAL_VARS_SIZE 16*6
+
+%macro SETUP_LOCAL_VARS 0
+    ; TODO(slavarnway): using xmm registers for these on VPX_ARCH_X86_64 plus
+    ; pmaddubsw has a higher latency on some platforms; this might be eased by
+    ; interleaving the instructions.
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ packsswb m4, m4
+ ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
+ ; some platforms.
+ pshuflw m0, m4, 0b ;k0_k1
+ pshuflw m1, m4, 01010101b ;k2_k3
+ pshuflw m2, m4, 10101010b ;k4_k5
+ pshuflw m3, m4, 11111111b ;k6_k7
+ punpcklqdq m0, m0
+ punpcklqdq m1, m1
+ punpcklqdq m2, m2
+ punpcklqdq m3, m3
+ mova k0k1, m0
+ mova k2k3, m1
+ mova k4k5, m2
+ mova k6k7, m3
+%if VPX_ARCH_X86_64
+ %define krd m12
+ %define tmp0 [rsp + 16*4]
+ %define tmp1 [rsp + 16*5]
+ mova krd, [GLOBAL(pw_64)]
+%else
+ %define krd [rsp + 16*4]
+%if CONFIG_PIC=0
+ mova m6, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m6, m6 ;all ones
+ psrlw m6, 15
+ psllw m6, 6 ;aka pw_64
+%endif
+ mova krd, m6
+%endif
+%endm
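
The CONFIG_PIC branch above builds the pw_64 rounding constant in registers so position-independent builds avoid a global load; the three instructions amount to the following per-lane arithmetic (illustrative check only, not library code):

    #include <assert.h>
    #include <stdint.h>

    static void check_pw_64(void) {
      uint16_t lane = 0xffff;         /* pcmpeqb: every lane all ones */
      lane = (uint16_t)(lane >> 15);  /* psrlw 15: 0x0001 */
      lane = (uint16_t)(lane << 6);   /* psllw  6: 0x0040 == 64 (pw_64) */
      assert(lane == 64);
    }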
+
+;-------------------------------------------------------------------------------
+%if VPX_ARCH_X86_64
+ %define LOCAL_VARS_SIZE_H4 0
+%else
+ %define LOCAL_VARS_SIZE_H4 16*4
+%endif
+
+%macro SUBPIX_HFILTER4 1
+cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ packsswb m4, m4
+%if VPX_ARCH_X86_64
+ %define k0k1k4k5 m8
+ %define k2k3k6k7 m9
+ %define krd m10
+ mova krd, [GLOBAL(pw_64)]
+ pshuflw k0k1k4k5, m4, 0b ;k0_k1
+ pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
+ pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
+ pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
+%else
+ %define k0k1k4k5 [rsp + 16*0]
+ %define k2k3k6k7 [rsp + 16*1]
+ %define krd [rsp + 16*2]
+ pshuflw m6, m4, 0b ;k0_k1
+ pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
+ pshuflw m7, m4, 01010101b ;k2_k3
+ pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
+%if CONFIG_PIC=0
+ mova m1, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m1, m1 ;all ones
+ psrlw m1, 15
+ psllw m1, 6 ;aka pw_64
+%endif
+ mova k0k1k4k5, m6
+ mova k2k3k6k7, m7
+ mova krd, m1
+%endif
+ dec heightd
+
+.loop:
+ ;Do two rows at once
+ movu m4, [srcq - 3]
+ movu m5, [srcq + sstrideq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ punpckhbw m3, m5, m5
+ punpcklbw m5, m5
+ palignr m0, m1, m4, 1
+ pmaddubsw m0, k0k1k4k5
+ palignr m1, m4, 5
+ pmaddubsw m1, k2k3k6k7
+ palignr m2, m3, m5, 1
+ pmaddubsw m2, k0k1k4k5
+ palignr m3, m5, 5
+ pmaddubsw m3, k2k3k6k7
+ punpckhqdq m4, m0, m2
+ punpcklqdq m0, m2
+ punpckhqdq m5, m1, m3
+ punpcklqdq m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ movd m5, [dstq + dstrideq]
+%endif
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+ psrldq m1, m0, 4
+
+%ifidn %1, h8_avg
+ pavgb m0, m4
+ pavgb m1, m5
+%endif
+ movd [dstq], m0
+ movd [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m4, [srcq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ palignr m0, m1, m4, 1
+ palignr m1, m4, 5
+ pmaddubsw m0, k0k1k4k5
+ pmaddubsw m1, k2k3k6k7
+ psrldq m2, m0, 8
+ psrldq m3, m1, 8
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ pavgb m0, m4
+%endif
+ movd [dstq], m0
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER8 1
+cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+ dec heightd
+
+.loop:
+ ;Do two rows at once
+ movu m0, [srcq - 3]
+ movu m4, [srcq + sstrideq - 3]
+ punpckhbw m1, m0, m0
+ punpcklbw m0, m0
+ palignr m5, m1, m0, 13
+ pmaddubsw m5, k6k7
+ palignr m2, m1, m0, 5
+ palignr m3, m1, m0, 9
+ palignr m1, m0, 1
+ pmaddubsw m1, k0k1
+ punpckhbw m6, m4, m4
+ punpcklbw m4, m4
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+
+ palignr m7, m6, m4, 13
+ palignr m0, m6, m4, 5
+ pmaddubsw m7, k6k7
+ paddsw m1, m3
+ paddsw m2, m5
+ paddsw m1, m2
+%ifidn %1, h8_avg
+ movh m2, [dstq]
+ movhps m2, [dstq + dstrideq]
+%endif
+ palignr m5, m6, m4, 9
+ palignr m6, m4, 1
+ pmaddubsw m0, k2k3
+ pmaddubsw m6, k0k1
+ paddsw m1, krd
+ pmaddubsw m5, k4k5
+ psraw m1, 7
+ paddsw m0, m7
+ paddsw m6, m5
+ paddsw m6, m0
+ paddsw m6, krd
+ psraw m6, 7
+ packuswb m1, m6
+%ifidn %1, h8_avg
+ pavgb m1, m2
+%endif
+ movh [dstq], m1
+ movhps [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m0, [srcq - 3]
+ punpckhbw m3, m0, m0
+ punpcklbw m0, m0
+ palignr m1, m3, m0, 1
+ palignr m2, m3, m0, 5
+ palignr m4, m3, m0, 13
+ palignr m3, m0, 9
+ pmaddubsw m1, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+ pmaddubsw m4, k6k7
+ paddsw m1, m3
+ paddsw m4, m2
+ paddsw m1, m4
+ paddsw m1, krd
+ psraw m1, 7
+ packuswb m1, m1
+%ifidn %1, h8_avg
+ movh m0, [dstq]
+ pavgb m1, m0
+%endif
+ movh [dstq], m1
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER16 1
+cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+.loop:
+ prefetcht0 [srcq + 2 * sstrideq -3]
+
+ movu m0, [srcq - 3]
+ movu m4, [srcq - 2]
+ pmaddubsw m0, k0k1
+ pmaddubsw m4, k0k1
+ movu m1, [srcq - 1]
+ movu m5, [srcq + 0]
+ pmaddubsw m1, k2k3
+ pmaddubsw m5, k2k3
+ movu m2, [srcq + 1]
+ movu m6, [srcq + 2]
+ pmaddubsw m2, k4k5
+ pmaddubsw m6, k4k5
+ movu m3, [srcq + 3]
+ movu m7, [srcq + 4]
+ pmaddubsw m3, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m4, m6
+ paddsw m5, m7
+ paddsw m4, m5
+ paddsw m0, krd
+ paddsw m4, krd
+ psraw m0, 7
+ psraw m4, 7
+ packuswb m0, m0
+ packuswb m4, m4
+ punpcklbw m0, m4
+%ifidn %1, h8_avg
+ pavgb m0, [dstq]
+%endif
+ lea srcq, [srcq + sstrideq]
+ mova [dstq], m0
+ lea dstq, [dstq + dstrideq]
+ dec heightd
+ jnz .loop
+ REP_RET
+%endm
+
+INIT_XMM ssse3
+SUBPIX_HFILTER16 h8 ; vpx_filter_block1d16_h8_ssse3
+SUBPIX_HFILTER16 h8_avg ; vpx_filter_block1d16_h8_avg_ssse3
+SUBPIX_HFILTER8 h8 ; vpx_filter_block1d8_h8_ssse3
+SUBPIX_HFILTER8 h8_avg ; vpx_filter_block1d8_h8_avg_ssse3
+SUBPIX_HFILTER4 h8 ; vpx_filter_block1d4_h8_ssse3
+SUBPIX_HFILTER4 h8_avg ; vpx_filter_block1d4_h8_avg_ssse3
+
+;-------------------------------------------------------------------------------
+
+; TODO(Linfeng): Detect cpu type and choose the code with better performance.
+%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
+
+%if VPX_ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+ %define NUM_GENERAL_REG_USED 9
+%else
+ %define NUM_GENERAL_REG_USED 6
+%endif
+
+%macro SUBPIX_VFILTER 2
+cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+%ifidn %2, 8
+ %define movx movh
+%else
+ %define movx movd
+%endif
+
+ dec heightd
+
+%if VPX_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if VPX_ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ mov src1q, srcq
+ add src1q, sstrideq
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
+.loop:
+ ;Do two rows at once
+ movx m0, [srcq ] ;A
+ movx m1, [src1q ] ;B
+ punpcklbw m0, m1 ;A B
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ pmaddubsw m0, k0k1
+ mova m6, m2
+ movx m3, [src1q + sstrideq * 2] ;D
+ punpcklbw m2, m3 ;C D
+ pmaddubsw m2, k2k3
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ mova m7, m4
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m4, k4k5
+ punpcklbw m1, m6 ;A B next iter
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m5, m6 ;E F next iter
+ punpcklbw m3, m7 ;C D next iter
+ pmaddubsw m5, k4k5
+ movx m7, [src1q + sstride6q ] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m6, k6k7
+ pmaddubsw m3, k2k3
+ pmaddubsw m1, k0k1
+ paddsw m0, m4
+ paddsw m2, m6
+ movx m6, [srcq + sstrideq * 8 ] ;H next iter
+ punpcklbw m7, m6
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ paddsw m1, m5
+ packuswb m0, m0
+
+ paddsw m3, m7
+ paddsw m1, m3
+ paddsw m1, krd
+ psraw m1, 7
+ lea srcq, [srcq + sstrideq * 2 ]
+ lea src1q, [src1q + sstrideq * 2]
+ packuswb m1, m1
+
+%ifidn %1, v8_avg
+ movx m2, [dstq]
+ pavgb m0, m2
+%endif
+ movx [dstq], m0
+ add dstq, dst_stride
+%ifidn %1, v8_avg
+ movx m3, [dstq]
+ pavgb m1, m3
+%endif
+ movx [dstq], m1
+ add dstq, dst_stride
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m0, m1 ;A B
+ movx m7, [src1q + sstride6q ] ;H
+ pmaddubsw m0, k0k1
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ punpcklbw m6, m7 ;G H
+ movx m3, [src1q + sstrideq * 2] ;D
+ pmaddubsw m6, k6k7
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ punpcklbw m2, m3 ;C D
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ paddsw m2, m6
+ paddsw m0, m4
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%else
+ ; VPX_ARCH_X86_64
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m2, [srcq] ;C
+ movx m3, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m4, [srcq] ;E
+ movx m5, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m6, [srcq] ;G
+ punpcklbw m0, m1 ;A B
+ punpcklbw m1, m2 ;A B next iter
+ punpcklbw m2, m3 ;C D
+ punpcklbw m3, m4 ;C D next iter
+ punpcklbw m4, m5 ;E F
+ punpcklbw m5, m6 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ movx m7, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m14, [srcq] ;H next iter
+ punpcklbw m6, m7 ;G H
+ punpcklbw m7, m14 ;G H next iter
+ pmaddubsw m8, m0, k0k1
+ pmaddubsw m9, m1, k0k1
+ mova m0, m2
+ mova m1, m3
+ pmaddubsw m10, m2, k2k3
+ pmaddubsw m11, m3, k2k3
+ mova m2, m4
+ mova m3, m5
+ pmaddubsw m4, k4k5
+ pmaddubsw m5, k4k5
+ paddsw m8, m4
+ paddsw m9, m5
+ mova m4, m6
+ mova m5, m7
+ pmaddubsw m6, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m10, m6
+ paddsw m11, m7
+ paddsw m8, m10
+ paddsw m9, m11
+ mova m6, m14
+ paddsw m8, krd
+ paddsw m9, krd
+ psraw m8, 7
+ psraw m9, 7
+%ifidn %2, 4
+ packuswb m8, m8
+ packuswb m9, m9
+%else
+ packuswb m8, m9
+%endif
+
+%ifidn %1, v8_avg
+ movx m7, [dstq]
+%ifidn %2, 4
+ movx m10, [dstq + dstrideq]
+ pavgb m9, m10
+%else
+ movhpd m7, [dstq + dstrideq]
+%endif
+ pavgb m8, m7
+%endif
+ movx [dstq], m8
+%ifidn %2, 4
+ movx [dstq + dstrideq], m9
+%else
+ movhpd [dstq + dstrideq], m8
+%endif
+
+ lea dstq, [dstq + dstrideq * 2 ]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m7, [srcq + sstrideq] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ pmaddubsw m6, k6k7
+ paddsw m0, m4
+ paddsw m2, m6
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%endif ; VPX_ARCH_X86_64
+
+.done:
+ REP_RET
+
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_VFILTER16 1
+cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+%if VPX_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if VPX_ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ lea src1q, [srcq + sstrideq]
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
+.loop:
+ movh m0, [srcq ] ;A
+ movh m1, [src1q ] ;B
+ movh m2, [srcq + sstrideq * 2 ] ;C
+ movh m3, [src1q + sstrideq * 2] ;D
+ movh m4, [srcq + sstrideq * 4 ] ;E
+ movh m5, [src1q + sstrideq * 4] ;F
+
+ punpcklbw m0, m1 ;A B
+ movh m6, [srcq + sstride6q] ;G
+ punpcklbw m2, m3 ;C D
+ movh m7, [src1q + sstride6q] ;H
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m0, k0k1
+ movh m3, [srcq + 8] ;A
+ pmaddubsw m2, k2k3
+ punpcklbw m6, m7 ;G H
+ movh m5, [srcq + sstrideq + 8] ;B
+ pmaddubsw m4, k4k5
+ punpcklbw m3, m5 ;A B
+ movh m7, [srcq + sstrideq * 2 + 8] ;C
+ pmaddubsw m6, k6k7
+ movh m5, [src1q + sstrideq * 2 + 8] ;D
+ punpcklbw m7, m5 ;C D
+ paddsw m2, m6
+ pmaddubsw m3, k0k1
+ movh m1, [srcq + sstrideq * 4 + 8] ;E
+ paddsw m0, m4
+ pmaddubsw m7, k2k3
+ movh m6, [src1q + sstrideq * 4 + 8] ;F
+ punpcklbw m1, m6 ;E F
+ paddsw m0, m2
+ paddsw m0, krd
+ movh m2, [srcq + sstride6q + 8] ;G
+ pmaddubsw m1, k4k5
+ movh m5, [src1q + sstride6q + 8] ;H
+ psraw m0, 7
+ punpcklbw m2, m5 ;G H
+ pmaddubsw m2, k6k7
+ paddsw m7, m2
+ paddsw m3, m1
+ paddsw m3, m7
+ paddsw m3, krd
+ psraw m3, 7
+ packuswb m0, m3
+
+ add srcq, sstrideq
+ add src1q, sstrideq
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dst_stride
+ dec heightd
+ jnz .loop
+ REP_RET
+
+%else
+ ; VPX_ARCH_X86_64
+ dec heightd
+
+ movu m1, [srcq ] ;A
+ movu m3, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m0, m1, m3 ;A B
+ punpckhbw m1, m3 ;A B
+ movu m5, [srcq] ;C
+ punpcklbw m2, m3, m5 ;A B next iter
+ punpckhbw m3, m5 ;A B next iter
+ mova tmp0, m2 ;store to stack
+ mova tmp1, m3 ;store to stack
+ movu m7, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m4, m5, m7 ;C D
+ punpckhbw m5, m7 ;C D
+ movu m9, [srcq] ;E
+ punpcklbw m6, m7, m9 ;C D next iter
+ punpckhbw m7, m9 ;C D next iter
+ movu m11, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m8, m9, m11 ;E F
+ punpckhbw m9, m11 ;E F
+ movu m2, [srcq] ;G
+ punpcklbw m10, m11, m2 ;E F next iter
+ punpckhbw m11, m2 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ pmaddubsw m13, m0, k0k1
+ mova m0, m4
+ pmaddubsw m14, m8, k4k5
+ pmaddubsw m15, m4, k2k3
+ mova m4, m8
+ paddsw m13, m14
+ movu m3, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m14, m2, m3 ;G H
+ mova m8, m14
+ pmaddubsw m14, k6k7
+ paddsw m15, m14
+ paddsw m13, m15
+ paddsw m13, krd
+ psraw m13, 7
+
+ pmaddubsw m14, m1, k0k1
+ pmaddubsw m1, m9, k4k5
+ pmaddubsw m15, m5, k2k3
+ paddsw m14, m1
+ mova m1, m5
+ mova m5, m9
+ punpckhbw m2, m3 ;G H
+ mova m9, m2
+ pmaddubsw m2, k6k7
+ paddsw m15, m2
+ paddsw m14, m15
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m13, m14
+%ifidn %1, v8_avg
+ pavgb m13, [dstq]
+%endif
+ mova [dstq], m13
+
+ ; next iter
+ pmaddubsw m15, tmp0, k0k1
+ pmaddubsw m14, m10, k4k5
+ pmaddubsw m13, m6, k2k3
+ paddsw m15, m14
+ mova tmp0, m6
+ mova m6, m10
+ movu m2, [srcq] ;G next iter
+ punpcklbw m14, m3, m2 ;G H next iter
+ mova m10, m14
+ pmaddubsw m14, k6k7
+ paddsw m13, m14
+ paddsw m15, m13
+ paddsw m15, krd
+ psraw m15, 7
+
+ pmaddubsw m14, tmp1, k0k1
+ mova tmp1, m7
+ pmaddubsw m13, m7, k2k3
+ mova m7, m11
+ pmaddubsw m11, k4k5
+ paddsw m14, m11
+ punpckhbw m3, m2 ;G H next iter
+ mova m11, m3
+ pmaddubsw m3, k6k7
+ paddsw m13, m3
+ paddsw m14, m13
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m15, m14
+%ifidn %1, v8_avg
+ pavgb m15, [dstq + dstrideq]
+%endif
+ mova [dstq + dstrideq], m15
+ lea dstq, [dstq + dstrideq * 2]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m3, [srcq + sstrideq] ;H
+ punpcklbw m6, m2, m3 ;G H
+ punpckhbw m2, m3 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m1, k0k1
+ pmaddubsw m4, k2k3
+ pmaddubsw m5, k2k3
+ pmaddubsw m8, k4k5
+ pmaddubsw m9, k4k5
+ pmaddubsw m6, k6k7
+ pmaddubsw m2, k6k7
+ paddsw m0, m8
+ paddsw m1, m9
+ paddsw m4, m6
+ paddsw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m0, krd
+ paddsw m1, krd
+ psraw m0, 7
+ psraw m1, 7
+ packuswb m0, m1
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+
+.done:
+ REP_RET
+
+%endif ; VPX_ARCH_X86_64
+
+%endm
+
+INIT_XMM ssse3
+SUBPIX_VFILTER16 v8 ; vpx_filter_block1d16_v8_ssse3
+SUBPIX_VFILTER16 v8_avg ; vpx_filter_block1d16_v8_avg_ssse3
+SUBPIX_VFILTER v8, 8 ; vpx_filter_block1d8_v8_ssse3
+SUBPIX_VFILTER v8_avg, 8 ; vpx_filter_block1d8_v8_avg_ssse3
+SUBPIX_VFILTER v8, 4 ; vpx_filter_block1d4_v8_ssse3
+SUBPIX_VFILTER v8_avg, 4 ; vpx_filter_block1d4_v8_avg_ssse3
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000000..65790b1c21
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
@@ -0,0 +1,450 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklqdq xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ pxor xmm2, xmm2
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+
+    punpckldq   xmm0, xmm1                  ;two rows in one register
+ punpcklbw xmm0, xmm2 ;unpack to word
+ pmullw xmm0, xmm4 ;multiply the filter factors
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+
+ paddsw xmm0, xmm3 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+
+ pshuflw xmm6, xmm7, 11111111b ;k3
+ pshufhw xmm7, xmm7, 0b ;k4
+ punpcklwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ pxor xmm5, xmm5
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm4 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ pmullw xmm2, xmm6
+ pmullw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm3
+
+ paddsw xmm0, xmm4 ;rounding
+ paddsw xmm2, xmm4
+ psraw xmm0, 7 ;shift
+ psraw xmm2, 7
+ packuswb xmm0, xmm2 ;pack back to byte
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
+globalsym(vpx_filter_block1d4_v2_sse2)
+sym(vpx_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_v2_sse2)
+sym(vpx_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_v2_sse2)
+sym(vpx_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_v2_avg_sse2)
+sym(vpx_filter_block1d4_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_v2_avg_sse2)
+sym(vpx_filter_block1d8_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_v2_avg_sse2)
+sym(vpx_filter_block1d16_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_h2_sse2)
+sym(vpx_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_h2_sse2)
+sym(vpx_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_h2_sse2)
+sym(vpx_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_h2_avg_sse2)
+sym(vpx_filter_block1d4_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_h2_avg_sse2)
+sym(vpx_filter_block1d8_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_h2_avg_sse2)
+sym(vpx_filter_block1d16_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
new file mode 100644
index 0000000000..32e3cd3d9f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
@@ -0,0 +1,420 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
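+; GET_PARAM_4 / GET_PARAM load the arguments and build the constants used by
+; the APPLY_FILTER_* macros below: the 128-bit filter load is shifted right by
+; six bytes so the taps labelled k3/k4 land in the low words, packed to signed
+; bytes and broadcast (xmm3 for the 4-wide path, xmm7 for the 8/16-wide one),
+; and 0x01000100 is broadcast into xmm2 / xmm6 as the pmulhrsw multiplier that
+; performs the "+64 then >>7" rounding in a single instruction.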
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
+
+ movdqa xmm3, [rdx] ;load filters
+ psrldq xmm3, 6
+ packsswb xmm3, xmm3
+ pshuflw xmm3, xmm3, 0b ;k3_k4
+
+ movd xmm2, ecx ;rounding_shift
+ pshufd xmm2, xmm2, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
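+; APPLY_FILTER_4: filter four pixels. The two inputs are interleaved bytewise
+; and multiply-accumulated against the packed k3/k4 byte pairs in xmm3 with
+; pmaddubsw, then pmulhrsw against 0x0100 rounds and shifts in one step.
+; With a non-zero macro argument (the _avg entry points) the result is
+; averaged with the bytes already at [rdi] before the store.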
+%macro APPLY_FILTER_4 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm3
+
+ pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
+
+ movdqa xmm7, [rdx] ;load filters
+ psrldq xmm7, 6
+ packsswb xmm7, xmm7
+ pshuflw xmm7, xmm7, 0b ;k3_k4
+ punpcklwd xmm7, xmm7
+
+ movd xmm6, ecx ;rounding_shift
+ pshufd xmm6, xmm6, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
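+; APPLY_FILTER_8: same scheme as APPLY_FILTER_4, but for eight pixels and
+; using the constants prepared by GET_PARAM (taps in xmm7, rounding in xmm6).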
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack back to byte
+
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
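+; APPLY_FILTER_16: 16-pixel version. xmm2 must hold a copy of the first input
+; (set up by the callers) so the high eight bytes can be interleaved with the
+; second input and filtered separately before the two halves are repacked.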
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm2, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ pmulhrsw xmm2, xmm6
+ packuswb xmm0, xmm2 ;pack back to byte
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
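+; The entry points below mirror the SSE2 versions above: *_v2_* filters
+; vertically adjacent rows ([rsi] and [rsi + rax]), *_h2_* horizontally
+; adjacent pixels ([rsi] and [rsi + 1]), and the _avg variants blend the
+; result with the destination via pavgb.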
+globalsym(vpx_filter_block1d4_v2_ssse3)
+sym(vpx_filter_block1d4_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_v2_ssse3)
+sym(vpx_filter_block1d8_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_v2_ssse3)
+sym(vpx_filter_block1d16_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_v2_avg_ssse3)
+sym(vpx_filter_block1d4_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_v2_avg_ssse3)
+sym(vpx_filter_block1d8_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_v2_avg_ssse3)
+sym(vpx_filter_block1d16_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_h2_ssse3)
+sym(vpx_filter_block1d4_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_h2_ssse3)
+sym(vpx_filter_block1d8_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_h2_ssse3)
+sym(vpx_filter_block1d16_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d4_h2_avg_ssse3)
+sym(vpx_filter_block1d4_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d8_h2_avg_ssse3)
+sym(vpx_filter_block1d8_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(vpx_filter_block1d16_h2_avg_ssse3)
+sym(vpx_filter_block1d16_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret