summaryrefslogtreecommitdiffstats
path: root/media/libvpx/libvpx/vp9/common/x86
diff options
context:
space:
mode:
Diffstat (limited to 'media/libvpx/libvpx/vp9/common/x86')
-rw-r--r-- media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c | 419
-rw-r--r-- media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c | 131
-rw-r--r-- media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c | 255
-rw-r--r-- media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c | 224
-rw-r--r-- media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm | 289
5 files changed, 1318 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
new file mode 100644
index 0000000000..57b79a732d
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,  // Scales |in| by the constant 4 * c.
+ const int c,  // Transform constant; callers in this file pass +/-cospi_16_64.
+ __m128i *const s) {  // Output: s[0] and s[1] receive the 64-bit products.
+ const __m128i pair_c = pair_set_epi32(4 * c, 0);
+ __m128i x[2];
+ // NOTE(review): extend_64bit() is defined elsewhere; presumably it spreads the 32-bit lanes of |in| across x[0] and x[1] - confirm.
+ extend_64bit(in, x);
+ s[0] = _mm_mul_epi32(pair_c, x[0]);  // _mm_mul_epi32 multiplies the low signed 32 bits of each 64-bit lane.
+ s[1] = _mm_mul_epi32(pair_c, x[1]);  // No rounding here; callers apply dct_const_round_shift_64bit().
+}
+
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,  // Computes s0 = in0 * c0 + in1 * c1 and
+ const __m128i in1,  // s1 = in0 * c1 - in1 * c0 using 64-bit
+ const int c0, const int c1,  // products; both constants are scaled by 4 first.
+ __m128i *const s0,  // Output: s0[0] and s0[1] (two 64-bit halves).
+ __m128i *const s1) {  // Output: s1[0] and s1[1] (two 64-bit halves).
+ const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+ const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+ __m128i t00[2], t01[2], t10[2], t11[2];
+ __m128i x0[2], x1[2];
+ // NOTE(review): extend_64bit() is defined elsewhere; presumably it prepares the 32-bit lanes for _mm_mul_epi32 - confirm.
+ extend_64bit(in0, x0);
+ extend_64bit(in1, x1);
+ t00[0] = _mm_mul_epi32(pair_c0, x0[0]);  // t00 = c0 * in0
+ t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+ t01[0] = _mm_mul_epi32(pair_c0, x1[0]);  // t01 = c0 * in1
+ t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+ t10[0] = _mm_mul_epi32(pair_c1, x0[0]);  // t10 = c1 * in0
+ t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+ t11[0] = _mm_mul_epi32(pair_c1, x1[0]);  // t11 = c1 * in1
+ t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+ // Sums stay in 64 bits; rounding and packing are left to the caller.
+ s0[0] = _mm_add_epi64(t00[0], t11[0]);
+ s0[1] = _mm_add_epi64(t00[1], t11[1]);
+ s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+ s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
+static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) {  // In-place transform of io[0..15]; per its name, a 16-point inverse ADST over 4 columns.
+ __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2],
+ s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
+ __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2],
+ x10[2], x11[2], x12[2], x13[2], x14[2], x15[2];
+ // Two-element arrays hold a value as two 64-bit halves; after pack_4() the 32-bit result lives in element [0].
+ // stage 1: butterflies pair io[15 - 2k] with io[2k] for k = 0..7.
+ highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1);
+ highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3);
+ highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5);
+ highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7);
+ highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9);
+ highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10,
+ s11);
+ highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12,
+ s13);
+ highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14,
+ s15);
+ // Combine in 64 bits: x[k] = s[k] + s[k + 8] and x[k + 8] = s[k] - s[k + 8] for k = 0..7.
+ x0[0] = _mm_add_epi64(s0[0], s8[0]);
+ x0[1] = _mm_add_epi64(s0[1], s8[1]);
+ x1[0] = _mm_add_epi64(s1[0], s9[0]);
+ x1[1] = _mm_add_epi64(s1[1], s9[1]);
+ x2[0] = _mm_add_epi64(s2[0], s10[0]);
+ x2[1] = _mm_add_epi64(s2[1], s10[1]);
+ x3[0] = _mm_add_epi64(s3[0], s11[0]);
+ x3[1] = _mm_add_epi64(s3[1], s11[1]);
+ x4[0] = _mm_add_epi64(s4[0], s12[0]);
+ x4[1] = _mm_add_epi64(s4[1], s12[1]);
+ x5[0] = _mm_add_epi64(s5[0], s13[0]);
+ x5[1] = _mm_add_epi64(s5[1], s13[1]);
+ x6[0] = _mm_add_epi64(s6[0], s14[0]);
+ x6[1] = _mm_add_epi64(s6[1], s14[1]);
+ x7[0] = _mm_add_epi64(s7[0], s15[0]);
+ x7[1] = _mm_add_epi64(s7[1], s15[1]);
+ x8[0] = _mm_sub_epi64(s0[0], s8[0]);
+ x8[1] = _mm_sub_epi64(s0[1], s8[1]);
+ x9[0] = _mm_sub_epi64(s1[0], s9[0]);
+ x9[1] = _mm_sub_epi64(s1[1], s9[1]);
+ x10[0] = _mm_sub_epi64(s2[0], s10[0]);
+ x10[1] = _mm_sub_epi64(s2[1], s10[1]);
+ x11[0] = _mm_sub_epi64(s3[0], s11[0]);
+ x11[1] = _mm_sub_epi64(s3[1], s11[1]);
+ x12[0] = _mm_sub_epi64(s4[0], s12[0]);
+ x12[1] = _mm_sub_epi64(s4[1], s12[1]);
+ x13[0] = _mm_sub_epi64(s5[0], s13[0]);
+ x13[1] = _mm_sub_epi64(s5[1], s13[1]);
+ x14[0] = _mm_sub_epi64(s6[0], s14[0]);
+ x14[1] = _mm_sub_epi64(s6[1], s14[1]);
+ x15[0] = _mm_sub_epi64(s7[0], s15[0]);
+ x15[1] = _mm_sub_epi64(s7[1], s15[1]);
+ // Round/shift every 64-bit half, then pack_4() merges each pair of halves into x*[0].
+ x0[0] = dct_const_round_shift_64bit(x0[0]);
+ x0[1] = dct_const_round_shift_64bit(x0[1]);
+ x1[0] = dct_const_round_shift_64bit(x1[0]);
+ x1[1] = dct_const_round_shift_64bit(x1[1]);
+ x2[0] = dct_const_round_shift_64bit(x2[0]);
+ x2[1] = dct_const_round_shift_64bit(x2[1]);
+ x3[0] = dct_const_round_shift_64bit(x3[0]);
+ x3[1] = dct_const_round_shift_64bit(x3[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ x8[0] = dct_const_round_shift_64bit(x8[0]);
+ x8[1] = dct_const_round_shift_64bit(x8[1]);
+ x9[0] = dct_const_round_shift_64bit(x9[0]);
+ x9[1] = dct_const_round_shift_64bit(x9[1]);
+ x10[0] = dct_const_round_shift_64bit(x10[0]);
+ x10[1] = dct_const_round_shift_64bit(x10[1]);
+ x11[0] = dct_const_round_shift_64bit(x11[0]);
+ x11[1] = dct_const_round_shift_64bit(x11[1]);
+ x12[0] = dct_const_round_shift_64bit(x12[0]);
+ x12[1] = dct_const_round_shift_64bit(x12[1]);
+ x13[0] = dct_const_round_shift_64bit(x13[0]);
+ x13[1] = dct_const_round_shift_64bit(x13[1]);
+ x14[0] = dct_const_round_shift_64bit(x14[0]);
+ x14[1] = dct_const_round_shift_64bit(x14[1]);
+ x15[0] = dct_const_round_shift_64bit(x15[0]);
+ x15[1] = dct_const_round_shift_64bit(x15[1]);
+ x0[0] = pack_4(x0[0], x0[1]);
+ x1[0] = pack_4(x1[0], x1[1]);
+ x2[0] = pack_4(x2[0], x2[1]);
+ x3[0] = pack_4(x3[0], x3[1]);
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ x8[0] = pack_4(x8[0], x8[1]);
+ x9[0] = pack_4(x9[0], x9[1]);
+ x10[0] = pack_4(x10[0], x10[1]);
+ x11[0] = pack_4(x11[0], x11[1]);
+ x12[0] = pack_4(x12[0], x12[1]);
+ x13[0] = pack_4(x13[0], x13[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ // stage 2: x0..x7 need only 32-bit add/sub; x8..x15 go through butterflies.
+ s0[0] = x0[0];
+ s1[0] = x1[0];
+ s2[0] = x2[0];
+ s3[0] = x3[0];
+ s4[0] = x4[0];
+ s5[0] = x5[0];
+ s6[0] = x6[0];
+ s7[0] = x7[0];
+ x0[0] = _mm_add_epi32(s0[0], s4[0]);
+ x1[0] = _mm_add_epi32(s1[0], s5[0]);
+ x2[0] = _mm_add_epi32(s2[0], s6[0]);
+ x3[0] = _mm_add_epi32(s3[0], s7[0]);
+ x4[0] = _mm_sub_epi32(s0[0], s4[0]);
+ x5[0] = _mm_sub_epi32(s1[0], s5[0]);
+ x6[0] = _mm_sub_epi32(s2[0], s6[0]);
+ x7[0] = _mm_sub_epi32(s3[0], s7[0]);
+ // The last two calls pass inputs and outputs in swapped order (x13 before x12, s13 before s12, and likewise for 15/14).
+ highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9);
+ highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10,
+ s11);
+ highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13,
+ s12);
+ highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15,
+ s14);
+ // 64-bit combine: x[k] = s[k] + s[k + 4] and x[k + 4] = s[k] - s[k + 4] for k = 8..11, then round and pack.
+ x8[0] = _mm_add_epi64(s8[0], s12[0]);
+ x8[1] = _mm_add_epi64(s8[1], s12[1]);
+ x9[0] = _mm_add_epi64(s9[0], s13[0]);
+ x9[1] = _mm_add_epi64(s9[1], s13[1]);
+ x10[0] = _mm_add_epi64(s10[0], s14[0]);
+ x10[1] = _mm_add_epi64(s10[1], s14[1]);
+ x11[0] = _mm_add_epi64(s11[0], s15[0]);
+ x11[1] = _mm_add_epi64(s11[1], s15[1]);
+ x12[0] = _mm_sub_epi64(s8[0], s12[0]);
+ x12[1] = _mm_sub_epi64(s8[1], s12[1]);
+ x13[0] = _mm_sub_epi64(s9[0], s13[0]);
+ x13[1] = _mm_sub_epi64(s9[1], s13[1]);
+ x14[0] = _mm_sub_epi64(s10[0], s14[0]);
+ x14[1] = _mm_sub_epi64(s10[1], s14[1]);
+ x15[0] = _mm_sub_epi64(s11[0], s15[0]);
+ x15[1] = _mm_sub_epi64(s11[1], s15[1]);
+ x8[0] = dct_const_round_shift_64bit(x8[0]);
+ x8[1] = dct_const_round_shift_64bit(x8[1]);
+ x9[0] = dct_const_round_shift_64bit(x9[0]);
+ x9[1] = dct_const_round_shift_64bit(x9[1]);
+ x10[0] = dct_const_round_shift_64bit(x10[0]);
+ x10[1] = dct_const_round_shift_64bit(x10[1]);
+ x11[0] = dct_const_round_shift_64bit(x11[0]);
+ x11[1] = dct_const_round_shift_64bit(x11[1]);
+ x12[0] = dct_const_round_shift_64bit(x12[0]);
+ x12[1] = dct_const_round_shift_64bit(x12[1]);
+ x13[0] = dct_const_round_shift_64bit(x13[0]);
+ x13[1] = dct_const_round_shift_64bit(x13[1]);
+ x14[0] = dct_const_round_shift_64bit(x14[0]);
+ x14[1] = dct_const_round_shift_64bit(x14[1]);
+ x15[0] = dct_const_round_shift_64bit(x15[0]);
+ x15[1] = dct_const_round_shift_64bit(x15[1]);
+ x8[0] = pack_4(x8[0], x8[1]);
+ x9[0] = pack_4(x9[0], x9[1]);
+ x10[0] = pack_4(x10[0], x10[1]);
+ x11[0] = pack_4(x11[0], x11[1]);
+ x12[0] = pack_4(x12[0], x12[1]);
+ x13[0] = pack_4(x13[0], x13[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ // stage 3: x0..x3 and x8..x11 pass through; x4..x7 and x12..x15 use the cospi_8_64 / cospi_24_64 butterflies.
+ s0[0] = x0[0];
+ s1[0] = x1[0];
+ s2[0] = x2[0];
+ s3[0] = x3[0];
+ highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+ highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+ s8[0] = x8[0];
+ s9[0] = x9[0];
+ s10[0] = x10[0];
+ s11[0] = x11[0];
+ highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12,
+ s13);
+ highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15,
+ s14);
+ // Pass-through values combine with 32-bit add/sub; butterfly outputs combine in 64 bits, then round and pack.
+ x0[0] = _mm_add_epi32(s0[0], s2[0]);
+ x1[0] = _mm_add_epi32(s1[0], s3[0]);
+ x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+ x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+ x4[0] = _mm_add_epi64(s4[0], s6[0]);
+ x4[1] = _mm_add_epi64(s4[1], s6[1]);
+ x5[0] = _mm_add_epi64(s5[0], s7[0]);
+ x5[1] = _mm_add_epi64(s5[1], s7[1]);
+ x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+ x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+ x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+ x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ x8[0] = _mm_add_epi32(s8[0], s10[0]);
+ x9[0] = _mm_add_epi32(s9[0], s11[0]);
+ x10[0] = _mm_sub_epi32(s8[0], s10[0]);
+ x11[0] = _mm_sub_epi32(s9[0], s11[0]);
+ x12[0] = _mm_add_epi64(s12[0], s14[0]);
+ x12[1] = _mm_add_epi64(s12[1], s14[1]);
+ x13[0] = _mm_add_epi64(s13[0], s15[0]);
+ x13[1] = _mm_add_epi64(s13[1], s15[1]);
+ x14[0] = _mm_sub_epi64(s12[0], s14[0]);
+ x14[1] = _mm_sub_epi64(s12[1], s14[1]);
+ x15[0] = _mm_sub_epi64(s13[0], s15[0]);
+ x15[1] = _mm_sub_epi64(s13[1], s15[1]);
+ x12[0] = dct_const_round_shift_64bit(x12[0]);
+ x12[1] = dct_const_round_shift_64bit(x12[1]);
+ x13[0] = dct_const_round_shift_64bit(x13[0]);
+ x13[1] = dct_const_round_shift_64bit(x13[1]);
+ x14[0] = dct_const_round_shift_64bit(x14[0]);
+ x14[1] = dct_const_round_shift_64bit(x14[1]);
+ x15[0] = dct_const_round_shift_64bit(x15[0]);
+ x15[1] = dct_const_round_shift_64bit(x15[1]);
+ x12[0] = pack_4(x12[0], x12[1]);
+ x13[0] = pack_4(x13[0], x13[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ // stage 4: 32-bit sums/differences of eight terms are scaled by +/-cospi_16_64 (negative for s2 and s14).
+ s2[0] = _mm_add_epi32(x2[0], x3[0]);
+ s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+ s6[0] = _mm_add_epi32(x7[0], x6[0]);
+ s7[0] = _mm_sub_epi32(x7[0], x6[0]);
+ s10[0] = _mm_add_epi32(x11[0], x10[0]);
+ s11[0] = _mm_sub_epi32(x11[0], x10[0]);
+ s14[0] = _mm_add_epi32(x14[0], x15[0]);
+ s15[0] = _mm_sub_epi32(x14[0], x15[0]);
+ highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2);
+ highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+ highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+ highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+ highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10);
+ highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11);
+ highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14);
+ highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15);
+ // Round and pack the eight scaled terms back into x*[0].
+ x2[0] = dct_const_round_shift_64bit(s2[0]);
+ x2[1] = dct_const_round_shift_64bit(s2[1]);
+ x3[0] = dct_const_round_shift_64bit(s3[0]);
+ x3[1] = dct_const_round_shift_64bit(s3[1]);
+ x6[0] = dct_const_round_shift_64bit(s6[0]);
+ x6[1] = dct_const_round_shift_64bit(s6[1]);
+ x7[0] = dct_const_round_shift_64bit(s7[0]);
+ x7[1] = dct_const_round_shift_64bit(s7[1]);
+ x10[0] = dct_const_round_shift_64bit(s10[0]);
+ x10[1] = dct_const_round_shift_64bit(s10[1]);
+ x11[0] = dct_const_round_shift_64bit(s11[0]);
+ x11[1] = dct_const_round_shift_64bit(s11[1]);
+ x14[0] = dct_const_round_shift_64bit(s14[0]);
+ x14[1] = dct_const_round_shift_64bit(s14[1]);
+ x15[0] = dct_const_round_shift_64bit(s15[0]);
+ x15[1] = dct_const_round_shift_64bit(s15[1]);
+ x2[0] = pack_4(x2[0], x2[1]);
+ x3[0] = pack_4(x3[0], x3[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ x10[0] = pack_4(x10[0], x10[1]);
+ x11[0] = pack_4(x11[0], x11[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+ // Output permutation; io[1], io[3], io[13] and io[15] are negated by subtracting from zero.
+ io[0] = x0[0];
+ io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]);
+ io[2] = x12[0];
+ io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+ io[4] = x6[0];
+ io[5] = x14[0];
+ io[6] = x10[0];
+ io[7] = x2[0];
+ io[8] = x3[0];
+ io[9] = x11[0];
+ io[10] = x15[0];
+ io[11] = x7[0];
+ io[12] = x5[0];
+ io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]);
+ io[14] = x9[0];
+ io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
+void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest,  // Two-pass 16x16 inverse hybrid transform of |input|; output goes to |dest| via highbd_write_buffer_8/4().
+ int stride, int tx_type, int bd) {  // |tx_type| picks DCT or ADST per pass; |bd| picks the 8-bit or high-bit-depth path.
+ int i;
+ __m128i out[16], *in;
+ // bd == 8 uses the packed 16-bit kernels; any other bit depth uses the 32-bit SSE4.1 kernels.
+ if (bd == 8) {
+ __m128i l[16], r[16];
+ // First pass: two iterations, each consuming 128 coefficients (8 rows of 16); results land in l, then r.
+ in = l;
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {  // First pass is DCT for these two types, ADST otherwise.
+ idct16_8col(in, in);
+ } else {
+ vpx_iadst16_8col_sse2(in);
+ }
+ in = r;
+ input += 128;
+ }
+ // Second pass: 8 output columns per iteration, assembled by transposing matching 8x8 tiles of l and r.
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {  // Second pass is DCT for these two types, ADST otherwise.
+ idct16_8col(out, out);
+ } else {
+ vpx_iadst16_8col_sse2(out);
+ }
+ // Emit 16 rows of 8 pixels each, then step |dest| right by 8 columns.
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[4][16];
+ // First pass: four iterations, each consuming 64 coefficients (4 rows of 16) into all[i].
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ vpx_highbd_idct16_4col_sse4_1(in);
+ } else {
+ highbd_iadst16_4col_sse4_1(in);
+ }
+ input += 4 * 16;
+ }
+ // Second pass: 4 output columns per iteration, assembled from one 4x4 tile of each all[k].
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ vpx_highbd_idct16_4col_sse4_1(out);
+ } else {
+ highbd_iadst16_4col_sse4_1(out);
+ }
+ // Emit 16 rows of 4 pixels each, then step |dest| right by 4 columns.
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c
new file mode 100644
index 0000000000..af158536f9
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst4_sse4_1(__m128i *const io) {  // In-place transform of io[0..3]; per its name, a 4-point inverse ADST.
+ const __m128i pair_c1 = pair_set_epi32(4 * sinpi_1_9, 0);
+ const __m128i pair_c2 = pair_set_epi32(4 * sinpi_2_9, 0);
+ const __m128i pair_c3 = pair_set_epi32(4 * sinpi_3_9, 0);
+ const __m128i pair_c4 = pair_set_epi32(4 * sinpi_4_9, 0);
+ __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], t0[2], t1[2], t2[2];
+ __m128i temp[2];
+ // The 4x4 block is transposed in place before the 1-D transform is applied.
+ transpose_32bit_4x4(io, io);
+ // s0 = c1 * io[0], s1 = c2 * io[0] (64-bit products, two halves each).
+ extend_64bit(io[0], temp);
+ s0[0] = _mm_mul_epi32(pair_c1, temp[0]);
+ s0[1] = _mm_mul_epi32(pair_c1, temp[1]);
+ s1[0] = _mm_mul_epi32(pair_c2, temp[0]);
+ s1[1] = _mm_mul_epi32(pair_c2, temp[1]);
+ // s2 = c3 * io[1]
+ extend_64bit(io[1], temp);
+ s2[0] = _mm_mul_epi32(pair_c3, temp[0]);
+ s2[1] = _mm_mul_epi32(pair_c3, temp[1]);
+ // s3 = c4 * io[2], s4 = c1 * io[2]
+ extend_64bit(io[2], temp);
+ s3[0] = _mm_mul_epi32(pair_c4, temp[0]);
+ s3[1] = _mm_mul_epi32(pair_c4, temp[1]);
+ s4[0] = _mm_mul_epi32(pair_c1, temp[0]);
+ s4[1] = _mm_mul_epi32(pair_c1, temp[1]);
+ // s5 = c2 * io[3], s6 = c4 * io[3]
+ extend_64bit(io[3], temp);
+ s5[0] = _mm_mul_epi32(pair_c2, temp[0]);
+ s5[1] = _mm_mul_epi32(pair_c2, temp[1]);
+ s6[0] = _mm_mul_epi32(pair_c4, temp[0]);
+ s6[1] = _mm_mul_epi32(pair_c4, temp[1]);
+ // t0 = s0 + s3 + s5, t1 = s1 - s4 - s6, t2 = c3 * (io[0] - io[2] + io[3]).
+ t0[0] = _mm_add_epi64(s0[0], s3[0]);
+ t0[1] = _mm_add_epi64(s0[1], s3[1]);
+ t0[0] = _mm_add_epi64(t0[0], s5[0]);
+ t0[1] = _mm_add_epi64(t0[1], s5[1]);
+ t1[0] = _mm_sub_epi64(s1[0], s4[0]);
+ t1[1] = _mm_sub_epi64(s1[1], s4[1]);
+ t1[0] = _mm_sub_epi64(t1[0], s6[0]);
+ t1[1] = _mm_sub_epi64(t1[1], s6[1]);
+ temp[0] = _mm_sub_epi32(io[0], io[2]);  // The input combination is formed in 32 bits before widening.
+ temp[0] = _mm_add_epi32(temp[0], io[3]);
+ extend_64bit(temp[0], temp);
+ t2[0] = _mm_mul_epi32(pair_c3, temp[0]);
+ t2[1] = _mm_mul_epi32(pair_c3, temp[1]);
+ // Outputs before rounding: t0 + s2, t1 + s2, t2, and t0 + t1 - s2.
+ s0[0] = _mm_add_epi64(t0[0], s2[0]);
+ s0[1] = _mm_add_epi64(t0[1], s2[1]);
+ s1[0] = _mm_add_epi64(t1[0], s2[0]);
+ s1[1] = _mm_add_epi64(t1[1], s2[1]);
+ s3[0] = _mm_add_epi64(t0[0], t1[0]);
+ s3[1] = _mm_add_epi64(t0[1], t1[1]);
+ s3[0] = _mm_sub_epi64(s3[0], s2[0]);
+ s3[1] = _mm_sub_epi64(s3[1], s2[1]);
+ // Round/shift each 64-bit half, then pack_4() merges the halves back into io[0..3].
+ s0[0] = dct_const_round_shift_64bit(s0[0]);
+ s0[1] = dct_const_round_shift_64bit(s0[1]);
+ s1[0] = dct_const_round_shift_64bit(s1[0]);
+ s1[1] = dct_const_round_shift_64bit(s1[1]);
+ s2[0] = dct_const_round_shift_64bit(t2[0]);  // s2 is reused here to hold the rounded t2.
+ s2[1] = dct_const_round_shift_64bit(t2[1]);
+ s3[0] = dct_const_round_shift_64bit(s3[0]);
+ s3[1] = dct_const_round_shift_64bit(s3[1]);
+ io[0] = pack_4(s0[0], s0[1]);
+ io[1] = pack_4(s1[0], s1[1]);
+ io[2] = pack_4(s2[0], s2[1]);
+ io[3] = pack_4(s3[0], s3[1]);
+}
+
+void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,  // Two-pass 4x4 inverse hybrid transform of 16 coefficients; result handed to recon_and_store_4x4().
+ int stride, int tx_type, int bd) {  // |tx_type| picks DCT or ADST per pass; |bd| picks the 8-bit or high-bit-depth path.
+ __m128i io[4];
+ // _mm_load_si128 is an aligned load, so |input| must be 16-byte aligned.
+ io[0] = _mm_load_si128((const __m128i *)(input + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 8));
+ io[3] = _mm_load_si128((const __m128i *)(input + 12));
+ // bd == 8: narrow to 16-bit lanes (signed saturation) and reuse the SSE2 4-point kernels.
+ if (bd == 8) {
+ __m128i io_short[2];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[1]);
+ io_short[1] = _mm_packs_epi32(io[2], io[3]);
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {  // First pass is DCT for these two types, ADST otherwise.
+ idct4_sse2(io_short);
+ } else {
+ iadst4_sse2(io_short);
+ }
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {  // Second pass is DCT for these two types, ADST otherwise.
+ idct4_sse2(io_short);
+ } else {
+ iadst4_sse2(io_short);
+ }
+ io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));  // Final rounding: add 8, then arithmetic shift right by 4.
+ io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+ io[0] = _mm_srai_epi16(io_short[0], 4);
+ io[1] = _mm_srai_epi16(io_short[1], 4);
+ } else {
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ highbd_idct4_sse4_1(io);
+ } else {
+ highbd_iadst4_sse4_1(io);
+ }
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ highbd_idct4_sse4_1(io);
+ } else {
+ highbd_iadst4_sse4_1(io);
+ }
+ io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));  // NOTE(review): helper defined elsewhere; presumably rounds with 8, shifts by 4 and narrows to 16 bits - confirm.
+ io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
+ }
+ // Both paths leave their final values in io[0] and io[1].
+ recon_and_store_4x4(io, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
new file mode 100644
index 0000000000..7d949b6dbc
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,  // Scales |in| by the constant 4 * c.
+ const int c,  // Transform constant; callers in this file pass cospi_16_64.
+ __m128i *const s) {  // Output: s[0] and s[1] receive the 64-bit products.
+ const __m128i pair_c = pair_set_epi32(4 * c, 0);
+ __m128i x[2];
+ // NOTE(review): extend_64bit() is defined elsewhere; presumably it spreads the 32-bit lanes of |in| across x[0] and x[1] - confirm.
+ extend_64bit(in, x);
+ s[0] = _mm_mul_epi32(pair_c, x[0]);  // _mm_mul_epi32 multiplies the low signed 32 bits of each 64-bit lane.
+ s[1] = _mm_mul_epi32(pair_c, x[1]);  // No rounding here; callers apply dct_const_round_shift_64bit().
+}
+
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,  // Computes s0 = in0 * c0 + in1 * c1 and
+ const __m128i in1,  // s1 = in0 * c1 - in1 * c0 using 64-bit
+ const int c0, const int c1,  // products; both constants are scaled by 4 first.
+ __m128i *const s0,  // Output: s0[0] and s0[1] (two 64-bit halves).
+ __m128i *const s1) {  // Output: s1[0] and s1[1] (two 64-bit halves).
+ const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+ const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+ __m128i t00[2], t01[2], t10[2], t11[2];
+ __m128i x0[2], x1[2];
+ // NOTE(review): extend_64bit() is defined elsewhere; presumably it prepares the 32-bit lanes for _mm_mul_epi32 - confirm.
+ extend_64bit(in0, x0);
+ extend_64bit(in1, x1);
+ t00[0] = _mm_mul_epi32(pair_c0, x0[0]);  // t00 = c0 * in0
+ t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+ t01[0] = _mm_mul_epi32(pair_c0, x1[0]);  // t01 = c0 * in1
+ t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+ t10[0] = _mm_mul_epi32(pair_c1, x0[0]);  // t10 = c1 * in0
+ t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+ t11[0] = _mm_mul_epi32(pair_c1, x1[0]);  // t11 = c1 * in1
+ t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+ // Sums stay in 64 bits; rounding and packing are left to the caller.
+ s0[0] = _mm_add_epi64(t00[0], t11[0]);
+ s0[1] = _mm_add_epi64(t00[1], t11[1]);
+ s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+ s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
+static void highbd_iadst8_sse4_1(__m128i *const io) {  // In-place transform of io[0..7]; per its name, an 8-point inverse ADST.
+ __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+ __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];
+ // The input is transposed in place first; two-element arrays hold a value as two 64-bit halves until pack_4().
+ transpose_32bit_4x4x2(io, io);
+
+ // stage 1: butterflies pair io[7 - 2k] with io[2k]; pairs (7,0) and (3,4) are handled first.
+ highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1);
+ highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5);
+ x0[0] = _mm_add_epi64(s0[0], s4[0]);
+ x0[1] = _mm_add_epi64(s0[1], s4[1]);
+ x1[0] = _mm_add_epi64(s1[0], s5[0]);
+ x1[1] = _mm_add_epi64(s1[1], s5[1]);
+ x4[0] = _mm_sub_epi64(s0[0], s4[0]);
+ x4[1] = _mm_sub_epi64(s0[1], s4[1]);
+ x5[0] = _mm_sub_epi64(s1[0], s5[0]);
+ x5[1] = _mm_sub_epi64(s1[1], s5[1]);
+ // Remaining pairs (5,2) and (1,6), combined the same way into x2, x3, x6 and x7.
+ highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3);
+ highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7);
+ x2[0] = _mm_add_epi64(s2[0], s6[0]);
+ x2[1] = _mm_add_epi64(s2[1], s6[1]);
+ x3[0] = _mm_add_epi64(s3[0], s7[0]);
+ x3[1] = _mm_add_epi64(s3[1], s7[1]);
+ x6[0] = _mm_sub_epi64(s2[0], s6[0]);
+ x6[1] = _mm_sub_epi64(s2[1], s6[1]);
+ x7[0] = _mm_sub_epi64(s3[0], s7[0]);
+ x7[1] = _mm_sub_epi64(s3[1], s7[1]);
+ // Round/shift every 64-bit half; x0..x3 are packed straight into s0..s3 for stage 2.
+ x0[0] = dct_const_round_shift_64bit(x0[0]);
+ x0[1] = dct_const_round_shift_64bit(x0[1]);
+ x1[0] = dct_const_round_shift_64bit(x1[0]);
+ x1[1] = dct_const_round_shift_64bit(x1[1]);
+ x2[0] = dct_const_round_shift_64bit(x2[0]);
+ x2[1] = dct_const_round_shift_64bit(x2[1]);
+ x3[0] = dct_const_round_shift_64bit(x3[0]);
+ x3[1] = dct_const_round_shift_64bit(x3[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ s0[0] = pack_4(x0[0], x0[1]); // s0 = x0;
+ s1[0] = pack_4(x1[0], x1[1]); // s1 = x1;
+ s2[0] = pack_4(x2[0], x2[1]); // s2 = x2;
+ s3[0] = pack_4(x3[0], x3[1]); // s3 = x3;
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+
+ // stage 2: x0..x3 need only 32-bit add/sub; x4..x7 go through butterflies.
+ x0[0] = _mm_add_epi32(s0[0], s2[0]);
+ x1[0] = _mm_add_epi32(s1[0], s3[0]);
+ x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+ x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+ // The second call passes inputs and outputs in swapped order (x7 before x6, s7 before s6).
+ highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+ highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+ // 64-bit combine: x4 = s4 + s6, x5 = s5 + s7, x6 = s4 - s6, x7 = s5 - s7, then round and pack.
+ x4[0] = _mm_add_epi64(s4[0], s6[0]);
+ x4[1] = _mm_add_epi64(s4[1], s6[1]);
+ x5[0] = _mm_add_epi64(s5[0], s7[0]);
+ x5[1] = _mm_add_epi64(s5[1], s7[1]);
+ x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+ x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+ x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+ x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+
+ // stage 3: 32-bit sums/differences of x2/x3 and x6/x7 are scaled by cospi_16_64.
+ s2[0] = _mm_add_epi32(x2[0], x3[0]);
+ s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+ s6[0] = _mm_add_epi32(x6[0], x7[0]);
+ s7[0] = _mm_sub_epi32(x6[0], x7[0]);
+ highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2);
+ highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+ highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+ highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+ // Round and pack the four scaled terms back into x*[0].
+ x2[0] = dct_const_round_shift_64bit(s2[0]);
+ x2[1] = dct_const_round_shift_64bit(s2[1]);
+ x3[0] = dct_const_round_shift_64bit(s3[0]);
+ x3[1] = dct_const_round_shift_64bit(s3[1]);
+ x6[0] = dct_const_round_shift_64bit(s6[0]);
+ x6[1] = dct_const_round_shift_64bit(s6[1]);
+ x7[0] = dct_const_round_shift_64bit(s7[0]);
+ x7[1] = dct_const_round_shift_64bit(s7[1]);
+ x2[0] = pack_4(x2[0], x2[1]);
+ x3[0] = pack_4(x3[0], x3[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ // Output permutation; the odd-indexed outputs are negated by subtracting from zero.
+ io[0] = x0[0];
+ io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+ io[2] = x6[0];
+ io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]);
+ io[4] = x3[0];
+ io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]);
+ io[6] = x5[0];
+ io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
+void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,  // Two-pass 8x8 inverse hybrid transform of 64 coefficients; result handed to recon_and_store_8x8().
+ int stride, int tx_type, int bd) {  // |tx_type| picks DCT or ADST per pass; |bd| picks the 8-bit or high-bit-depth path.
+ __m128i io[16];
+ // Aligned loads. Rows 0-3: columns 0-3 go to io[0..3], columns 4-7 to io[4..7]. Rows 4-7: io[8..11] and io[12..15].
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+ io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ // bd == 8: narrow each row to eight 16-bit lanes (signed saturation) and reuse the SSE2 8-point kernels.
+ if (bd == 8) {
+ __m128i io_short[8];
+ // Each io_short[r] joins the left and right halves of row r.
+ io_short[0] = _mm_packs_epi32(io[0], io[4]);
+ io_short[1] = _mm_packs_epi32(io[1], io[5]);
+ io_short[2] = _mm_packs_epi32(io[2], io[6]);
+ io_short[3] = _mm_packs_epi32(io[3], io[7]);
+ io_short[4] = _mm_packs_epi32(io[8], io[12]);
+ io_short[5] = _mm_packs_epi32(io[9], io[13]);
+ io_short[6] = _mm_packs_epi32(io[10], io[14]);
+ io_short[7] = _mm_packs_epi32(io[11], io[15]);
+ // First pass is DCT for DCT_DCT/ADST_DCT; second pass is DCT for DCT_DCT/DCT_ADST; ADST otherwise.
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ vpx_idct8_sse2(io_short);
+ } else {
+ iadst8_sse2(io_short);
+ }
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ vpx_idct8_sse2(io_short);
+ } else {
+ iadst8_sse2(io_short);
+ }
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+ // First pass runs the 8-point kernel separately on io[0..7] and io[8..15].
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+ } else {
+ highbd_iadst8_sse4_1(io);
+ highbd_iadst8_sse4_1(&io[8]);
+ }
+ // Regroup for the second pass: stash io[4..7] in temp and move io[8..11] into io[4..7].
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ io[4] = io[8];
+ io[5] = io[9];
+ io[6] = io[10];
+ io[7] = io[11];
+ // After transforming io[0..7], the stashed vectors are restored into io[8..11] and io[8..15] is transformed.
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+ } else {
+ highbd_iadst8_sse4_1(io);
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_iadst8_sse4_1(&io[8]);
+ }
+ highbd_idct8x8_final_round(io);
+ }
+ recon_and_store_8x8(io, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
new file mode 100644
index 0000000000..ad693718c0
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+
+void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, // 4x4 inverse hybrid transform of 16 coefficients:
+ int tx_type) { // tx_type picks the DCT/ADST kernel for each of the two 1-D passes.
+ __m128i in[2]; // the 16 coefficients, 8 per vector
+ const __m128i eight = _mm_set1_epi16(8); // rounding bias: half of the 1 << 4 final divisor
+
+ in[0] = load_input_data8(input); // coefficients 0..7
+ in[1] = load_input_data8(input + 8); // coefficients 8..15
+
+ switch (tx_type) { // every case runs two 1-D kernels back to back on the same buffer
+ case DCT_DCT:
+ idct4_sse2(in);
+ idct4_sse2(in);
+ break;
+ case ADST_DCT:
+ idct4_sse2(in);
+ iadst4_sse2(in);
+ break;
+ case DCT_ADST:
+ iadst4_sse2(in);
+ idct4_sse2(in);
+ break;
+ default:
+ assert(tx_type == ADST_ADST); // anything other than the three cases above must be ADST_ADST
+ iadst4_sse2(in);
+ iadst4_sse2(in);
+ break;
+ }
+
+ // Final round and shift: (x + 8) >> 4 on every 16-bit lane
+ in[0] = _mm_add_epi16(in[0], eight); // plain (wrapping) 16-bit add
+ in[1] = _mm_add_epi16(in[1], eight);
+
+ in[0] = _mm_srai_epi16(in[0], 4); // arithmetic shift keeps the sign
+ in[1] = _mm_srai_epi16(in[1], 4);
+
+ recon_and_store4x4_sse2(in, dest, stride); // presumably adds the residual to dest and stores it -- helper not visible here, confirm
+}
+
+void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, // 8x8 inverse hybrid transform of 64 coefficients:
+ int tx_type) { // two 1-D passes chosen by tx_type, then (x + 16) >> 5 and one recon_and_store() per row.
+ __m128i in[8]; // one vector per group of 8 coefficients
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4); // half of the 1 << 5 final divisor
+
+ // load input data: 8 groups of 8 coefficients, 8 elements apart
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8 * 1);
+ in[2] = load_input_data8(input + 8 * 2);
+ in[3] = load_input_data8(input + 8 * 3);
+ in[4] = load_input_data8(input + 8 * 4);
+ in[5] = load_input_data8(input + 8 * 5);
+ in[6] = load_input_data8(input + 8 * 6);
+ in[7] = load_input_data8(input + 8 * 7);
+
+ switch (tx_type) { // every case runs two 1-D kernels back to back on the same buffer
+ case DCT_DCT:
+ vpx_idct8_sse2(in);
+ vpx_idct8_sse2(in);
+ break;
+ case ADST_DCT:
+ vpx_idct8_sse2(in);
+ iadst8_sse2(in);
+ break;
+ case DCT_ADST:
+ iadst8_sse2(in);
+ vpx_idct8_sse2(in);
+ break;
+ default:
+ assert(tx_type == ADST_ADST); // anything other than the three cases above must be ADST_ADST
+ iadst8_sse2(in);
+ iadst8_sse2(in);
+ break;
+ }
+
+ // Final rounding and shift: (x + 16) >> 5, using a saturating signed add
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 5); // arithmetic shift keeps the sign
+ in[1] = _mm_srai_epi16(in[1], 5);
+ in[2] = _mm_srai_epi16(in[2], 5);
+ in[3] = _mm_srai_epi16(in[3], 5);
+ in[4] = _mm_srai_epi16(in[4], 5);
+ in[5] = _mm_srai_epi16(in[5], 5);
+ in[6] = _mm_srai_epi16(in[6], 5);
+ in[7] = _mm_srai_epi16(in[7], 5);
+
+ recon_and_store(dest + 0 * stride, in[0]); // one call per output row; presumably adds the residual to dest -- helper not visible here, confirm
+ recon_and_store(dest + 1 * stride, in[1]);
+ recon_and_store(dest + 2 * stride, in[2]);
+ recon_and_store(dest + 3 * stride, in[3]);
+ recon_and_store(dest + 4 * stride, in[4]);
+ recon_and_store(dest + 5 * stride, in[5]);
+ recon_and_store(dest + 6 * stride, in[6]);
+ recon_and_store(dest + 7 * stride, in[7]);
+}
+
+static INLINE void load_buffer_8x16(const tran_low_t *const input, // Fills in[0..15] with one load_input_data8() per row,
+ __m128i *const in) { // stepping 16 elements between rows (one 8-wide half of a 16-wide block).
+ in[0] = load_input_data8(input + 0 * 16); // rows 0..7
+ in[1] = load_input_data8(input + 1 * 16);
+ in[2] = load_input_data8(input + 2 * 16);
+ in[3] = load_input_data8(input + 3 * 16);
+ in[4] = load_input_data8(input + 4 * 16);
+ in[5] = load_input_data8(input + 5 * 16);
+ in[6] = load_input_data8(input + 6 * 16);
+ in[7] = load_input_data8(input + 7 * 16);
+
+ in[8] = load_input_data8(input + 8 * 16); // rows 8..15
+ in[9] = load_input_data8(input + 9 * 16);
+ in[10] = load_input_data8(input + 10 * 16);
+ in[11] = load_input_data8(input + 11 * 16);
+ in[12] = load_input_data8(input + 12 * 16);
+ in[13] = load_input_data8(input + 13 * 16);
+ in[14] = load_input_data8(input + 14 * 16);
+ in[15] = load_input_data8(input + 15 * 16);
+}
+
+static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in, // Rounds in[0..15] in place as (x + 32) >> 6,
+ const int stride) { // then passes row k to recon_and_store() at dest + k * stride.
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5); // half of the 1 << 6 final divisor
+ // Final rounding and shift (saturating signed add, then arithmetic shift)
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6); // arithmetic shift keeps the sign
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+
+ recon_and_store(dest + 0 * stride, in[0]); // presumably adds the residual row to dest -- helper not visible here, confirm
+ recon_and_store(dest + 1 * stride, in[1]);
+ recon_and_store(dest + 2 * stride, in[2]);
+ recon_and_store(dest + 3 * stride, in[3]);
+ recon_and_store(dest + 4 * stride, in[4]);
+ recon_and_store(dest + 5 * stride, in[5]);
+ recon_and_store(dest + 6 * stride, in[6]);
+ recon_and_store(dest + 7 * stride, in[7]);
+ recon_and_store(dest + 8 * stride, in[8]);
+ recon_and_store(dest + 9 * stride, in[9]);
+ recon_and_store(dest + 10 * stride, in[10]);
+ recon_and_store(dest + 11 * stride, in[11]);
+ recon_and_store(dest + 12 * stride, in[12]);
+ recon_and_store(dest + 13 * stride, in[13]);
+ recon_and_store(dest + 14 * stride, in[14]);
+ recon_and_store(dest + 15 * stride, in[15]);
+}
+
+void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, // 16x16 inverse hybrid transform of 256 coefficients,
+ int stride, int tx_type) { // processed as two 8-column halves (in0, in1).
+ __m128i in0[16], in1[16]; // in0: columns 0..7 of each row, in1: columns 8..15
+
+ load_buffer_8x16(input, in0);
+ input += 8; // step to the second 8-column half
+ load_buffer_8x16(input, in1);
+
+ switch (tx_type) { // every case runs two 1-D kernels back to back on both halves
+ case DCT_DCT:
+ idct16_sse2(in0, in1);
+ idct16_sse2(in0, in1);
+ break;
+ case ADST_DCT:
+ idct16_sse2(in0, in1);
+ iadst16_sse2(in0, in1);
+ break;
+ case DCT_ADST:
+ iadst16_sse2(in0, in1);
+ idct16_sse2(in0, in1);
+ break;
+ default:
+ assert(tx_type == ADST_ADST); // anything other than the three cases above must be ADST_ADST
+ iadst16_sse2(in0, in1);
+ iadst16_sse2(in0, in1);
+ break;
+ }
+
+ write_buffer_8x16(dest, in0, stride); // rounds in0 and writes it through dest
+ dest += 8; // second half goes 8 pixels further along each row
+ write_buffer_8x16(dest, in1, stride);
+}
diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
new file mode 100644
index 0000000000..ae7c94ea3f
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
@@ -0,0 +1,289 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+; This file is a duplicate of mfqe_sse2.asm in VP8.
+; TODO(jackychen): Find a way to fix the duplicate.
+%include "vpx_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;void vp9_filter_by_weight16x16_sse2 ; per pixel over 16 rows of 16: dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
+;(
+; unsigned char *src, ; read with movdqa, so every row must be 16-byte aligned
+; int src_stride,
+; unsigned char *dst, ; read and written with movdqa, so every row must be 16-byte aligned
+; int dst_stride,
+; int src_weight ; weight given to src; dst gets tMFQE (16) minus this
+;)
+globalsym(vp9_filter_by_weight16x16_sse2)
+sym(vp9_filter_by_weight16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)] ; 16 in every word lane
+ psubw xmm1, xmm0 ; dst_weight = 16 - src_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 16 ; loop count
+ pxor xmm6, xmm6 ; zero, used to widen bytes to words
+
+.combine:
+ movdqa xmm2, [rax] ; 16 src pixels (aligned load)
+ movdqa xmm4, [rdx] ; 16 dst pixels (aligned load)
+ add rax, rsi ; next src row
+
+ ; src * src_weight
+ movdqa xmm3, xmm2 ; second copy for the high 8 pixels
+ punpcklbw xmm2, xmm6 ; low 8 pixels -> words
+ punpckhbw xmm3, xmm6 ; high 8 pixels -> words
+ pmullw xmm2, xmm0
+ pmullw xmm3, xmm0
+
+ ; dst * dst_weight
+ movdqa xmm5, xmm4 ; second copy for the high 8 pixels
+ punpcklbw xmm4, xmm6 ; low 8 pixels -> words
+ punpckhbw xmm5, xmm6 ; high 8 pixels -> words
+ pmullw xmm4, xmm1
+ pmullw xmm5, xmm1
+
+ ; sum, round and shift: (src * src_weight + dst * dst_weight + 8) >> 4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ paddw xmm3, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+ psrlw xmm3, 4
+
+ packuswb xmm2, xmm3 ; back to 16 bytes with unsigned saturation
+ movdqa [rdx], xmm2 ; overwrite the dst row (aligned store)
+ add rdx, rdi ; next dst row
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp9_filter_by_weight8x8_sse2 ; per pixel over 8 rows of 8: dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
+;(
+; unsigned char *src, ; rows are read 8 bytes at a time with movq
+; int src_stride,
+; unsigned char *dst, ; rows are read and written 8 bytes at a time with movq
+; int dst_stride,
+; int src_weight ; weight given to src; dst gets tMFQE (16) minus this
+;)
+globalsym(vp9_filter_by_weight8x8_sse2)
+sym(vp9_filter_by_weight8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)] ; 16 in every word lane
+ psubw xmm1, xmm0 ; dst_weight = 16 - src_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 8 ; loop count
+ pxor xmm4, xmm4 ; zero, used to widen bytes to words and as the pack filler
+
+.combine:
+ movq xmm2, [rax] ; 8 src pixels
+ movq xmm3, [rdx] ; 8 dst pixels
+ add rax, rsi ; next src row
+
+ ; src * src_weight
+ punpcklbw xmm2, xmm4 ; 8 pixels -> words
+ pmullw xmm2, xmm0
+
+ ; dst * dst_weight
+ punpcklbw xmm3, xmm4 ; 8 pixels -> words
+ pmullw xmm3, xmm1
+
+ ; sum, round and shift: (src * src_weight + dst * dst_weight + 8) >> 4
+ paddw xmm2, xmm3
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+
+ packuswb xmm2, xmm4 ; 8 result bytes in the low qword, zeros above
+ movq [rdx], xmm2 ; overwrite the 8-byte dst row
+ add rdx, rdi ; next dst row
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp9_variance_and_sad_16x16_sse2 | arg
+;(
+; unsigned char *src1, 0
+; int stride1, 1
+; unsigned char *src2, 2
+; int stride2, 3
+; unsigned int *variance, 4 ; out: (sum(src2^2) - (sum(src2)^2 >> 8) + 128) >> 8
+; unsigned int *sad, 5 ; out: (SAD(src1, src2) + 128) >> 8
+;)
+globalsym(vp9_variance_and_sad_16x16_sse2)
+sym(vp9_variance_and_sad_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ; src1
+ mov rcx, arg(1) ; stride1
+ mov rdx, arg(2) ; src2
+ mov rdi, arg(3) ; stride2
+
+ mov rsi, 16 ; block height
+
+ ; Prep accumulator registers
+ pxor xmm3, xmm3 ; SAD
+ pxor xmm4, xmm4 ; sum of src2
+ pxor xmm5, xmm5 ; sum of src2^2
+
+ ; Because we're working with the actual output frames we can't depend on
+ ; any kind of data alignment, so load rows with movdqu (movdqa would fault).
+.accumulate:
+ movdqu xmm0, [rax] ; src1 (unaligned-safe load)
+ movdqu xmm1, [rdx] ; src2 (unaligned-safe load)
+ add rax, rcx ; src1 + stride1
+ add rdx, rdi ; src2 + stride2
+
+ ; SAD(src1, src2): psadbw leaves one partial sum per 64-bit half
+ psadbw xmm0, xmm1
+ paddusw xmm3, xmm0
+
+ ; SUM(src2)
+ pxor xmm2, xmm2
+ psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0
+ paddusw xmm4, xmm2
+
+ ; pmaddubsw would be ideal if it took two unsigned values. Instead,
+ ; it expects a signed and an unsigned value, so we zero extend
+ ; and operate on words.
+ pxor xmm2, xmm2
+ movdqa xmm0, xmm1
+ punpcklbw xmm0, xmm2 ; low 8 src2 pixels -> words
+ punpckhbw xmm1, xmm2 ; high 8 src2 pixels -> words
+ pmaddwd xmm0, xmm0 ; squares, pairwise summed into dwords
+ pmaddwd xmm1, xmm1
+ paddd xmm5, xmm0
+ paddd xmm5, xmm1
+
+ sub rsi, 1
+ jnz .accumulate
+
+ ; phaddd only operates on adjacent double words.
+ ; Finalize SAD and store: fold the two halves, then (SAD + 128) >> 8
+ movdqa xmm0, xmm3
+ psrldq xmm0, 8 ; bring the high-half partial sum down
+ paddusw xmm0, xmm3
+ paddd xmm0, [GLOBAL(t128)]
+ psrld xmm0, 8
+
+ mov rax, arg(5) ; sad output pointer
+ movd [rax], xmm0
+
+ ; Accumulate sum of src2 (fold the two halves)
+ movdqa xmm0, xmm4
+ psrldq xmm0, 8
+ paddusw xmm0, xmm4
+ ; Square src2. Ignore high value; low dword becomes sum(src2)^2 >> 8
+ pmuludq xmm0, xmm0
+ psrld xmm0, 8
+
+ ; phaddw could be used to sum adjacent values but we want
+ ; all the values summed. promote to doubles, accumulate,
+ ; shift and sum
+ pxor xmm2, xmm2
+ movdqa xmm1, xmm5
+ punpckldq xmm1, xmm2
+ punpckhdq xmm5, xmm2
+ paddd xmm1, xmm5
+ movdqa xmm2, xmm1
+ psrldq xmm1, 8
+ paddd xmm1, xmm2 ; low dword = sum of all four src2^2 partial sums
+
+ psubd xmm1, xmm0 ; sum(src2^2) - (sum(src2)^2 >> 8)
+
+ ; (variance + 128) >> 8
+ paddd xmm1, [GLOBAL(t128)]
+ psrld xmm1, 8
+ mov rax, arg(4) ; variance output pointer
+
+ movd [rax], xmm1
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t128: ; 16-byte constant 128: the rounding term added before each ">> 8" in vp9_variance_and_sad_16x16_sse2
+%ifndef __NASM_VER__
+ ddq 128
+%elif CONFIG_BIG_ENDIAN
+ dq 0, 128
+%else
+ dq 128, 0
+%endif
+align 16
+tMFQE: ; 1 << MFQE_PRECISION (16 in each of the 8 word lanes: the total blend weight)
+ times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1) (8 in each word lane: rounding term before ">> 4")
+ times 8 dw 0x08