summary | refs | log | tree | commit | diff | stats
path: root/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c
diff options
context:
space:
mode:
Diffstat (limited to 'media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c')
-rw-r--r-- media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c | 131
1 files changed, 131 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c
new file mode 100644
index 0000000000..af158536f9
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+/*
+ * One 4-point inverse ADST pass, performed in place on the four vectors
+ * io[0..3]. The block is transposed first; every product and sum is then
+ * carried in 64-bit lanes until the final round-shift and repack.
+ *
+ * NOTE(review): the sinpi constants are pre-multiplied by 4 before being
+ * handed to pair_set_epi32; presumably this matches what
+ * dct_const_round_shift_64bit expects -- confirm against that helper.
+ */
+static INLINE void highbd_iadst4_sse4_1(__m128i *const io) {
+  const __m128i sin1 = pair_set_epi32(4 * sinpi_1_9, 0);
+  const __m128i sin2 = pair_set_epi32(4 * sinpi_2_9, 0);
+  const __m128i sin3 = pair_set_epi32(4 * sinpi_3_9, 0);
+  const __m128i sin4 = pair_set_epi32(4 * sinpi_4_9, 0);
+  __m128i x0[2], x1[2], x2[2], x3[2], mix[2];
+  __m128i out0[2], out1[2], out2[2], out3[2];
+  int half;
+
+  transpose_32bit_4x4(io, io);
+
+  /* Widen each input vector, plus the 32-bit combination
+   * io[0] - io[2] + io[3], into a pair of 64-bit-lane vectors. */
+  extend_64bit(io[0], x0);
+  extend_64bit(io[1], x1);
+  extend_64bit(io[2], x2);
+  extend_64bit(io[3], x3);
+  extend_64bit(_mm_add_epi32(_mm_sub_epi32(io[0], io[2]), io[3]), mix);
+
+  /* The two halves of every widened vector are processed identically. */
+  for (half = 0; half < 2; ++half) {
+    const __m128i x1_s3 = _mm_mul_epi32(sin3, x1[half]);
+    /* sum_a = x0 * sin1 + x2 * sin4 + x3 * sin2 */
+    const __m128i sum_a =
+        _mm_add_epi64(_mm_add_epi64(_mm_mul_epi32(sin1, x0[half]),
+                                    _mm_mul_epi32(sin4, x2[half])),
+                      _mm_mul_epi32(sin2, x3[half]));
+    /* sum_b = x0 * sin2 - x2 * sin1 - x3 * sin4 */
+    const __m128i sum_b =
+        _mm_sub_epi64(_mm_sub_epi64(_mm_mul_epi32(sin2, x0[half]),
+                                    _mm_mul_epi32(sin1, x2[half])),
+                      _mm_mul_epi32(sin4, x3[half]));
+
+    out0[half] = dct_const_round_shift_64bit(_mm_add_epi64(sum_a, x1_s3));
+    out1[half] = dct_const_round_shift_64bit(_mm_add_epi64(sum_b, x1_s3));
+    out2[half] = dct_const_round_shift_64bit(_mm_mul_epi32(sin3, mix[half]));
+    out3[half] = dct_const_round_shift_64bit(
+        _mm_sub_epi64(_mm_add_epi64(sum_a, sum_b), x1_s3));
+  }
+
+  /* Merge each pair of 64-bit halves back into a single vector. */
+  io[0] = pack_4(out0[0], out0[1]);
+  io[1] = pack_4(out1[0], out1[1]);
+  io[2] = pack_4(out2[0], out2[1]);
+  io[3] = pack_4(out3[0], out3[1]);
+}
+
+/*
+ * 4x4 inverse hybrid transform followed by reconstruction.
+ *
+ * input   16 coefficients, read with four aligned 128-bit loads, so the
+ *         pointer must be 16-byte aligned.
+ * dest    passed through to recon_and_store_4x4 along with stride and bd.
+ * tx_type selects DCT or ADST independently for each of the two 1-D passes.
+ * bd      bit depth; 8 takes the 16-bit SSE2 kernels, any other value takes
+ *         the 32-bit SSE4.1 kernels.
+ */
+void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int tx_type, int bd) {
+  /* The first pass is a DCT for DCT_DCT and ADST_DCT, the second pass for
+   * DCT_DCT and DCT_ADST; every other combination uses the ADST. */
+  const int first_is_dct = (tx_type == DCT_DCT) || (tx_type == ADST_DCT);
+  const int second_is_dct = (tx_type == DCT_DCT) || (tx_type == DCT_ADST);
+  __m128i io[4];
+  int row;
+
+  for (row = 0; row < 4; ++row) {
+    io[row] = _mm_load_si128((const __m128i *)(input + 4 * row));
+  }
+
+  if (bd != 8) {
+    const __m128i rounding = _mm_set1_epi32(8);
+
+    if (first_is_dct) {
+      highbd_idct4_sse4_1(io);
+    } else {
+      highbd_iadst4_sse4_1(io);
+    }
+    if (second_is_dct) {
+      highbd_idct4_sse4_1(io);
+    } else {
+      highbd_iadst4_sse4_1(io);
+    }
+    /* io[0] must be rewritten before io[1]: the first call still reads the
+     * old io[1], the second only reads io[2] and io[3]. */
+    io[0] = wraplow_16bit_shift4(io[0], io[1], rounding);
+    io[1] = wraplow_16bit_shift4(io[2], io[3], rounding);
+  } else {
+    const __m128i rounding = _mm_set1_epi16(8);
+    __m128i packed[2];
+
+    /* Narrow the 32-bit coefficients to 16 bits with signed saturation. */
+    packed[0] = _mm_packs_epi32(io[0], io[1]);
+    packed[1] = _mm_packs_epi32(io[2], io[3]);
+    if (first_is_dct) {
+      idct4_sse2(packed);
+    } else {
+      iadst4_sse2(packed);
+    }
+    if (second_is_dct) {
+      idct4_sse2(packed);
+    } else {
+      iadst4_sse2(packed);
+    }
+    /* Final rounding: (x + 8) >> 4, arithmetic shift on 16-bit lanes. */
+    io[0] = _mm_srai_epi16(_mm_add_epi16(packed[0], rounding), 4);
+    io[1] = _mm_srai_epi16(_mm_add_epi16(packed[1], rounding), 4);
+  }
+
+  recon_and_store_4x4(io, dest, stride, bd);
+}