summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--third_party/aom/aom_dsp/x86/inv_wht_sse2.asm107
1 files changed, 107 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
new file mode 100644
index 0000000000..0bc841a7a4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
@@ -0,0 +1,107 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro REORDER_INPUTS 0
+ ; a c d b to a b c d
+ SWAP 1, 3, 2
+%endmacro
+
+%macro TRANSFORM_COLS 0
+ ; input:
+ ; m0 a
+ ; m1 b
+ ; m2 c
+ ; m3 d
+ paddw m0, m2
+ psubw m3, m1
+
+ ; wide subtract
+ punpcklwd m4, m0
+ punpcklwd m5, m3
+ psrad m4, 16
+ psrad m5, 16
+ psubd m4, m5
+ psrad m4, 1
+ packssdw m4, m4 ; e
+
+ psubw m5, m4, m1 ; b
+ psubw m4, m2 ; c
+ psubw m0, m5
+ paddw m3, m4
+ ; m0 a
+ SWAP 1, 5 ; m1 b
+ SWAP 2, 4 ; m2 c
+ ; m3 d
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ mova m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+ mova m3, m0
+ punpcklwd m0, m1
+ punpckhwd m3, m1
+ mova m2, m0
+ punpcklwd m0, m3
+ punpckhwd m2, m3
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero
+ movd m%3, [outputq]
+ movd m%4, [outputq + strideq]
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+ paddw m%1, m%3
+ paddw m%2, m%4
+ packuswb m%1, m%5
+ packuswb m%2, m%5
+ movd [outputq], m%1
+ movd [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+ mova m0, [inputq + 0]
+ packssdw m0, [inputq + 16]
+ mova m1, [inputq + 32]
+ packssdw m1, [inputq + 48]
+ psraw m0, 2
+ psraw m1, 2
+
+ TRANSPOSE_4X4_WIDE
+ REORDER_INPUTS
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ REORDER_INPUTS
+ TRANSFORM_COLS
+
+ pxor m4, m4
+ ADD_STORE_4P_2X 0, 1, 5, 6, 4
+ lea outputq, [outputq + 2 * strideq]
+ ADD_STORE_4P_2X 2, 3, 5, 6, 4
+
+ RET