diff options
Diffstat (limited to '')
48 files changed, 26642 insertions, 0 deletions
diff --git a/media/ffvpx/libavcodec/x86/constants.c b/media/ffvpx/libavcodec/x86/constants.c new file mode 100644 index 0000000000..bc7f2b17b8 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/constants.c @@ -0,0 +1,93 @@ +/* + * MMX/SSE/AVX constants used across x86 dsp optimizations. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mem_internal.h" +#include "libavutil/x86/asm.h" // for xmm_reg +#include "constants.h" + +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL, 0x0001000100010001ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL, + 0x0002000200020002ULL, 0x0002000200020002ULL }; +DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL }; +DECLARE_ASM_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL, + 0x0004000400040004ULL, 0x0004000400040004ULL }; +DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL }; +DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL }; +DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL }; +DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL; +DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL; +DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL }; +DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL; +DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL, + 0x0100010001000100ULL, 0x0100010001000100ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL, + 0x0200020002000200ULL, 0x0200020002000200ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL, + 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL}; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL, + 0x0400040004000400ULL, 0x0400040004000400ULL}; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL, + 0x0800080008000800ULL, 0x0800080008000800ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL, + 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL, + 0x1000100010001000ULL, 0x1000100010001000ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL, + 0x2000200020002000ULL, 0x2000200020002000ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, + 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL }; + +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL, 0x0000000000000000ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL, 0x0101010101010101ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_2) = { 0x0202020202020202ULL, 0x0202020202020202ULL, + 0x0202020202020202ULL, 0x0202020202020202ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL, + 0x0303030303030303ULL, 0x0303030303030303ULL }; +DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL, + 0x8080808080808080ULL, 0x8080808080808080ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL, + 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; + +DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL }; + +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = { 0x0000000100000001ULL, 0x0000000100000001ULL, + 0x0000000100000001ULL, 0x0000000100000001ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x0000001000000010ULL, + 0x0000001000000010ULL, 0x0000001000000010ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL, + 0x0000002000000020ULL, 0x0000002000000020ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL, + 0x0000200000002000ULL, 0x0000200000002000ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, + 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL }; diff --git a/media/ffvpx/libavcodec/x86/constants.h b/media/ffvpx/libavcodec/x86/constants.h new file mode 100644 index 0000000000..85da38b7b9 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/constants.h @@ -0,0 +1,72 @@ +/* + * MMX/SSE constants used across x86 dsp optimizations. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_CONSTANTS_H +#define AVCODEC_X86_CONSTANTS_H + +#include <stdint.h> + +#include "libavutil/x86/asm.h" + +extern const ymm_reg ff_pw_1; +extern const ymm_reg ff_pw_2; +extern const xmm_reg ff_pw_3; +extern const ymm_reg ff_pw_4; +extern const xmm_reg ff_pw_5; +extern const xmm_reg ff_pw_8; +extern const xmm_reg ff_pw_9; +extern const uint64_t ff_pw_15; +extern const xmm_reg ff_pw_16; +extern const xmm_reg ff_pw_18; +extern const xmm_reg ff_pw_20; +extern const xmm_reg ff_pw_32; +extern const uint64_t ff_pw_42; +extern const uint64_t ff_pw_53; +extern const xmm_reg ff_pw_64; +extern const uint64_t ff_pw_96; +extern const uint64_t ff_pw_128; +extern const ymm_reg ff_pw_255; +extern const ymm_reg ff_pw_256; +extern const ymm_reg ff_pw_512; +extern const ymm_reg ff_pw_1023; +extern const ymm_reg ff_pw_1024; +extern const ymm_reg ff_pw_2048; +extern const ymm_reg ff_pw_4095; +extern const ymm_reg ff_pw_4096; +extern const ymm_reg ff_pw_8192; +extern const ymm_reg ff_pw_m1; + +extern const ymm_reg ff_pb_0; +extern const ymm_reg ff_pb_1; +extern const ymm_reg ff_pb_2; +extern const ymm_reg ff_pb_3; +extern const ymm_reg ff_pb_80; +extern const ymm_reg ff_pb_FE; +extern const uint64_t ff_pb_FC; + +extern const xmm_reg ff_ps_neg; + +extern const ymm_reg ff_pd_1; +extern const ymm_reg ff_pd_16; +extern const ymm_reg ff_pd_32; +extern const ymm_reg ff_pd_8192; +extern const ymm_reg ff_pd_65535; + +#endif /* AVCODEC_X86_CONSTANTS_H */ diff --git a/media/ffvpx/libavcodec/x86/dct32.asm b/media/ffvpx/libavcodec/x86/dct32.asm new file mode 100644 index 0000000000..37fba51543 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/dct32.asm @@ -0,0 +1,481 @@ +;****************************************************************************** +;* 32 point SSE-optimized DCT transform +;* Copyright (c) 2010 Vitor Sessak +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 + +ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 + dd 0.553104, 0.582935, 0.622504, 0.674808 + dd -10.190008, -3.407609, -2.057781, -1.484165 + dd -1.169440, -0.972568, -0.839350, -0.744536 + dd 0.502419, 0.522499, 0.566944, 0.646822 + dd 0.788155, 1.060678, 1.722447, 5.101149 + dd 0.509796, 0.601345, 0.899976, 2.562916 + dd 0.509796, 0.601345, 0.899976, 2.562916 + dd 1.000000, 1.000000, 1.306563, 0.541196 + dd 1.000000, 1.000000, 1.306563, 0.541196 + dd 1.000000, 0.707107, 1.000000, -0.707107 + dd 1.000000, 0.707107, 1.000000, -0.707107 + dd 0.707107, 0.707107, 0.707107, 0.707107 + +%macro BUTTERFLY 4 + subps %4, %1, %2 + addps %2, %2, %1 + mulps %1, %4, %3 +%endmacro + +%macro BUTTERFLY0 5 +%if cpuflag(sse2) && notcpuflag(avx) + pshufd %4, %1, %5 + xorps %1, %2 + addps %1, %4 + mulps %1, %3 +%else + shufps %4, %1, %1, %5 + xorps %1, %1, %2 + addps %4, %4, %1 + mulps %1, %4, %3 +%endif +%endmacro + +%macro BUTTERFLY2 4 + BUTTERFLY0 %1, %2, %3, %4, 0x1b +%endmacro + +%macro BUTTERFLY3 4 + BUTTERFLY0 %1, %2, %3, %4, 0xb1 +%endmacro + +%macro BUTTERFLY3V 5 + movaps m%5, m%1 + addps m%1, m%2 + subps m%5, m%2 + SWAP %2, %5 + mulps m%2, [ps_cos_vec+192] + movaps m%5, m%3 + addps m%3, m%4 + subps m%4, m%5 + mulps m%4, [ps_cos_vec+192] +%endmacro + +%macro PASS6_AND_PERMUTE 0 + mov tmpd, [outq+4] + movss m7, [outq+72] + addss m7, [outq+76] + movss m3, [outq+56] + addss m3, [outq+60] + addss m4, m3 + movss m2, [outq+52] + addss m2, m3 + movss m3, [outq+104] + addss m3, [outq+108] + addss m1, m3 + addss m5, m4 + movss [outq+ 16], m1 + movss m1, [outq+100] + addss m1, m3 + movss m3, [outq+40] + movss [outq+ 48], m1 + addss m3, [outq+44] + movss m1, [outq+100] + addss m4, m3 + addss m3, m2 + addss m1, [outq+108] + movss [outq+ 40], m3 + addss m2, [outq+36] + movss m3, [outq+8] + movss [outq+ 56], m2 + addss m3, [outq+12] + movss [outq+ 32], m3 + movss m3, [outq+80] + movss [outq+ 8], m5 + movss [outq+ 80], m1 + movss m2, [outq+52] + movss m5, [outq+120] + addss m5, [outq+124] + movss m1, [outq+64] + addss m2, [outq+60] + addss m0, m5 + addss m5, [outq+116] + mov [outq+64], tmpd + addss m6, m0 + addss m1, m6 + mov tmpd, [outq+12] + mov [outq+ 96], tmpd + movss [outq+ 4], m1 + movss m1, [outq+24] + movss [outq+ 24], m4 + movss m4, [outq+88] + addss m4, [outq+92] + addss m3, m4 + addss m4, [outq+84] + mov tmpd, [outq+108] + addss m1, [outq+28] + addss m0, m1 + addss m1, m5 + addss m6, m3 + addss m3, m0 + addss m0, m7 + addss m5, [outq+20] + addss m7, m1 + movss [outq+ 12], m6 + mov [outq+112], tmpd + movss m6, [outq+28] + movss [outq+ 28], m0 + movss m0, [outq+36] + movss [outq+ 36], m7 + addss m1, m4 + movss m7, [outq+116] + addss m0, m2 + addss m7, [outq+124] + movss [outq+ 72], m0 + movss m0, [outq+44] + addss m2, m0 + movss [outq+ 44], m1 + movss [outq+ 88], m2 + addss m0, [outq+60] + mov tmpd, [outq+60] + mov [outq+120], tmpd + movss [outq+104], m0 + addss m4, m5 + addss m5, [outq+68] + movss [outq+52], m4 + movss [outq+60], m5 + movss m4, [outq+68] + movss m5, [outq+20] + movss [outq+ 20], m3 + addss m5, m7 + addss m7, m6 + addss m4, m5 + movss m2, [outq+84] + addss m2, [outq+92] + addss m5, m2 + movss [outq+ 68], m4 + addss m2, m7 + movss m4, [outq+76] + movss [outq+ 84], m2 + movss [outq+ 76], m5 + addss m7, m4 + addss m6, [outq+124] + addss m4, m6 + addss m6, [outq+92] + movss [outq+100], m4 + movss [outq+108], m6 + movss m6, [outq+92] + movss [outq+92], m7 + addss m6, [outq+124] + movss [outq+116], m6 +%endmacro + +INIT_YMM avx +SECTION .text +%if HAVE_AVX_EXTERNAL +; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in) +cglobal dct32_float, 2,3,8, out, in, tmp + ; pass 1 + vmovaps m4, [inq+0] + vinsertf128 m5, m5, [inq+96], 1 + vinsertf128 m5, m5, [inq+112], 0 + vshufps m5, m5, m5, 0x1b + BUTTERFLY m4, m5, [ps_cos_vec], m6 + + vmovaps m2, [inq+64] + vinsertf128 m6, m6, [inq+32], 1 + vinsertf128 m6, m6, [inq+48], 0 + vshufps m6, m6, m6, 0x1b + BUTTERFLY m2, m6, [ps_cos_vec+32], m0 + + ; pass 2 + + BUTTERFLY m5, m6, [ps_cos_vec+64], m0 + BUTTERFLY m4, m2, [ps_cos_vec+64], m7 + + + ; pass 3 + vperm2f128 m3, m6, m4, 0x31 + vperm2f128 m1, m6, m4, 0x20 + vshufps m3, m3, m3, 0x1b + + BUTTERFLY m1, m3, [ps_cos_vec+96], m6 + + + vperm2f128 m4, m5, m2, 0x20 + vperm2f128 m5, m5, m2, 0x31 + vshufps m5, m5, m5, 0x1b + + BUTTERFLY m4, m5, [ps_cos_vec+96], m6 + + ; pass 4 + vmovaps m6, [ps_p1p1m1m1+0] + vmovaps m2, [ps_cos_vec+128] + + BUTTERFLY2 m5, m6, m2, m7 + BUTTERFLY2 m4, m6, m2, m7 + BUTTERFLY2 m1, m6, m2, m7 + BUTTERFLY2 m3, m6, m2, m7 + + + ; pass 5 + vshufps m6, m6, m6, 0xcc + vmovaps m2, [ps_cos_vec+160] + + BUTTERFLY3 m5, m6, m2, m7 + BUTTERFLY3 m4, m6, m2, m7 + BUTTERFLY3 m1, m6, m2, m7 + BUTTERFLY3 m3, m6, m2, m7 + + vperm2f128 m6, m3, m3, 0x31 + vmovaps [outq], m3 + + vextractf128 [outq+64], m5, 1 + vextractf128 [outq+32], m5, 0 + + vextractf128 [outq+80], m4, 1 + vextractf128 [outq+48], m4, 0 + + vperm2f128 m0, m1, m1, 0x31 + vmovaps [outq+96], m1 + + vzeroupper + + ; pass 6, no SIMD... +INIT_XMM + PASS6_AND_PERMUTE + RET +%endif + +%if ARCH_X86_64 +%define SPILL SWAP +%define UNSPILL SWAP + +%macro PASS5 0 + nop ; FIXME code alignment + SWAP 5, 8 + SWAP 4, 12 + SWAP 6, 14 + SWAP 7, 13 + SWAP 0, 15 + PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13 + TRANSPOSE4x4PS 8, 9, 10, 11, 0 + BUTTERFLY3V 8, 9, 10, 11, 0 + addps m10, m11 + TRANSPOSE4x4PS 12, 13, 14, 15, 0 + BUTTERFLY3V 12, 13, 14, 15, 0 + addps m14, m15 + addps m12, m14 + addps m14, m13 + addps m13, m15 +%endmacro + +%macro PASS6 0 + SWAP 9, 12 + SWAP 11, 14 + movss [outq+0x00], m8 + pshuflw m0, m8, 0xe + movss [outq+0x10], m9 + pshuflw m1, m9, 0xe + movss [outq+0x20], m10 + pshuflw m2, m10, 0xe + movss [outq+0x30], m11 + pshuflw m3, m11, 0xe + movss [outq+0x40], m12 + pshuflw m4, m12, 0xe + movss [outq+0x50], m13 + pshuflw m5, m13, 0xe + movss [outq+0x60], m14 + pshuflw m6, m14, 0xe + movaps [outq+0x70], m15 + pshuflw m7, m15, 0xe + addss m0, m1 + addss m1, m2 + movss [outq+0x08], m0 + addss m2, m3 + movss [outq+0x18], m1 + addss m3, m4 + movss [outq+0x28], m2 + addss m4, m5 + movss [outq+0x38], m3 + addss m5, m6 + movss [outq+0x48], m4 + addss m6, m7 + movss [outq+0x58], m5 + movss [outq+0x68], m6 + movss [outq+0x78], m7 + + PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7 + movhlps m0, m1 + pshufd m1, m1, 3 + SWAP 0, 2, 4, 6, 8, 10, 12, 14 + SWAP 1, 3, 5, 7, 9, 11, 13, 15 +%rep 7 + movhlps m0, m1 + pshufd m1, m1, 3 + addss m15, m1 + SWAP 0, 2, 4, 6, 8, 10, 12, 14 + SWAP 1, 3, 5, 7, 9, 11, 13, 15 +%endrep +%assign i 4 +%rep 15 + addss m0, m1 + movss [outq+i], m0 + SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + %assign i i+8 +%endrep +%endmacro + +%else ; ARCH_X86_32 +%macro SPILL 2 ; xmm#, mempos + movaps [outq+(%2-8)*16], m%1 +%endmacro +%macro UNSPILL 2 + movaps m%1, [outq+(%2-8)*16] +%endmacro + +%define PASS6 PASS6_AND_PERMUTE +%macro PASS5 0 + movaps m2, [ps_cos_vec+160] + shufps m3, m3, 0xcc + + BUTTERFLY3 m5, m3, m2, m1 + SPILL 5, 8 + + UNSPILL 1, 9 + BUTTERFLY3 m1, m3, m2, m5 + SPILL 1, 14 + + BUTTERFLY3 m4, m3, m2, m5 + SPILL 4, 12 + + BUTTERFLY3 m7, m3, m2, m5 + SPILL 7, 13 + + UNSPILL 5, 10 + BUTTERFLY3 m5, m3, m2, m7 + SPILL 5, 10 + + UNSPILL 4, 11 + BUTTERFLY3 m4, m3, m2, m7 + SPILL 4, 11 + + BUTTERFLY3 m6, m3, m2, m7 + SPILL 6, 9 + + BUTTERFLY3 m0, m3, m2, m7 + SPILL 0, 15 +%endmacro +%endif + + +; void ff_dct32_float(FFTSample *out, const FFTSample *in) +%macro DCT32_FUNC 0 +cglobal dct32_float, 2, 3, 16, out, in, tmp + ; pass 1 + + movaps m0, [inq+0] + LOAD_INV m1, [inq+112] + BUTTERFLY m0, m1, [ps_cos_vec], m3 + + movaps m7, [inq+64] + LOAD_INV m4, [inq+48] + BUTTERFLY m7, m4, [ps_cos_vec+32], m3 + + ; pass 2 + movaps m2, [ps_cos_vec+64] + BUTTERFLY m1, m4, m2, m3 + SPILL 1, 11 + SPILL 4, 8 + + ; pass 1 + movaps m1, [inq+16] + LOAD_INV m6, [inq+96] + BUTTERFLY m1, m6, [ps_cos_vec+16], m3 + + movaps m4, [inq+80] + LOAD_INV m5, [inq+32] + BUTTERFLY m4, m5, [ps_cos_vec+48], m3 + + ; pass 2 + BUTTERFLY m0, m7, m2, m3 + + movaps m2, [ps_cos_vec+80] + BUTTERFLY m6, m5, m2, m3 + + BUTTERFLY m1, m4, m2, m3 + + ; pass 3 + movaps m2, [ps_cos_vec+96] + shufps m1, m1, 0x1b + BUTTERFLY m0, m1, m2, m3 + SPILL 0, 15 + SPILL 1, 14 + + UNSPILL 0, 8 + shufps m5, m5, 0x1b + BUTTERFLY m0, m5, m2, m3 + + UNSPILL 1, 11 + shufps m6, m6, 0x1b + BUTTERFLY m1, m6, m2, m3 + SPILL 1, 11 + + shufps m4, m4, 0x1b + BUTTERFLY m7, m4, m2, m3 + + ; pass 4 + movaps m3, [ps_p1p1m1m1+0] + movaps m2, [ps_cos_vec+128] + + BUTTERFLY2 m5, m3, m2, m1 + + BUTTERFLY2 m0, m3, m2, m1 + SPILL 0, 9 + + BUTTERFLY2 m6, m3, m2, m1 + SPILL 6, 10 + + UNSPILL 0, 11 + BUTTERFLY2 m0, m3, m2, m1 + SPILL 0, 11 + + BUTTERFLY2 m4, m3, m2, m1 + + BUTTERFLY2 m7, m3, m2, m1 + + UNSPILL 6, 14 + BUTTERFLY2 m6, m3, m2, m1 + + UNSPILL 0, 15 + BUTTERFLY2 m0, m3, m2, m1 + + PASS5 + PASS6 + RET +%endmacro + +%macro LOAD_INV 2 + pshufd %1, %2, 0x1b +%endmacro + +INIT_XMM sse2 +DCT32_FUNC diff --git a/media/ffvpx/libavcodec/x86/dct_init.c b/media/ffvpx/libavcodec/x86/dct_init.c new file mode 100644 index 0000000000..d0e4b34dd3 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/dct_init.c @@ -0,0 +1,36 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/dct.h" + +void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in); +void ff_dct32_float_avx(FFTSample *out, const FFTSample *in); + +av_cold void ff_dct_init_x86(DCTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) + s->dct32 = ff_dct32_float_sse2; + if (EXTERNAL_AVX_FAST(cpu_flags)) + s->dct32 = ff_dct32_float_avx; +} diff --git a/media/ffvpx/libavcodec/x86/fdct.c b/media/ffvpx/libavcodec/x86/fdct.c new file mode 100644 index 0000000000..f4677ff4be --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fdct.c @@ -0,0 +1,378 @@ +/* + * SIMD-optimized forward DCT + * The gcc porting is Copyright (c) 2001 Fabrice Bellard. + * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. + * + * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT + * + * Intel Application Note AP-922 - fast, precise implementation of DCT + * http://developer.intel.com/vtune/cbts/appnotes.htm + * + * Also of inspiration: + * a page about fdct at http://www.geocities.com/ssavekar/dct.htm + * Skal's fdct at http://skal.planet-d.net/coding/dct.html + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/macros.h" +#include "libavutil/mem_internal.h" +#include "libavutil/x86/asm.h" +#include "fdct.h" + +#if HAVE_SSE2_INLINE + +////////////////////////////////////////////////////////////////////// +// +// constants for the forward DCT +// ----------------------------- +// +// Be sure to check that your compiler is aligning all constants to QWORD +// (8-byte) memory boundaries! Otherwise the unaligned memory access will +// severely stall MMX execution. +// +////////////////////////////////////////////////////////////////////// + +#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy +#define SHIFT_FRW_COL BITS_FRW_ACC +#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) +#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) +//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) + +#define X8(x) x,x,x,x,x,x,x,x + +//concatenated table, for forward DCT transformation +DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { + X8(13036), // tg * (2<<16) + 0.5 + X8(27146), // tg * (2<<16) + 0.5 + X8(-21746) // tg * (2<<16) + 0.5 +}; + +DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { + X8(23170) //cos * (2<<15) + 0.5 +}; + +DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; + +static const struct +{ + DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; +} fdct_r_row_sse2 = +{{ + RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW +}}; +//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; + +static const struct +{ + DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; +} tab_frw_01234567_sse2 = +{{ +//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table +#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ + C4, C4, C5, C7, C2, C6, C3, -C7, \ + -C4, C4, C7, C3, C6, -C2, C7, -C5, \ + C4, -C4, C5, -C1, C2, -C6, C3, -C1, +// c1..c7 * cos(pi/4) * 2^15 +#define C1 22725 +#define C2 21407 +#define C3 19266 +#define C4 16384 +#define C5 12873 +#define C6 8867 +#define C7 4520 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 31521 +#define C2 29692 +#define C3 26722 +#define C4 22725 +#define C5 17855 +#define C6 12299 +#define C7 6270 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 29692 +#define C2 27969 +#define C3 25172 +#define C4 21407 +#define C5 16819 +#define C6 11585 +#define C7 5906 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 26722 +#define C2 25172 +#define C3 22654 +#define C4 19266 +#define C5 15137 +#define C6 10426 +#define C7 5315 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 22725 +#define C2 21407 +#define C3 19266 +#define C4 16384 +#define C5 12873 +#define C6 8867 +#define C7 4520 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 26722 +#define C2 25172 +#define C3 22654 +#define C4 19266 +#define C5 15137 +#define C6 10426 +#define C7 5315 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 29692 +#define C2 27969 +#define C3 25172 +#define C4 21407 +#define C5 16819 +#define C6 11585 +#define C7 5906 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 31521 +#define C2 29692 +#define C3 26722 +#define C4 22725 +#define C5 17855 +#define C6 12299 +#define C7 6270 +TABLE_SSE2 +}}; + +#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long + +#define FDCT_COL(cpu, mm, mov)\ +static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ +{\ + __asm__ volatile (\ + #mov" 16(%0), %%"#mm"0 \n\t" \ + #mov" 96(%0), %%"#mm"1 \n\t" \ + #mov" %%"#mm"0, %%"#mm"2 \n\t" \ + #mov" 32(%0), %%"#mm"3 \n\t" \ + "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ + #mov" 80(%0), %%"#mm"4 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ + #mov" (%0), %%"#mm"5 \n\t" \ + "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ + "paddsw 112(%0), %%"#mm"5 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ + #mov" %%"#mm"0, %%"#mm"6 \n\t" \ + "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ + #mov" 16(%1), %%"#mm"1 \n\t" \ + "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ + #mov" 48(%0), %%"#mm"7 \n\t" \ + "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ + "paddsw 64(%0), %%"#mm"7 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ + "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ + #mov" %%"#mm"5, %%"#mm"4 \n\t" \ + "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ + "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ + "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ + "por (%2), %%"#mm"1 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ + "pmulhw 16(%1), %%"#mm"5 \n\t" \ + #mov" %%"#mm"4, %%"#mm"7 \n\t" \ + "psubsw 80(%0), %%"#mm"3 \n\t" \ + "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ + #mov" %%"#mm"1, 32(%3) \n\t" \ + "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ + #mov" 48(%0), %%"#mm"1 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ + "psubsw 64(%0), %%"#mm"1 \n\t" \ + #mov" %%"#mm"2, %%"#mm"6 \n\t" \ + #mov" %%"#mm"4, 64(%3) \n\t" \ + "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ + "pmulhw (%4), %%"#mm"2 \n\t" \ + "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ + "pmulhw (%4), %%"#mm"6 \n\t" \ + "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ + "por (%2), %%"#mm"5 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ + "por (%2), %%"#mm"2 \n\t" \ + #mov" %%"#mm"1, %%"#mm"4 \n\t" \ + #mov" (%0), %%"#mm"3 \n\t" \ + "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ + "psubsw 112(%0), %%"#mm"3 \n\t" \ + "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ + #mov" (%1), %%"#mm"0 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ + #mov" 32(%1), %%"#mm"6 \n\t" \ + "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ + #mov" %%"#mm"7, (%3) \n\t" \ + "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ + #mov" %%"#mm"5, 96(%3) \n\t" \ + #mov" %%"#mm"3, %%"#mm"7 \n\t" \ + #mov" 32(%1), %%"#mm"5 \n\t" \ + "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ + "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ + "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ + "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ + "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ + "pmulhw (%1), %%"#mm"3 \n\t" \ + "por (%2), %%"#mm"0 \n\t" \ + "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ + "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ + #mov" %%"#mm"0, 16(%3) \n\t" \ + "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ + #mov" %%"#mm"7, 48(%3) \n\t" \ + "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ + #mov" %%"#mm"5, 80(%3) \n\t" \ + #mov" %%"#mm"3, 112(%3) \n\t" \ + : \ + : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ + "r" (out + offset), "r" (ocos_4_16)); \ +} + +FDCT_COL(sse2, xmm, movdqa) + +static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) +{ + __asm__ volatile( +#define FDCT_ROW_SSE2_H1(i,t) \ + "movq " #i "(%0), %%xmm2 \n\t" \ + "movq " #i "+8(%0), %%xmm0 \n\t" \ + "movdqa " #t "+32(%1), %%xmm3 \n\t" \ + "movdqa " #t "+48(%1), %%xmm7 \n\t" \ + "movdqa " #t "(%1), %%xmm4 \n\t" \ + "movdqa " #t "+16(%1), %%xmm5 \n\t" + +#define FDCT_ROW_SSE2_H2(i,t) \ + "movq " #i "(%0), %%xmm2 \n\t" \ + "movq " #i "+8(%0), %%xmm0 \n\t" \ + "movdqa " #t "+32(%1), %%xmm3 \n\t" \ + "movdqa " #t "+48(%1), %%xmm7 \n\t" + +#define FDCT_ROW_SSE2(i) \ + "movq %%xmm2, %%xmm1 \n\t" \ + "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ + "paddsw %%xmm0, %%xmm1 \n\t" \ + "psubsw %%xmm0, %%xmm2 \n\t" \ + "punpckldq %%xmm2, %%xmm1 \n\t" \ + "pshufd $78, %%xmm1, %%xmm2 \n\t" \ + "pmaddwd %%xmm2, %%xmm3 \n\t" \ + "pmaddwd %%xmm1, %%xmm7 \n\t" \ + "pmaddwd %%xmm5, %%xmm2 \n\t" \ + "pmaddwd %%xmm4, %%xmm1 \n\t" \ + "paddd %%xmm7, %%xmm3 \n\t" \ + "paddd %%xmm2, %%xmm1 \n\t" \ + "paddd %%xmm6, %%xmm3 \n\t" \ + "paddd %%xmm6, %%xmm1 \n\t" \ + "psrad %3, %%xmm3 \n\t" \ + "psrad %3, %%xmm1 \n\t" \ + "packssdw %%xmm3, %%xmm1 \n\t" \ + "movdqa %%xmm1, " #i "(%4) \n\t" + + "movdqa (%2), %%xmm6 \n\t" + FDCT_ROW_SSE2_H1(0,0) + FDCT_ROW_SSE2(0) + FDCT_ROW_SSE2_H2(64,0) + FDCT_ROW_SSE2(64) + + FDCT_ROW_SSE2_H1(16,64) + FDCT_ROW_SSE2(16) + FDCT_ROW_SSE2_H2(112,64) + FDCT_ROW_SSE2(112) + + FDCT_ROW_SSE2_H1(32,128) + FDCT_ROW_SSE2(32) + FDCT_ROW_SSE2_H2(96,128) + FDCT_ROW_SSE2(96) + + FDCT_ROW_SSE2_H1(48,192) + FDCT_ROW_SSE2(48) + FDCT_ROW_SSE2_H2(80,192) + FDCT_ROW_SSE2(80) + : + : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), + "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) + XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7") + ); +} + +void ff_fdct_sse2(int16_t *block) +{ + DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; + int16_t * const block1= (int16_t*)align_tmp; + + fdct_col_sse2(block, block1, 0); + fdct_row_sse2(block1, block); +} + +#endif /* HAVE_SSE2_INLINE */ diff --git a/media/ffvpx/libavcodec/x86/fdct.h b/media/ffvpx/libavcodec/x86/fdct.h new file mode 100644 index 0000000000..164d4fb30e --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fdct.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_FDCT_H +#define AVCODEC_X86_FDCT_H + +#include <stdint.h> + +void ff_fdct_sse2(int16_t *block); + +#endif /* AVCODEC_X86_FDCT_H */ diff --git a/media/ffvpx/libavcodec/x86/fdctdsp_init.c b/media/ffvpx/libavcodec/x86/fdctdsp_init.c new file mode 100644 index 0000000000..92a842433d --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fdctdsp_init.c @@ -0,0 +1,38 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/fdctdsp.h" +#include "fdct.h" + +av_cold void ff_fdctdsp_init_x86(FDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + const int dct_algo = avctx->dct_algo; + + if (!high_bit_depth) { + if ((dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) { + if (INLINE_SSE2(cpu_flags)) + c->fdct = ff_fdct_sse2; + } + } +} diff --git a/media/ffvpx/libavcodec/x86/fft.asm b/media/ffvpx/libavcodec/x86/fft.asm new file mode 100644 index 0000000000..34c3fc9a0f --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fft.asm @@ -0,0 +1,838 @@ +;****************************************************************************** +;* FFT transform with SSE/AVX optimizations +;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2011 Vitor Sessak +;* +;* This algorithm (though not any of the implementation details) is +;* based on libdjbfft by D. J. Bernstein. +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +; These functions are not individually interchangeable with the C versions. +; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results +; in blocks as conventient to the vector size. +; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 +%define pointer resq +%else +%define pointer resd +%endif + +struc FFTContext + .nbits: resd 1 + .reverse: resd 1 + .revtab: pointer 1 + .tmpbuf: pointer 1 + .mdctsize: resd 1 + .mdctbits: resd 1 + .tcos: pointer 1 + .tsin: pointer 1 + .fftperm: pointer 1 + .fftcalc: pointer 1 + .imdctcalc:pointer 1 + .imdcthalf:pointer 1 +endstruc + +SECTION_RODATA 32 + +%define M_SQRT1_2 0.70710678118654752440 +%define M_COS_PI_1_8 0.923879532511287 +%define M_COS_PI_3_8 0.38268343236509 + +ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 +ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 + +ps_root2: times 8 dd M_SQRT1_2 +ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 +ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 + +perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 +perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 +ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 +ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 +ps_m1p1: dd 1<<31, 0 + +cextern ps_neg + +%assign i 16 +%rep 14 +cextern cos_ %+ i +%assign i i<<1 +%endrep + +%if ARCH_X86_64 + %define pointer dq +%else + %define pointer dd +%endif + +%macro IF0 1+ +%endmacro +%macro IF1 1+ + %1 +%endmacro + +SECTION .text + +; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6} +; %2 = {r1,i1,r3,i3,r5,i5,r7,i7} +; %3, %4, %5 tmp +; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3} +; %2 = {r4,r5,r6,r7,i4,i5,i6,i7} +%macro T8_AVX 5 + vsubps %5, %1, %2 ; v = %1 - %2 + vaddps %3, %1, %2 ; w = %1 + %2 + vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1 + vpermilps %2, %2, [perm1] + vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6} + vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5} + vsubps %4, %5, %1 ; s = r - q + vaddps %1, %5, %1 ; u = r + q + vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8} + vshufps %5, %4, %1, 0xbb + vshufps %3, %4, %1, 0xee + vperm2f128 %3, %3, %5, 0x13 + vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1} + vshufps %2, %1, %4, 0xdd + vshufps %1, %1, %4, 0x88 + vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4} + vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7} + vsubps %5, %1, %3 + vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8} + vsubps %2, %4, %1 ; %2 = v - w + vaddps %1, %4, %1 ; %1 = v + w +%endmacro + +; In SSE mode do one fft4 transforms +; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} +; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} +; +; In AVX mode do two fft4 transforms +; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7} +; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7} +%macro T4_SSE 3 + subps %3, %1, %2 ; {t3,t4,-t8,t7} + addps %1, %1, %2 ; {t1,t2,t6,t5} + xorps %3, %3, [ps_p1p1m1p1] + shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8} + shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4} + subps %3, %1, %2 ; {r2,i2,r3,i3} + addps %1, %1, %2 ; {r0,i0,r1,i1} + shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3} + shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3} +%endmacro + +; In SSE mode do one FFT8 +; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} +; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7} +; +; In AVX mode do two FFT8 +; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11} +; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15} +; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11} +; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15} +%macro T8_SSE 6 + addps %6, %3, %4 ; {t1,t2,t3,t4} + subps %3, %3, %4 ; {r5,i5,r7,i7} + shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7} + mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} + mulps %4, %4, [ps_root2] + addps %3, %3, %4 ; {t8,t7,ta,t9} + shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta} + shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8} + subps %3, %6, %4 ; {t6,t5,tc,tb} + addps %6, %6, %4 ; {t1,t2,t9,ta} + shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc} + shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb} + subps %3, %1, %6 ; {r4,r5,r6,r7} + addps %1, %1, %6 ; {r0,r1,r2,r3} + subps %4, %2, %5 ; {i4,i5,i6,i7} + addps %2, %2, %5 ; {i0,i1,i2,i3} +%endmacro + +%macro INTERL 5 +%if cpuflag(avx) + vunpckhps %3, %2, %1 + vunpcklps %2, %2, %1 + vextractf128 %4(%5), %2, 0 + vextractf128 %4 %+ H(%5), %3, 0 + vextractf128 %4(%5 + 1), %2, 1 + vextractf128 %4 %+ H(%5 + 1), %3, 1 +%elif cpuflag(sse) + mova %3, %2 + unpcklps %2, %1 + unpckhps %3, %1 + mova %4(%5), %2 + mova %4(%5+1), %3 +%endif +%endmacro + +; scheduled for cpu-bound sizes +%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim +IF%1 mova m4, Z(4) +IF%1 mova m5, Z(5) + mova m0, %2 ; wre + mova m1, %3 ; wim + mulps m2, m4, m0 ; r2*wre +IF%1 mova m6, Z2(6) + mulps m3, m5, m1 ; i2*wim +IF%1 mova m7, Z2(7) + mulps m4, m4, m1 ; r2*wim + mulps m5, m5, m0 ; i2*wre + addps m2, m2, m3 ; r2*wre + i2*wim + mulps m3, m1, m7 ; i3*wim + subps m5, m5, m4 ; i2*wre - r2*wim + mulps m1, m1, m6 ; r3*wim + mulps m4, m0, m6 ; r3*wre + mulps m0, m0, m7 ; i3*wre + subps m4, m4, m3 ; r3*wre - i3*wim + mova m3, Z(0) + addps m0, m0, m1 ; i3*wre + r3*wim + subps m1, m4, m2 ; t3 + addps m4, m4, m2 ; t5 + subps m3, m3, m4 ; r2 + addps m4, m4, Z(0) ; r0 + mova m6, Z(2) + mova Z(4), m3 + mova Z(0), m4 + subps m3, m5, m0 ; t4 + subps m4, m6, m3 ; r3 + addps m3, m3, m6 ; r1 + mova Z2(6), m4 + mova Z(2), m3 + mova m2, Z(3) + addps m3, m5, m0 ; t6 + subps m2, m2, m1 ; i3 + mova m7, Z(1) + addps m1, m1, Z(3) ; i1 + mova Z2(7), m2 + mova Z(3), m1 + subps m4, m7, m3 ; i2 + addps m3, m3, m7 ; i0 + mova Z(5), m4 + mova Z(1), m3 +%endmacro + +; scheduled to avoid store->load aliasing +%macro PASS_BIG 1 ; (!interleave) + mova m4, Z(4) ; r2 + mova m5, Z(5) ; i2 + mova m0, [wq] ; wre + mova m1, [wq+o1q] ; wim + mulps m2, m4, m0 ; r2*wre + mova m6, Z2(6) ; r3 + mulps m3, m5, m1 ; i2*wim + mova m7, Z2(7) ; i3 + mulps m4, m4, m1 ; r2*wim + mulps m5, m5, m0 ; i2*wre + addps m2, m2, m3 ; r2*wre + i2*wim + mulps m3, m1, m7 ; i3*wim + mulps m1, m1, m6 ; r3*wim + subps m5, m5, m4 ; i2*wre - r2*wim + mulps m4, m0, m6 ; r3*wre + mulps m0, m0, m7 ; i3*wre + subps m4, m4, m3 ; r3*wre - i3*wim + mova m3, Z(0) + addps m0, m0, m1 ; i3*wre + r3*wim + subps m1, m4, m2 ; t3 + addps m4, m4, m2 ; t5 + subps m3, m3, m4 ; r2 + addps m4, m4, Z(0) ; r0 + mova m6, Z(2) + mova Z(4), m3 + mova Z(0), m4 + subps m3, m5, m0 ; t4 + subps m4, m6, m3 ; r3 + addps m3, m3, m6 ; r1 +IF%1 mova Z2(6), m4 +IF%1 mova Z(2), m3 + mova m2, Z(3) + addps m5, m5, m0 ; t6 + subps m2, m2, m1 ; i3 + mova m7, Z(1) + addps m1, m1, Z(3) ; i1 +IF%1 mova Z2(7), m2 +IF%1 mova Z(3), m1 + subps m6, m7, m5 ; i2 + addps m5, m5, m7 ; i0 +IF%1 mova Z(5), m6 +IF%1 mova Z(1), m5 +%if %1==0 + INTERL m1, m3, m7, Z, 2 + INTERL m2, m4, m0, Z2, 6 + + mova m1, Z(0) + mova m2, Z(4) + + INTERL m5, m1, m3, Z, 0 + INTERL m6, m2, m7, Z, 4 +%endif +%endmacro + +%define Z(x) [r0+mmsize*x] +%define Z2(x) [r0+mmsize*x] +%define ZH(x) [r0+mmsize*x+mmsize/2] + +INIT_YMM avx + +%if HAVE_AVX_EXTERNAL +align 16 +fft8_avx: + mova m0, Z(0) + mova m1, Z(1) + T8_AVX m0, m1, m2, m3, m4 + mova Z(0), m0 + mova Z(1), m1 + ret + + +align 16 +fft16_avx: + mova m2, Z(2) + mova m3, Z(3) + T4_SSE m2, m3, m7 + + mova m0, Z(0) + mova m1, Z(1) + T8_AVX m0, m1, m4, m5, m7 + + mova m4, [ps_cos16_1] + mova m5, [ps_cos16_2] + vmulps m6, m2, m4 + vmulps m7, m3, m5 + vaddps m7, m7, m6 + vmulps m2, m2, m5 + vmulps m3, m3, m4 + vsubps m3, m3, m2 + vblendps m2, m7, m3, 0xf0 + vperm2f128 m3, m7, m3, 0x21 + vaddps m4, m2, m3 + vsubps m2, m3, m2 + vperm2f128 m2, m2, m2, 0x01 + vsubps m3, m1, m2 + vaddps m1, m1, m2 + vsubps m5, m0, m4 + vaddps m0, m0, m4 + vextractf128 Z(0), m0, 0 + vextractf128 ZH(0), m1, 0 + vextractf128 Z(1), m0, 1 + vextractf128 ZH(1), m1, 1 + vextractf128 Z(2), m5, 0 + vextractf128 ZH(2), m3, 0 + vextractf128 Z(3), m5, 1 + vextractf128 ZH(3), m3, 1 + ret + +align 16 +fft32_avx: + call fft16_avx + + mova m0, Z(4) + mova m1, Z(5) + + T4_SSE m0, m1, m4 + + mova m2, Z(6) + mova m3, Z(7) + + T8_SSE m0, m1, m2, m3, m4, m6 + ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11} + ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15} + + vperm2f128 m4, m0, m2, 0x20 + vperm2f128 m5, m1, m3, 0x20 + vperm2f128 m6, m0, m2, 0x31 + vperm2f128 m7, m1, m3, 0x31 + + PASS_SMALL 0, [cos_32], [cos_32+32] + + ret + +fft32_interleave_avx: + call fft32_avx + mov r2d, 32 +.deint_loop: + mova m2, Z(0) + mova m3, Z(1) + vunpcklps m0, m2, m3 + vunpckhps m1, m2, m3 + vextractf128 Z(0), m0, 0 + vextractf128 ZH(0), m1, 0 + vextractf128 Z(1), m0, 1 + vextractf128 ZH(1), m1, 1 + add r0, mmsize*2 + sub r2d, mmsize/4 + jg .deint_loop + ret + +%endif + +INIT_XMM sse + +align 16 +fft4_avx: +fft4_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova Z(0), m0 + mova Z(1), m1 + ret + +align 16 +fft8_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova m2, Z(2) + mova m3, Z(3) + T8_SSE m0, m1, m2, m3, m4, m5 + mova Z(0), m0 + mova Z(1), m1 + mova Z(2), m2 + mova Z(3), m3 + ret + +align 16 +fft16_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova m2, Z(2) + mova m3, Z(3) + T8_SSE m0, m1, m2, m3, m4, m5 + mova m4, Z(4) + mova m5, Z(5) + mova Z(0), m0 + mova Z(1), m1 + mova Z(2), m2 + mova Z(3), m3 + T4_SSE m4, m5, m6 + mova m6, Z2(6) + mova m7, Z2(7) + T4_SSE m6, m7, m0 + PASS_SMALL 0, [cos_16], [cos_16+16] + ret + + +%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] +%define Z2(x) [zcq + o3q + mmsize*(x&1)] +%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] +%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2] + +%macro DECL_PASS 2+ ; name, payload +align 16 +%1: +DEFINE_ARGS zc, w, n, o1, o3 + lea o3q, [nq*3] + lea o1q, [nq*8] + shl o3q, 4 +.loop: + %2 + add zcq, mmsize*2 + add wq, mmsize + sub nd, mmsize/8 + jg .loop + rep ret +%endmacro + +%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs + lea r2, [dispatch_tab%1] + mov r2, [r2 + (%2q-2)*gprsize] +%ifdef PIC + lea r3, [$$] + add r2, r3 +%endif + call r2 +%endmacro ; FFT_DISPATCH + +INIT_YMM avx + +%if HAVE_AVX_EXTERNAL +DECL_PASS pass_avx, PASS_BIG 1 +DECL_PASS pass_interleave_avx, PASS_BIG 0 + +cglobal fft_calc, 2,5,8 + mov r3d, [r0 + FFTContext.nbits] + mov r0, r1 + mov r1, r3 + FFT_DISPATCH _interleave %+ SUFFIX, r1 + RET + +%endif + +INIT_XMM sse + +DECL_PASS pass_sse, PASS_BIG 1 +DECL_PASS pass_interleave_sse, PASS_BIG 0 + +INIT_XMM sse +cglobal fft_calc, 2,5,8 + mov r3d, [r0 + FFTContext.nbits] + PUSH r1 + PUSH r3 + mov r0, r1 + mov r1, r3 + FFT_DISPATCH _interleave %+ SUFFIX, r1 + POP rcx + POP r4 + cmp rcx, 3+(mmsize/16) + jg .end + mov r2, -1 + add rcx, 3 + shl r2, cl + sub r4, r2 +.loop: + movaps xmm0, [r4 + r2] + movaps xmm1, xmm0 + unpcklps xmm0, [r4 + r2 + 16] + unpckhps xmm1, [r4 + r2 + 16] + movaps [r4 + r2], xmm0 + movaps [r4 + r2 + 16], xmm1 + add r2, mmsize*2 + jl .loop +.end: + RET + +cglobal fft_permute, 2,7,1 + mov r4, [r0 + FFTContext.revtab] + mov r5, [r0 + FFTContext.tmpbuf] + mov ecx, [r0 + FFTContext.nbits] + mov r2, 1 + shl r2, cl + xor r0, r0 +%if ARCH_X86_32 + mov r1, r1m +%endif +.loop: + movaps xmm0, [r1 + 8*r0] + movzx r6, word [r4 + 2*r0] + movzx r3, word [r4 + 2*r0 + 2] + movlps [r5 + 8*r6], xmm0 + movhps [r5 + 8*r3], xmm0 + add r0, 2 + cmp r0, r2 + jl .loop + shl r2, 3 + add r1, r2 + add r5, r2 + neg r2 +; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B +.loopcopy: + movaps xmm0, [r5 + r2] + movaps xmm1, [r5 + r2 + 16] + movaps [r1 + r2], xmm0 + movaps [r1 + r2 + 16], xmm1 + add r2, 32 + jl .loopcopy + RET + +INIT_XMM sse +cglobal imdct_calc, 3,5,3 + mov r3d, [r0 + FFTContext.mdctsize] + mov r4, [r0 + FFTContext.imdcthalf] + add r1, r3 + PUSH r3 + PUSH r1 +%if ARCH_X86_32 + push r2 + push r1 + push r0 +%else + sub rsp, 8+32*WIN64 ; allocate win64 shadow space +%endif + call r4 +%if ARCH_X86_32 + add esp, 12 +%else + add rsp, 8+32*WIN64 +%endif + POP r1 + POP r3 + lea r0, [r1 + 2*r3] + mov r2, r3 + sub r3, mmsize + neg r2 + mova m2, [ps_neg] +.loop: + mova m0, [r1 + r3] + mova m1, [r0 + r2] + shufps m0, m0, 0x1b + shufps m1, m1, 0x1b + xorps m0, m2 + mova [r0 + r3], m1 + mova [r1 + r2], m0 + sub r3, mmsize + add r2, mmsize + jl .loop + RET + +%ifdef PIC +%define SECTION_REL - $$ +%else +%define SECTION_REL +%endif + +%macro DECL_FFT 1-2 ; nbits, suffix +%ifidn %0, 1 +%xdefine fullsuffix SUFFIX +%else +%xdefine fullsuffix %2 %+ SUFFIX +%endif +%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL +%if %1>=5 +%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL +%endif +%if %1>=6 +%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL +%endif + +%assign n 1<<%1 +%rep 18-%1 +%assign n2 n/2 +%assign n4 n/4 +%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL + +align 16 +fft %+ n %+ fullsuffix: + call fft %+ n2 %+ SUFFIX + add r0, n*4 - (n&(-2<<%1)) + call fft %+ n4 %+ SUFFIX + add r0, n*2 - (n2&(-2<<%1)) + call fft %+ n4 %+ SUFFIX + sub r0, n*6 + (n2&(-2<<%1)) + lea r1, [cos_ %+ n] + mov r2d, n4/2 + jmp pass %+ fullsuffix + +%assign n n*2 +%endrep +%undef n + +align 8 +dispatch_tab %+ fullsuffix: pointer list_of_fft +%endmacro ; DECL_FFT + +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +DECL_FFT 6 +DECL_FFT 6, _interleave +%endif +INIT_XMM sse +DECL_FFT 5 +DECL_FFT 5, _interleave + +INIT_XMM sse +%undef mulps +%undef addps +%undef subps +%undef unpcklps +%undef unpckhps + +%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 + movaps xmm0, [%3+%2*4] + movaps xmm1, [%3+%1*4-0x10] + movaps xmm2, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm1, xmm2, 0x77 + movlps xmm4, [%4+%2*2] + movlps xmm5, [%5+%2*2+0x0] + movhps xmm4, [%4+%1*2-0x8] + movhps xmm5, [%5+%1*2-0x8] + movaps xmm2, xmm0 + movaps xmm3, xmm1 + mulps xmm0, xmm5 + mulps xmm1, xmm4 + mulps xmm2, xmm4 + mulps xmm3, xmm5 + subps xmm1, xmm0 + addps xmm2, xmm3 + movaps xmm0, xmm1 + unpcklps xmm1, xmm2 + unpckhps xmm0, xmm2 +%endmacro + +%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 + mulps m6, %3, [%5+%1] + mulps m7, %2, [%5+%1] + mulps %2, %2, [%6+%1] + mulps %3, %3, [%6+%1] + subps %2, %2, m6 + addps %3, %3, m7 +%endmacro + +%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 +.post: +%if cpuflag(avx) + vmovaps ymm1, [%3+%1*2] + vmovaps ymm0, [%3+%1*2+0x20] + vmovaps ymm3, [%3+%2*2] + vmovaps ymm2, [%3+%2*2+0x20] + + CMUL %1, ymm0, ymm1, %3, %4, %5 + CMUL %2, ymm2, ymm3, %3, %4, %5 + vshufps ymm1, ymm1, ymm1, 0x1b + vshufps ymm3, ymm3, ymm3, 0x1b + vperm2f128 ymm1, ymm1, ymm1, 0x01 + vperm2f128 ymm3, ymm3, ymm3, 0x01 + vunpcklps ymm6, ymm2, ymm1 + vunpckhps ymm4, ymm2, ymm1 + vunpcklps ymm7, ymm0, ymm3 + vunpckhps ymm5, ymm0, ymm3 + + vextractf128 [%3+%1*2], ymm7, 0 + vextractf128 [%3+%1*2+0x10], ymm5, 0 + vextractf128 [%3+%1*2+0x20], ymm7, 1 + vextractf128 [%3+%1*2+0x30], ymm5, 1 + + vextractf128 [%3+%2*2], ymm6, 0 + vextractf128 [%3+%2*2+0x10], ymm4, 0 + vextractf128 [%3+%2*2+0x20], ymm6, 1 + vextractf128 [%3+%2*2+0x30], ymm4, 1 + sub %2, 0x20 + add %1, 0x20 + jl .post +%else + movaps xmm1, [%3+%1*2] + movaps xmm0, [%3+%1*2+0x10] + CMUL %1, xmm0, xmm1, %3, %4, %5 + movaps xmm5, [%3+%2*2] + movaps xmm4, [%3+%2*2+0x10] + CMUL %2, xmm4, xmm5, %3, %4, %5 + shufps xmm1, xmm1, 0x1b + shufps xmm5, xmm5, 0x1b + movaps xmm6, xmm4 + unpckhps xmm4, xmm1 + unpcklps xmm6, xmm1 + movaps xmm2, xmm0 + unpcklps xmm0, xmm5 + unpckhps xmm2, xmm5 + movaps [%3+%2*2], xmm6 + movaps [%3+%2*2+0x10], xmm4 + movaps [%3+%1*2], xmm0 + movaps [%3+%1*2+0x10], xmm2 + sub %2, 0x10 + add %1, 0x10 + jl .post +%endif +%endmacro + +%macro DECL_IMDCT 0 +cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input +%if ARCH_X86_64 +%define rrevtab r7 +%define rtcos r8 +%define rtsin r9 +%else +%define rrevtab r6 +%define rtsin r6 +%define rtcos r5 +%endif + mov r3d, [r0+FFTContext.mdctsize] + add r2, r3 + shr r3, 1 + mov rtcos, [r0+FFTContext.tcos] + mov rtsin, [r0+FFTContext.tsin] + add rtcos, r3 + add rtsin, r3 +%if ARCH_X86_64 == 0 + push rtcos + push rtsin +%endif + shr r3, 1 + mov rrevtab, [r0+FFTContext.revtab] + add rrevtab, r3 +%if ARCH_X86_64 == 0 + push rrevtab +%endif + + sub r3, 4 +%if ARCH_X86_64 + xor r4, r4 + sub r4, r3 +%endif +.pre: +%if ARCH_X86_64 == 0 +;unspill + xor r4, r4 + sub r4, r3 + mov rtcos, [esp+8] + mov rtsin, [esp+4] +%endif + + PREROTATER r4, r3, r2, rtcos, rtsin +%if ARCH_X86_64 + movzx r5, word [rrevtab+r4-4] + movzx r6, word [rrevtab+r4-2] + movzx r10, word [rrevtab+r3] + movzx r11, word [rrevtab+r3+2] + movlps [r1+r5 *8], xmm0 + movhps [r1+r6 *8], xmm0 + movlps [r1+r10*8], xmm1 + movhps [r1+r11*8], xmm1 + add r4, 4 +%else + mov r6, [esp] + movzx r5, word [r6+r4-4] + movzx r4, word [r6+r4-2] + movlps [r1+r5*8], xmm0 + movhps [r1+r4*8], xmm0 + movzx r5, word [r6+r3] + movzx r4, word [r6+r3+2] + movlps [r1+r5*8], xmm1 + movhps [r1+r4*8], xmm1 +%endif + sub r3, 4 + jns .pre + + mov r5, r0 + mov r6, r1 + mov r0, r1 + mov r1d, [r5+FFTContext.nbits] + + FFT_DISPATCH SUFFIX, r1 + + mov r0d, [r5+FFTContext.mdctsize] + add r6, r0 + shr r0, 1 +%if ARCH_X86_64 == 0 +%define rtcos r2 +%define rtsin r3 + mov rtcos, [esp+8] + mov rtsin, [esp+4] +%endif + neg r0 + mov r1, -mmsize + sub r1, r0 + POSROTATESHUF r0, r1, r6, rtcos, rtsin +%if ARCH_X86_64 == 0 + add esp, 12 +%endif + RET +%endmacro + +DECL_IMDCT + +INIT_YMM avx + +%if HAVE_AVX_EXTERNAL +DECL_IMDCT +%endif diff --git a/media/ffvpx/libavcodec/x86/fft.h b/media/ffvpx/libavcodec/x86/fft.h new file mode 100644 index 0000000000..37418ec1f4 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fft.h @@ -0,0 +1,32 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_FFT_H +#define AVCODEC_X86_FFT_H + +#include "libavcodec/fft.h" + +void ff_fft_permute_sse(FFTContext *s, FFTComplex *z); +void ff_fft_calc_avx(FFTContext *s, FFTComplex *z); +void ff_fft_calc_sse(FFTContext *s, FFTComplex *z); + +void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); + +#endif /* AVCODEC_X86_FFT_H */ diff --git a/media/ffvpx/libavcodec/x86/fft_init.c b/media/ffvpx/libavcodec/x86/fft_init.c new file mode 100644 index 0000000000..df79d57dc7 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fft_init.c @@ -0,0 +1,47 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" + +#include "fft.h" + +av_cold void ff_fft_init_x86(FFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (s->nbits > 16) + return; + + if (EXTERNAL_SSE(cpu_flags)) { + s->imdct_calc = ff_imdct_calc_sse; + s->imdct_half = ff_imdct_half_sse; + s->fft_permute = ff_fft_permute_sse; + s->fft_calc = ff_fft_calc_sse; + s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; + } + + if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) { + s->imdct_half = ff_imdct_half_avx; + s->fft_calc = ff_fft_calc_avx; + s->fft_permutation = FF_FFT_PERM_AVX; + } +} diff --git a/media/ffvpx/libavcodec/x86/flacdsp.asm b/media/ffvpx/libavcodec/x86/flacdsp.asm new file mode 100644 index 0000000000..44416e4dfd --- /dev/null +++ b/media/ffvpx/libavcodec/x86/flacdsp.asm @@ -0,0 +1,326 @@ +;****************************************************************************** +;* FLAC DSP SIMD optimizations +;* +;* Copyright (C) 2014 Loren Merritt +;* Copyright (C) 2014 James Almer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +vector: db 0,1,4,5,8,9,12,13,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,1,4,5,8,9,12,13, + +SECTION .text + +%macro PMACSDQL 5 +%if cpuflag(xop) + pmacsdql %1, %2, %3, %1 +%else + pmuldq %2, %3 + paddq %1, %2 +%endif +%endmacro + +%macro LPC_32 1 +INIT_XMM %1 +cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j + sub lend, pred_orderd + jle .ret + lea decodedq, [decodedq+pred_orderq*4-8] + lea coeffsq, [coeffsq+pred_orderq*4] + neg pred_orderq + movd m4, qlevelm +ALIGN 16 +.loop_sample: + movd m0, [decodedq+pred_orderq*4+8] + add decodedq, 8 + movd m1, [coeffsq+pred_orderq*4] + pxor m2, m2 + pxor m3, m3 + lea jq, [pred_orderq+1] + test jq, jq + jz .end_order +.loop_order: + PMACSDQL m2, m0, m1, m2, m0 + movd m0, [decodedq+jq*4] + PMACSDQL m3, m1, m0, m3, m1 + movd m1, [coeffsq+jq*4] + inc jq + jl .loop_order +.end_order: + PMACSDQL m2, m0, m1, m2, m0 + psrlq m2, m4 + movd m0, [decodedq] + paddd m0, m2 + movd [decodedq], m0 + sub lend, 2 + jl .ret + PMACSDQL m3, m1, m0, m3, m1 + psrlq m3, m4 + movd m1, [decodedq+4] + paddd m1, m3 + movd [decodedq+4], m1 + jg .loop_sample +.ret: + RET +%endmacro + +%if HAVE_XOP_EXTERNAL +LPC_32 xop +%endif +LPC_32 sse4 + +;---------------------------------------------------------------------------------- +;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels, +; int len, int shift); +;---------------------------------------------------------------------------------- +%macro FLAC_DECORRELATE_16 3-4 +cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len +%ifidn %1, indep2 + VBROADCASTI128 m2, [vector] +%endif +%if ARCH_X86_32 + mov lend, lenm +%endif + movd m3, r4m + shl lend, 2 + mov in1q, [in0q + gprsize] + mov in0q, [in0q] + mov outq, [outq] + add in1q, lenq + add in0q, lenq + add outq, lenq + neg lenq + +align 16 +.loop: + mova m0, [in0q + lenq] + mova m1, [in1q + lenq] +%ifidn %1, ms + psrad m2, m1, 1 + psubd m0, m2 +%endif +%ifnidn %1, indep2 + p%4d m2, m0, m1 + packssdw m%2, m%2 + packssdw m%3, m%3 + punpcklwd m%2, m%3 + psllw m%2, m3 +%else + pslld m%2, m3 + pslld m%3, m3 + pshufb m%2, m%2, m2 + pshufb m%3, m%3, m2 + punpcklwd m%2, m%3 +%endif + mova [outq + lenq], m%2 + add lenq, 16 + jl .loop + RET +%endmacro + +INIT_XMM sse2 +FLAC_DECORRELATE_16 ls, 0, 2, sub +FLAC_DECORRELATE_16 rs, 2, 1, add +FLAC_DECORRELATE_16 ms, 2, 0, add + +;---------------------------------------------------------------------------------- +;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels, +; int len, int shift); +;---------------------------------------------------------------------------------- +%macro FLAC_DECORRELATE_32 5 +cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len +%if ARCH_X86_32 + mov lend, lenm +%endif + movd m3, r4m + mov in1q, [in0q + gprsize] + mov in0q, [in0q] + mov outq, [outq] + sub in1q, in0q + +align 16 +.loop: + mova m0, [in0q] + mova m1, [in0q + in1q] +%ifidn %1, ms + psrad m2, m1, 1 + psubd m0, m2 +%endif + p%5d m2, m0, m1 + pslld m%2, m3 + pslld m%3, m3 + + SBUTTERFLY dq, %2, %3, %4 + + mova [outq ], m%2 + mova [outq + mmsize], m%3 + + add in0q, mmsize + add outq, mmsize*2 + sub lend, mmsize/4 + jg .loop + RET +%endmacro + +INIT_XMM sse2 +FLAC_DECORRELATE_32 ls, 0, 2, 1, sub +FLAC_DECORRELATE_32 rs, 2, 1, 0, add +FLAC_DECORRELATE_32 ms, 2, 0, 1, add + +;----------------------------------------------------------------------------------------- +;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels, +; int len, int shift); +;----------------------------------------------------------------------------------------- +;%1 = bps +;%2 = channels +;%3 = last xmm reg used +;%4 = word/dword (shift instruction) +%macro FLAC_DECORRELATE_INDEP 4 +%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels +cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7 +%if ARCH_X86_32 +%if %2 == 6 + DEFINE_ARGS out, in0, in1, in2, in3, in4, in5 + %define lend dword r3m +%else + mov lend, lenm +%endif +%endif + movd m%3, r4m + +%assign %%i 1 +%rep %2-1 + mov in %+ %%i %+ q, [in0q+%%i*gprsize] +%assign %%i %%i+1 +%endrep + + mov in0q, [in0q] + mov outq, [outq] + +%assign %%i 1 +%rep %2-1 + sub in %+ %%i %+ q, in0q +%assign %%i %%i+1 +%endrep + +align 16 +.loop: + mova m0, [in0q] + +%assign %%i 1 +%rep REPCOUNT-1 + mova m %+ %%i, [in0q + in %+ %%i %+ q] +%assign %%i %%i+1 +%endrep + +%if %1 == 32 + +%if %2 == 8 + TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8 +%elif %2 == 6 + SBUTTERFLY dq, 0, 1, 6 + SBUTTERFLY dq, 2, 3, 6 + SBUTTERFLY dq, 4, 5, 6 + + punpcklqdq m6, m0, m2 + punpckhqdq m2, m4 + shufps m4, m0, 0xe4 + punpcklqdq m0, m1, m3 + punpckhqdq m3, m5 + shufps m5, m1, 0xe4 + SWAP 0,6,1,4,5,3 +%elif %2 == 4 + TRANSPOSE4x4D 0, 1, 2, 3, 4 +%else ; %2 == 2 + SBUTTERFLY dq, 0, 1, 2 +%endif + +%else ; %1 == 16 + +%if %2 == 8 + packssdw m0, [in0q + in4q] + packssdw m1, [in0q + in5q] + packssdw m2, [in0q + in6q] + packssdw m3, [in0q + in7q] + TRANSPOSE2x4x4W 0, 1, 2, 3, 4 +%elif %2 == 6 + packssdw m0, [in0q + in3q] + packssdw m1, [in0q + in4q] + packssdw m2, [in0q + in5q] + pshufd m3, m0, q1032 + punpcklwd m0, m1 + punpckhwd m1, m2 + punpcklwd m2, m3 + + shufps m3, m0, m2, q2020 + shufps m0, m1, q2031 + shufps m2, m1, q3131 + shufps m1, m2, m3, q3120 + shufps m3, m0, q0220 + shufps m0, m2, q3113 + SWAP 2, 0, 3 +%else ; %2 == 4 + packssdw m0, [in0q + in2q] + packssdw m1, [in0q + in3q] + SBUTTERFLY wd, 0, 1, 2 + SBUTTERFLY dq, 0, 1, 2 +%endif + +%endif + +%assign %%i 0 +%rep REPCOUNT + psll%4 m %+ %%i, m%3 +%assign %%i %%i+1 +%endrep + +%assign %%i 0 +%rep REPCOUNT + mova [outq + %%i*mmsize], m %+ %%i +%assign %%i %%i+1 +%endrep + + add in0q, mmsize + add outq, mmsize*REPCOUNT + sub lend, mmsize/4 + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro +FLAC_DECORRELATE_INDEP 32, 2, 3, d +FLAC_DECORRELATE_INDEP 16, 4, 3, w +FLAC_DECORRELATE_INDEP 32, 4, 5, d +FLAC_DECORRELATE_INDEP 16, 6, 4, w +FLAC_DECORRELATE_INDEP 32, 6, 7, d +%if ARCH_X86_64 +FLAC_DECORRELATE_INDEP 16, 8, 5, w +FLAC_DECORRELATE_INDEP 32, 8, 9, d +%endif + +INIT_XMM avx +FLAC_DECORRELATE_INDEP 32, 4, 5, d +FLAC_DECORRELATE_INDEP 32, 6, 7, d +%if ARCH_X86_64 +FLAC_DECORRELATE_INDEP 16, 8, 5, w +FLAC_DECORRELATE_INDEP 32, 8, 9, d +%endif diff --git a/media/ffvpx/libavcodec/x86/flacdsp_init.c b/media/ffvpx/libavcodec/x86/flacdsp_init.c new file mode 100644 index 0000000000..87daed7005 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/flacdsp_init.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2014 James Almer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavcodec/flacdsp.h" +#include "libavutil/x86/cpu.h" +#include "config.h" + +void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order, + int qlevel, int len); +void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order, + int qlevel, int len); + +#define DECORRELATE_FUNCS(fmt, opt) \ +void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift); \ +void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift); \ +void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift) + +#define DECORRELATE_IFUNCS(fmt, opt) \ +void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift); \ +void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift); \ +void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift); \ +void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift) + +DECORRELATE_FUNCS(16, sse2); +DECORRELATE_FUNCS(16, avx); +DECORRELATE_FUNCS(32, sse2); +DECORRELATE_FUNCS(32, avx); +DECORRELATE_IFUNCS(16, ssse3); +DECORRELATE_IFUNCS(16, avx); +DECORRELATE_IFUNCS(32, ssse3); +DECORRELATE_IFUNCS(32, avx); + +av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) { + if (fmt == AV_SAMPLE_FMT_S16) { + c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2; + c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2; + c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2; + } else if (fmt == AV_SAMPLE_FMT_S32) { + c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2; + c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2; + c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2; + } + } + if (EXTERNAL_SSSE3(cpu_flags)) { + if (fmt == AV_SAMPLE_FMT_S16) { + if (channels == 2) + c->decorrelate[0] = ff_flac_decorrelate_indep2_16_ssse3; + else if (channels == 4) + c->decorrelate[0] = ff_flac_decorrelate_indep4_16_ssse3; + else if (channels == 6) + c->decorrelate[0] = ff_flac_decorrelate_indep6_16_ssse3; + else if (ARCH_X86_64 && channels == 8) + c->decorrelate[0] = ff_flac_decorrelate_indep8_16_ssse3; + } else if (fmt == AV_SAMPLE_FMT_S32) { + if (channels == 2) + c->decorrelate[0] = ff_flac_decorrelate_indep2_32_ssse3; + else if (channels == 4) + c->decorrelate[0] = ff_flac_decorrelate_indep4_32_ssse3; + else if (channels == 6) + c->decorrelate[0] = ff_flac_decorrelate_indep6_32_ssse3; + else if (ARCH_X86_64 && channels == 8) + c->decorrelate[0] = ff_flac_decorrelate_indep8_32_ssse3; + } + } + if (EXTERNAL_SSE4(cpu_flags)) { + c->lpc32 = ff_flac_lpc_32_sse4; + } + if (EXTERNAL_AVX(cpu_flags)) { + if (fmt == AV_SAMPLE_FMT_S16) { + if (ARCH_X86_64 && channels == 8) + c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx; + } else if (fmt == AV_SAMPLE_FMT_S32) { + if (channels == 4) + c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx; + else if (channels == 6) + c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx; + else if (ARCH_X86_64 && channels == 8) + c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx; + } + } + if (EXTERNAL_XOP(cpu_flags)) { + c->lpc32 = ff_flac_lpc_32_xop; + } +#endif /* HAVE_X86ASM */ +} diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred.asm b/media/ffvpx/libavcodec/x86/h264_intrapred.asm new file mode 100644 index 0000000000..8a38ba2bb5 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/h264_intrapred.asm @@ -0,0 +1,2044 @@ +;****************************************************************************** +;* H.264 intra prediction asm optimizations +;* Copyright (c) 2010 Fiona Glaser +;* Copyright (c) 2010 Holger Lubitz +;* Copyright (c) 2010 Loren Merritt +;* Copyright (c) 2010 Ronald S. Bultje +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +tm_shuf: times 8 db 0x03, 0x80 +pw_ff00: times 8 dw 0xff00 +plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 + db 1, 2, 3, 4, 5, 6, 7, 8 +plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 + db 1, 2, 3, 4, 0, 0, 0, 0 +pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 +pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 +pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 +pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 + +SECTION .text + +cextern pb_1 +cextern pb_3 +cextern pw_4 +cextern pw_8 + +;----------------------------------------------------------------------------- +; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_XMM sse +cglobal pred16x16_vertical_8, 2,3 + sub r0, r1 + mov r2, 4 + movaps xmm0, [r0] +.loop: + movaps [r0+r1*1], xmm0 + movaps [r0+r1*2], xmm0 + lea r0, [r0+r1*2] + movaps [r0+r1*1], xmm0 + movaps [r0+r1*2], xmm0 + lea r0, [r0+r1*2] + dec r2 + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED16x16_H 0 +cglobal pred16x16_horizontal_8, 2,3 + mov r2, 8 +%if cpuflag(ssse3) + mova m2, [pb_3] +%endif +.loop: + movd m0, [r0+r1*0-4] + movd m1, [r0+r1*1-4] + +%if cpuflag(ssse3) + pshufb m0, m2 + pshufb m1, m2 +%else + punpcklbw m0, m0 + punpcklbw m1, m1 + SPLATW m0, m0, 3 + SPLATW m1, m1, 3 + mova [r0+r1*0+8], m0 + mova [r0+r1*1+8], m1 +%endif + + mova [r0+r1*0], m0 + mova [r0+r1*1], m1 + lea r0, [r0+r1*2] + dec r2 + jg .loop + RET +%endmacro + +INIT_MMX mmxext +PRED16x16_H +INIT_XMM ssse3 +PRED16x16_H + +;----------------------------------------------------------------------------- +; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED16x16_DC 0 +cglobal pred16x16_dc_8, 2,7 + mov r4, r0 + sub r0, r1 + pxor mm0, mm0 + pxor mm1, mm1 + psadbw mm0, [r0+0] + psadbw mm1, [r0+8] + dec r0 + movzx r5d, byte [r0+r1*1] + paddw mm0, mm1 + movd r6d, mm0 + lea r0, [r0+r1*2] +%rep 7 + movzx r2d, byte [r0+r1*0] + movzx r3d, byte [r0+r1*1] + add r5d, r2d + add r6d, r3d + lea r0, [r0+r1*2] +%endrep + movzx r2d, byte [r0+r1*0] + add r5d, r6d + lea r2d, [r2+r5+16] + shr r2d, 5 +%if cpuflag(ssse3) + pxor m1, m1 +%endif + SPLATB_REG m0, r2, m1 + + mov r3d, 4 +.loop: + mova [r4+r1*0], m0 + mova [r4+r1*1], m0 + lea r4, [r4+r1*2] + mova [r4+r1*0], m0 + mova [r4+r1*1], m0 + lea r4, [r4+r1*2] + dec r3d + jg .loop + RET +%endmacro + +INIT_XMM sse2 +PRED16x16_DC +INIT_XMM ssse3 +PRED16x16_DC + +;----------------------------------------------------------------------------- +; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_XMM sse2 +cglobal pred16x16_tm_vp8_8, 2,6,6 + sub r0, r1 + pxor xmm2, xmm2 + movdqa xmm0, [r0] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + movzx r4d, byte [r0-1] + mov r5d, 8 +.loop: + movzx r2d, byte [r0+r1*1-1] + movzx r3d, byte [r0+r1*2-1] + sub r2d, r4d + sub r3d, r4d + movd xmm2, r2d + movd xmm4, r3d + pshuflw xmm2, xmm2, 0 + pshuflw xmm4, xmm4, 0 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm4, xmm4 + movdqa xmm3, xmm2 + movdqa xmm5, xmm4 + paddw xmm2, xmm0 + paddw xmm3, xmm1 + paddw xmm4, xmm0 + paddw xmm5, xmm1 + packuswb xmm2, xmm3 + packuswb xmm4, xmm5 + movdqa [r0+r1*1], xmm2 + movdqa [r0+r1*2], xmm4 + lea r0, [r0+r1*2] + dec r5d + jg .loop + RET + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration + sub dstq, strideq + pmovzxbw m0, [dstq] + vpbroadcastb xm1, [r0-1] + pmovzxbw m1, xm1 + psubw m0, m1 + mov iterationd, 4 + lea stride3q, [strideq*3] +.loop: + vpbroadcastb xm1, [dstq+strideq*1-1] + vpbroadcastb xm2, [dstq+strideq*2-1] + vpbroadcastb xm3, [dstq+stride3q-1] + vpbroadcastb xm4, [dstq+strideq*4-1] + pmovzxbw m1, xm1 + pmovzxbw m2, xm2 + pmovzxbw m3, xm3 + pmovzxbw m4, xm4 + paddw m1, m0 + paddw m2, m0 + paddw m3, m0 + paddw m4, m0 + vpackuswb m1, m1, m2 + vpackuswb m3, m3, m4 + vpermq m1, m1, q3120 + vpermq m3, m3, q3120 + movdqa [dstq+strideq*1], xm1 + vextracti128 [dstq+strideq*2], m1, 1 + movdqa [dstq+stride3q*1], xm3 + vextracti128 [dstq+strideq*4], m3, 1 + lea dstq, [dstq+strideq*4] + dec iterationd + jg .loop + RET +%endif + +;----------------------------------------------------------------------------- +; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro H264_PRED16x16_PLANE 1 +cglobal pred16x16_plane_%1_8, 2,9,7 + mov r2, r1 ; +stride + neg r1 ; -stride + + movh m0, [r0+r1 -1] +%if cpuflag(ssse3) + movhps m0, [r0+r1 +8] + pmaddubsw m0, [plane_shuf] ; H coefficients +%else ; sse2 + pxor m2, m2 + movh m1, [r0+r1 +8] + punpcklbw m0, m2 + punpcklbw m1, m2 + pmullw m0, [pw_m8tom1] + pmullw m1, [pw_1to8] + paddw m0, m1 +%endif + movhlps m1, m0 + paddw m0, m1 + PSHUFLW m1, m0, 0xE + paddw m0, m1 + PSHUFLW m1, m0, 0x1 + paddw m0, m1 ; sum of H coefficients + + lea r4, [r0+r2*8-1] + lea r3, [r0+r2*4-1] + add r4, r2 + +%if ARCH_X86_64 +%define e_reg r8 +%else +%define e_reg r0 +%endif + + movzx e_reg, byte [r3+r2*2 ] + movzx r5, byte [r4+r1 ] + sub r5, e_reg + + movzx e_reg, byte [r3+r2 ] + movzx r6, byte [r4 ] + sub r6, e_reg + lea r5, [r5+r6*2] + + movzx e_reg, byte [r3+r1 ] + movzx r6, byte [r4+r2*2 ] + sub r6, e_reg + lea r5, [r5+r6*4] + + movzx e_reg, byte [r3 ] +%if ARCH_X86_64 + movzx r7, byte [r4+r2 ] + sub r7, e_reg +%else + movzx r6, byte [r4+r2 ] + sub r6, e_reg + lea r5, [r5+r6*4] + sub r5, r6 +%endif + + lea e_reg, [r3+r1*4] + lea r3, [r4+r2*4] + + movzx r4, byte [e_reg+r2 ] + movzx r6, byte [r3 ] + sub r6, r4 +%if ARCH_X86_64 + lea r6, [r7+r6*2] + lea r5, [r5+r6*2] + add r5, r6 +%else + lea r5, [r5+r6*4] + lea r5, [r5+r6*2] +%endif + + movzx r4, byte [e_reg ] +%if ARCH_X86_64 + movzx r7, byte [r3 +r2 ] + sub r7, r4 + sub r5, r7 +%else + movzx r6, byte [r3 +r2 ] + sub r6, r4 + lea r5, [r5+r6*8] + sub r5, r6 +%endif + + movzx r4, byte [e_reg+r1 ] + movzx r6, byte [r3 +r2*2] + sub r6, r4 +%if ARCH_X86_64 + add r6, r7 +%endif + lea r5, [r5+r6*8] + + movzx r4, byte [e_reg+r2*2] + movzx r6, byte [r3 +r1 ] + sub r6, r4 + lea r5, [r5+r6*4] + add r5, r6 ; sum of V coefficients + +%if ARCH_X86_64 == 0 + mov r0, r0m +%endif + +%ifidn %1, h264 + lea r5, [r5*5+32] + sar r5, 6 +%elifidn %1, rv40 + lea r5, [r5*5] + sar r5, 6 +%elifidn %1, svq3 + test r5, r5 + lea r6, [r5+3] + cmovs r5, r6 + sar r5, 2 ; V/4 + lea r5, [r5*5] ; 5*(V/4) + test r5, r5 + lea r6, [r5+15] + cmovs r5, r6 + sar r5, 4 ; (5*(V/4))/16 +%endif + + movzx r4, byte [r0+r1 +15] + movzx r3, byte [r3+r2*2 ] + lea r3, [r3+r4+1] + shl r3, 4 + + movd r1d, m0 + movsx r1d, r1w +%ifnidn %1, svq3 +%ifidn %1, h264 + lea r1d, [r1d*5+32] +%else ; rv40 + lea r1d, [r1d*5] +%endif + sar r1d, 6 +%else ; svq3 + test r1d, r1d + lea r4d, [r1d+3] + cmovs r1d, r4d + sar r1d, 2 ; H/4 + lea r1d, [r1d*5] ; 5*(H/4) + test r1d, r1d + lea r4d, [r1d+15] + cmovs r1d, r4d + sar r1d, 4 ; (5*(H/4))/16 +%endif + movd m0, r1d + + add r1d, r5d + add r3d, r1d + shl r1d, 3 + sub r3d, r1d ; a + + movd m1, r5d + movd m3, r3d + SPLATW m0, m0, 0 ; H + SPLATW m1, m1, 0 ; V + SPLATW m3, m3, 0 ; a +%ifidn %1, svq3 + SWAP 0, 1 +%endif + mova m2, m0 + pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) + psllw m2, 3 + paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H + paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H + + mov r4, 8 +.loop: + mova m3, m0 ; b[0..7] + mova m4, m2 ; b[8..15] + psraw m3, 5 + psraw m4, 5 + packuswb m3, m4 + mova [r0], m3 + paddw m0, m1 + paddw m2, m1 + + mova m3, m0 ; b[0..7] + mova m4, m2 ; b[8..15] + psraw m3, 5 + psraw m4, 5 + packuswb m3, m4 + mova [r0+r2], m3 + paddw m0, m1 + paddw m2, m1 + + lea r0, [r0+r2*2] + dec r4 + jg .loop + RET +%endmacro + +INIT_XMM sse2 +H264_PRED16x16_PLANE h264 +H264_PRED16x16_PLANE rv40 +H264_PRED16x16_PLANE svq3 +INIT_XMM ssse3 +H264_PRED16x16_PLANE h264 +H264_PRED16x16_PLANE rv40 +H264_PRED16x16_PLANE svq3 + +;----------------------------------------------------------------------------- +; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro H264_PRED8x8_PLANE 0 +cglobal pred8x8_plane_8, 2,9,7 + mov r2, r1 ; +stride + neg r1 ; -stride + + movd m0, [r0+r1 -1] +%if cpuflag(ssse3) + movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary + pmaddubsw m0, [plane8_shuf] ; H coefficients +%else ; sse2 + pxor m2, m2 + movd m1, [r0+r1 +4] + punpckldq m0, m1 + punpcklbw m0, m2 + pmullw m0, [pw_m4to4] +%endif + movhlps m1, m0 + paddw m0, m1 + +%if notcpuflag(ssse3) + PSHUFLW m1, m0, 0xE + paddw m0, m1 +%endif ; !ssse3 + + PSHUFLW m1, m0, 0x1 + paddw m0, m1 ; sum of H coefficients + + lea r4, [r0+r2*4-1] + lea r3, [r0 -1] + add r4, r2 + +%if ARCH_X86_64 +%define e_reg r8 +%else +%define e_reg r0 +%endif + + movzx e_reg, byte [r3+r2*2 ] + movzx r5, byte [r4+r1 ] + sub r5, e_reg + + movzx e_reg, byte [r3 ] +%if ARCH_X86_64 + movzx r7, byte [r4+r2 ] + sub r7, e_reg + sub r5, r7 +%else + movzx r6, byte [r4+r2 ] + sub r6, e_reg + lea r5, [r5+r6*4] + sub r5, r6 +%endif + + movzx e_reg, byte [r3+r1 ] + movzx r6, byte [r4+r2*2 ] + sub r6, e_reg +%if ARCH_X86_64 + add r6, r7 +%endif + lea r5, [r5+r6*4] + + movzx e_reg, byte [r3+r2 ] + movzx r6, byte [r4 ] + sub r6, e_reg + lea r6, [r5+r6*2] + + lea r5, [r6*9+16] + lea r5, [r5+r6*8] + sar r5, 5 + +%if ARCH_X86_64 == 0 + mov r0, r0m +%endif + + movzx r3, byte [r4+r2*2 ] + movzx r4, byte [r0+r1 +7] + lea r3, [r3+r4+1] + shl r3, 4 + movd r1d, m0 + movsx r1d, r1w + imul r1d, 17 + add r1d, 16 + sar r1d, 5 + movd m0, r1d + add r1d, r5d + sub r3d, r1d + add r1d, r1d + sub r3d, r1d ; a + + movd m1, r5d + movd m3, r3d + SPLATW m0, m0, 0 ; H + SPLATW m1, m1, 0 ; V + SPLATW m3, m3, 0 ; a + pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) + paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H + + mov r4, 4 +ALIGN 16 +.loop: + mova m3, m0 ; b[0..7] + paddw m0, m1 + psraw m3, 5 + mova m4, m0 ; V+b[0..7] + paddw m0, m1 + psraw m4, 5 + packuswb m3, m4 + movh [r0], m3 + movhps [r0+r2], m3 + + lea r0, [r0+r2*2] + dec r4 + jg .loop + RET +%endmacro + +INIT_XMM sse2 +H264_PRED8x8_PLANE +INIT_XMM ssse3 +H264_PRED8x8_PLANE + +;----------------------------------------------------------------------------- +; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_MMX mmx +cglobal pred8x8_vertical_8, 2,2 + sub r0, r1 + movq mm0, [r0] +%rep 3 + movq [r0+r1*1], mm0 + movq [r0+r1*2], mm0 + lea r0, [r0+r1*2] +%endrep + movq [r0+r1*1], mm0 + movq [r0+r1*2], mm0 + RET + +;----------------------------------------------------------------------------- +; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8_H 0 +cglobal pred8x8_horizontal_8, 2,3 + mov r2, 4 +%if cpuflag(ssse3) + mova m2, [pb_3] +%endif +.loop: + SPLATB_LOAD m0, r0+r1*0-1, m2 + SPLATB_LOAD m1, r0+r1*1-1, m2 + mova [r0+r1*0], m0 + mova [r0+r1*1], m1 + lea r0, [r0+r1*2] + dec r2 + jg .loop + RET +%endmacro + +INIT_MMX mmxext +PRED8x8_H +INIT_MMX ssse3 +PRED8x8_H + +;----------------------------------------------------------------------------- +; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_MMX mmxext +cglobal pred8x8_top_dc_8, 2,5 + sub r0, r1 + movq mm0, [r0] + pxor mm1, mm1 + pxor mm2, mm2 + lea r2, [r0+r1*2] + punpckhbw mm1, mm0 + punpcklbw mm0, mm2 + psadbw mm1, mm2 ; s1 + lea r3, [r2+r1*2] + psadbw mm0, mm2 ; s0 + psrlw mm1, 1 + psrlw mm0, 1 + pavgw mm1, mm2 + lea r4, [r3+r1*2] + pavgw mm0, mm2 + pshufw mm1, mm1, 0 + pshufw mm0, mm0, 0 ; dc0 (w) + packuswb mm0, mm1 ; dc0,dc1 (b) + movq [r0+r1*1], mm0 + movq [r0+r1*2], mm0 + lea r0, [r3+r1*2] + movq [r2+r1*1], mm0 + movq [r2+r1*2], mm0 + movq [r3+r1*1], mm0 + movq [r3+r1*2], mm0 + movq [r0+r1*1], mm0 + movq [r0+r1*2], mm0 + RET + +;----------------------------------------------------------------------------- +; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_MMX mmxext +cglobal pred8x8_dc_8, 2,5 + sub r0, r1 + pxor m7, m7 + movd m0, [r0+0] + movd m1, [r0+4] + psadbw m0, m7 ; s0 + mov r4, r0 + psadbw m1, m7 ; s1 + + movzx r2d, byte [r0+r1*1-1] + movzx r3d, byte [r0+r1*2-1] + lea r0, [r0+r1*2] + add r2d, r3d + movzx r3d, byte [r0+r1*1-1] + add r2d, r3d + movzx r3d, byte [r0+r1*2-1] + add r2d, r3d + lea r0, [r0+r1*2] + movd m2, r2d ; s2 + movzx r2d, byte [r0+r1*1-1] + movzx r3d, byte [r0+r1*2-1] + lea r0, [r0+r1*2] + add r2d, r3d + movzx r3d, byte [r0+r1*1-1] + add r2d, r3d + movzx r3d, byte [r0+r1*2-1] + add r2d, r3d + movd m3, r2d ; s3 + + punpcklwd m0, m1 + mov r0, r4 + punpcklwd m2, m3 + punpckldq m0, m2 ; s0, s1, s2, s3 + pshufw m3, m0, 11110110b ; s2, s1, s3, s3 + lea r2, [r0+r1*2] + pshufw m0, m0, 01110100b ; s0, s1, s3, s1 + paddw m0, m3 + lea r3, [r2+r1*2] + psrlw m0, 2 + pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 + lea r4, [r3+r1*2] + packuswb m0, m0 + punpcklbw m0, m0 + movq m1, m0 + punpcklbw m0, m0 + punpckhbw m1, m1 + movq [r0+r1*1], m0 + movq [r0+r1*2], m0 + movq [r2+r1*1], m0 + movq [r2+r1*2], m0 + movq [r3+r1*1], m1 + movq [r3+r1*2], m1 + movq [r4+r1*1], m1 + movq [r4+r1*2], m1 + RET + +;----------------------------------------------------------------------------- +; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_MMX mmxext +cglobal pred8x8_dc_rv40_8, 2,7 + mov r4, r0 + sub r0, r1 + pxor mm0, mm0 + psadbw mm0, [r0] + dec r0 + movzx r5d, byte [r0+r1*1] + movd r6d, mm0 + lea r0, [r0+r1*2] +%rep 3 + movzx r2d, byte [r0+r1*0] + movzx r3d, byte [r0+r1*1] + add r5d, r2d + add r6d, r3d + lea r0, [r0+r1*2] +%endrep + movzx r2d, byte [r0+r1*0] + add r5d, r6d + lea r2d, [r2+r5+8] + shr r2d, 4 + movd mm0, r2d + punpcklbw mm0, mm0 + pshufw mm0, mm0, 0 + mov r3d, 4 +.loop: + movq [r4+r1*0], mm0 + movq [r4+r1*1], mm0 + lea r4, [r4+r1*2] + dec r3d + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_XMM sse2 +cglobal pred8x8_tm_vp8_8, 2,6,4 + sub r0, r1 + pxor xmm1, xmm1 + movq xmm0, [r0] + punpcklbw xmm0, xmm1 + movzx r4d, byte [r0-1] + mov r5d, 4 +.loop: + movzx r2d, byte [r0+r1*1-1] + movzx r3d, byte [r0+r1*2-1] + sub r2d, r4d + sub r3d, r4d + movd xmm2, r2d + movd xmm3, r3d + pshuflw xmm2, xmm2, 0 + pshuflw xmm3, xmm3, 0 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + paddw xmm2, xmm0 + paddw xmm3, xmm0 + packuswb xmm2, xmm3 + movq [r0+r1*1], xmm2 + movhps [r0+r1*2], xmm2 + lea r0, [r0+r1*2] + dec r5d + jg .loop + RET + +INIT_XMM ssse3 +cglobal pred8x8_tm_vp8_8, 2,3,6 + sub r0, r1 + movdqa xmm4, [tm_shuf] + pxor xmm1, xmm1 + movq xmm0, [r0] + punpcklbw xmm0, xmm1 + movd xmm5, [r0-4] + pshufb xmm5, xmm4 + mov r2d, 4 +.loop: + movd xmm2, [r0+r1*1-4] + movd xmm3, [r0+r1*2-4] + pshufb xmm2, xmm4 + pshufb xmm3, xmm4 + psubw xmm2, xmm5 + psubw xmm3, xmm5 + paddw xmm2, xmm0 + paddw xmm3, xmm0 + packuswb xmm2, xmm3 + movq [r0+r1*1], xmm2 + movhps [r0+r1*2], xmm2 + lea r0, [r0+r1*2] + dec r2d + jg .loop + RET + +; dest, left, right, src, tmp +; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 +%macro PRED4x4_LOWPASS 5 + mova %5, %2 + pavgb %2, %3 + pxor %3, %5 + mova %1, %4 + pand %3, [pb_1] + psubusb %2, %3 + pavgb %1, %2 +%endmacro + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_TOP_DC 0 +cglobal pred8x8l_top_dc_8, 4,4 + sub r0, r3 + pxor mm7, mm7 + movq mm0, [r0-8] + movq mm3, [r0] + movq mm1, [r0+8] + movq mm2, mm3 + movq mm4, mm3 + PALIGNR mm2, mm0, 7, mm0 + PALIGNR mm1, mm4, 1, mm4 + test r1d, r1d ; top_left + jz .fix_lt_2 + test r2d, r2d ; top_right + jz .fix_tr_1 + jmp .body +.fix_lt_2: + movq mm5, mm3 + pxor mm5, mm2 + psllq mm5, 56 + psrlq mm5, 56 + pxor mm2, mm5 + test r2d, r2d ; top_right + jnz .body +.fix_tr_1: + movq mm5, mm3 + pxor mm5, mm1 + psrlq mm5, 56 + psllq mm5, 56 + pxor mm1, mm5 +.body: + PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 + psadbw mm7, mm0 + paddw mm7, [pw_4] + psrlw mm7, 3 + pshufw mm7, mm7, 0 + packuswb mm7, mm7 +%rep 3 + movq [r0+r3*1], mm7 + movq [r0+r3*2], mm7 + lea r0, [r0+r3*2] +%endrep + movq [r0+r3*1], mm7 + movq [r0+r3*2], mm7 + RET +%endmacro + +INIT_MMX mmxext +PRED8x8L_TOP_DC +INIT_MMX ssse3 +PRED8x8L_TOP_DC + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8L_DC 0 +cglobal pred8x8l_dc_8, 4,5 + sub r0, r3 + lea r4, [r0+r3*2] + movq mm0, [r0+r3*1-8] + punpckhbw mm0, [r0+r3*0-8] + movq mm1, [r4+r3*1-8] + punpckhbw mm1, [r0+r3*2-8] + mov r4, r0 + punpckhwd mm1, mm0 + lea r0, [r0+r3*4] + movq mm2, [r0+r3*1-8] + punpckhbw mm2, [r0+r3*0-8] + lea r0, [r0+r3*2] + movq mm3, [r0+r3*1-8] + punpckhbw mm3, [r0+r3*0-8] + punpckhwd mm3, mm2 + punpckhdq mm3, mm1 + lea r0, [r0+r3*2] + movq mm0, [r0+r3*0-8] + movq mm1, [r4] + mov r0, r4 + movq mm4, mm3 + movq mm2, mm3 + PALIGNR mm4, mm0, 7, mm0 + PALIGNR mm1, mm2, 1, mm2 + test r1d, r1d + jnz .do_left +.fix_lt_1: + movq mm5, mm3 + pxor mm5, mm4 + psrlq mm5, 56 + psllq mm5, 48 + pxor mm1, mm5 + jmp .do_left +.fix_lt_2: + movq mm5, mm3 + pxor mm5, mm2 + psllq mm5, 56 + psrlq mm5, 56 + pxor mm2, mm5 + test r2d, r2d + jnz .body +.fix_tr_1: + movq mm5, mm3 + pxor mm5, mm1 + psrlq mm5, 56 + psllq mm5, 56 + pxor mm1, mm5 + jmp .body +.do_left: + movq mm0, mm4 + PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 + movq mm4, mm0 + movq mm7, mm2 + PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 + psllq mm1, 56 + PALIGNR mm7, mm1, 7, mm3 + movq mm0, [r0-8] + movq mm3, [r0] + movq mm1, [r0+8] + movq mm2, mm3 + movq mm4, mm3 + PALIGNR mm2, mm0, 7, mm0 + PALIGNR mm1, mm4, 1, mm4 + test r1d, r1d + jz .fix_lt_2 + test r2d, r2d + jz .fix_tr_1 +.body: + lea r1, [r0+r3*2] + PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 + pxor mm0, mm0 + pxor mm1, mm1 + lea r2, [r1+r3*2] + psadbw mm0, mm7 + psadbw mm1, mm6 + paddw mm0, [pw_8] + paddw mm0, mm1 + lea r4, [r2+r3*2] + psrlw mm0, 4 + pshufw mm0, mm0, 0 + packuswb mm0, mm0 + movq [r0+r3*1], mm0 + movq [r0+r3*2], mm0 + movq [r1+r3*1], mm0 + movq [r1+r3*2], mm0 + movq [r2+r3*1], mm0 + movq [r2+r3*2], mm0 + movq [r4+r3*1], mm0 + movq [r4+r3*2], mm0 + RET +%endmacro + +INIT_MMX mmxext +PRED8x8L_DC +INIT_MMX ssse3 +PRED8x8L_DC + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8L_HORIZONTAL 0 +cglobal pred8x8l_horizontal_8, 4,4 + sub r0, r3 + lea r2, [r0+r3*2] + movq mm0, [r0+r3*1-8] + test r1d, r1d + lea r1, [r0+r3] + cmovnz r1, r0 + punpckhbw mm0, [r1+r3*0-8] + movq mm1, [r2+r3*1-8] + punpckhbw mm1, [r0+r3*2-8] + mov r2, r0 + punpckhwd mm1, mm0 + lea r0, [r0+r3*4] + movq mm2, [r0+r3*1-8] + punpckhbw mm2, [r0+r3*0-8] + lea r0, [r0+r3*2] + movq mm3, [r0+r3*1-8] + punpckhbw mm3, [r0+r3*0-8] + punpckhwd mm3, mm2 + punpckhdq mm3, mm1 + lea r0, [r0+r3*2] + movq mm0, [r0+r3*0-8] + movq mm1, [r1+r3*0-8] + mov r0, r2 + movq mm4, mm3 + movq mm2, mm3 + PALIGNR mm4, mm0, 7, mm0 + PALIGNR mm1, mm2, 1, mm2 + movq mm0, mm4 + PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 + movq mm4, mm0 + movq mm7, mm2 + PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 + psllq mm1, 56 + PALIGNR mm7, mm1, 7, mm3 + movq mm3, mm7 + lea r1, [r0+r3*2] + movq mm7, mm3 + punpckhbw mm3, mm3 + punpcklbw mm7, mm7 + pshufw mm0, mm3, 0xff + pshufw mm1, mm3, 0xaa + lea r2, [r1+r3*2] + pshufw mm2, mm3, 0x55 + pshufw mm3, mm3, 0x00 + pshufw mm4, mm7, 0xff + pshufw mm5, mm7, 0xaa + pshufw mm6, mm7, 0x55 + pshufw mm7, mm7, 0x00 + movq [r0+r3*1], mm0 + movq [r0+r3*2], mm1 + movq [r1+r3*1], mm2 + movq [r1+r3*2], mm3 + movq [r2+r3*1], mm4 + movq [r2+r3*2], mm5 + lea r0, [r2+r3*2] + movq [r0+r3*1], mm6 + movq [r0+r3*2], mm7 + RET +%endmacro + +INIT_MMX mmxext +PRED8x8L_HORIZONTAL +INIT_MMX ssse3 +PRED8x8L_HORIZONTAL + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8L_VERTICAL 0 +cglobal pred8x8l_vertical_8, 4,4 + sub r0, r3 + movq mm0, [r0-8] + movq mm3, [r0] + movq mm1, [r0+8] + movq mm2, mm3 + movq mm4, mm3 + PALIGNR mm2, mm0, 7, mm0 + PALIGNR mm1, mm4, 1, mm4 + test r1d, r1d ; top_left + jz .fix_lt_2 + test r2d, r2d ; top_right + jz .fix_tr_1 + jmp .body +.fix_lt_2: + movq mm5, mm3 + pxor mm5, mm2 + psllq mm5, 56 + psrlq mm5, 56 + pxor mm2, mm5 + test r2d, r2d ; top_right + jnz .body +.fix_tr_1: + movq mm5, mm3 + pxor mm5, mm1 + psrlq mm5, 56 + psllq mm5, 56 + pxor mm1, mm5 +.body: + PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 +%rep 3 + movq [r0+r3*1], mm0 + movq [r0+r3*2], mm0 + lea r0, [r0+r3*2] +%endrep + movq [r0+r3*1], mm0 + movq [r0+r3*2], mm0 + RET +%endmacro + +INIT_MMX mmxext +PRED8x8L_VERTICAL +INIT_MMX ssse3 +PRED8x8L_VERTICAL + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8L_DOWN_LEFT 0 +cglobal pred8x8l_down_left_8, 4,4 + sub r0, r3 + movq mm0, [r0-8] + movq mm3, [r0] + movq mm1, [r0+8] + movq mm2, mm3 + movq mm4, mm3 + PALIGNR mm2, mm0, 7, mm0 + PALIGNR mm1, mm4, 1, mm4 + test r1d, r1d ; top_left + jz .fix_lt_2 + test r2d, r2d ; top_right + jz .fix_tr_1 + jmp .do_top +.fix_lt_2: + movq mm5, mm3 + pxor mm5, mm2 + psllq mm5, 56 + psrlq mm5, 56 + pxor mm2, mm5 + test r2d, r2d ; top_right + jnz .do_top +.fix_tr_1: + movq mm5, mm3 + pxor mm5, mm1 + psrlq mm5, 56 + psllq mm5, 56 + pxor mm1, mm5 + jmp .do_top +.fix_tr_2: + punpckhbw mm3, mm3 + pshufw mm1, mm3, 0xFF + jmp .do_topright +.do_top: + PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 + movq2dq xmm3, mm4 + test r2d, r2d ; top_right + jz .fix_tr_2 + movq mm0, [r0+8] + movq mm5, mm0 + movq mm2, mm0 + movq mm4, mm0 + psrlq mm5, 56 + PALIGNR mm2, mm3, 7, mm3 + PALIGNR mm5, mm4, 1, mm4 + PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 +.do_topright: + movq2dq xmm4, mm1 + psrlq mm1, 56 + movq2dq xmm5, mm1 + lea r1, [r0+r3*2] + pslldq xmm4, 8 + por xmm3, xmm4 + movdqa xmm2, xmm3 + psrldq xmm2, 1 + pslldq xmm5, 15 + por xmm2, xmm5 + lea r2, [r1+r3*2] + movdqa xmm1, xmm3 + pslldq xmm1, 1 +INIT_XMM cpuname + PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 + psrldq xmm0, 1 + movq [r0+r3*1], xmm0 + psrldq xmm0, 1 + movq [r0+r3*2], xmm0 + psrldq xmm0, 1 + lea r0, [r2+r3*2] + movq [r1+r3*1], xmm0 + psrldq xmm0, 1 + movq [r1+r3*2], xmm0 + psrldq xmm0, 1 + movq [r2+r3*1], xmm0 + psrldq xmm0, 1 + movq [r2+r3*2], xmm0 + psrldq xmm0, 1 + movq [r0+r3*1], xmm0 + psrldq xmm0, 1 + movq [r0+r3*2], xmm0 + RET +%endmacro + +INIT_MMX sse2 +PRED8x8L_DOWN_LEFT +INIT_MMX ssse3 +PRED8x8L_DOWN_LEFT + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_down_right_8(uint8_t *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8L_DOWN_RIGHT 0 +cglobal pred8x8l_down_right_8, 4,5 + sub r0, r3 + lea r4, [r0+r3*2] + movq mm0, [r0+r3*1-8] + punpckhbw mm0, [r0+r3*0-8] + movq mm1, [r4+r3*1-8] + punpckhbw mm1, [r0+r3*2-8] + mov r4, r0 + punpckhwd mm1, mm0 + lea r0, [r0+r3*4] + movq mm2, [r0+r3*1-8] + punpckhbw mm2, [r0+r3*0-8] + lea r0, [r0+r3*2] + movq mm3, [r0+r3*1-8] + punpckhbw mm3, [r0+r3*0-8] + punpckhwd mm3, mm2 + punpckhdq mm3, mm1 + lea r0, [r0+r3*2] + movq mm0, [r0+r3*0-8] + movq mm1, [r4] + mov r0, r4 + movq mm4, mm3 + movq mm2, mm3 + PALIGNR mm4, mm0, 7, mm0 + PALIGNR mm1, mm2, 1, mm2 + test r1d, r1d + jz .fix_lt_1 + jmp .do_left +.fix_lt_1: + movq mm5, mm3 + pxor mm5, mm4 + psrlq mm5, 56 + psllq mm5, 48 + pxor mm1, mm5 + jmp .do_left +.fix_lt_2: + movq mm5, mm3 + pxor mm5, mm2 + psllq mm5, 56 + psrlq mm5, 56 + pxor mm2, mm5 + test r2d, r2d + jnz .do_top +.fix_tr_1: + movq mm5, mm3 + pxor mm5, mm1 + psrlq mm5, 56 + psllq mm5, 56 + pxor mm1, mm5 + jmp .do_top +.do_left: + movq mm0, mm4 + PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 + movq mm4, mm0 + movq mm7, mm2 + movq2dq xmm3, mm2 + PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 + psllq mm1, 56 + PALIGNR mm7, mm1, 7, mm3 + movq2dq xmm1, mm7 + movq mm0, [r0-8] + movq mm3, [r0] + movq mm1, [r0+8] + movq mm2, mm3 + movq mm4, mm3 + PALIGNR mm2, mm0, 7, mm0 + PALIGNR mm1, mm4, 1, mm4 + test r1d, r1d + jz .fix_lt_2 + test r2d, r2d + jz .fix_tr_1 +.do_top: + PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 + movq2dq xmm4, mm4 + lea r1, [r0+r3*2] + movdqa xmm0, xmm3 + pslldq xmm4, 8 + por xmm3, xmm4 + lea r2, [r1+r3*2] + pslldq xmm4, 1 + por xmm1, xmm4 + psrldq xmm0, 7 + pslldq xmm0, 15 + psrldq xmm0, 7 + por xmm1, xmm0 + lea r0, [r2+r3*2] + movdqa xmm2, xmm3 + psrldq xmm2, 1 +INIT_XMM cpuname + PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 + movdqa xmm1, xmm0 + psrldq xmm1, 1 + movq [r0+r3*2], xmm0 + movq [r0+r3*1], xmm1 + psrldq xmm0, 2 + psrldq xmm1, 2 + movq [r2+r3*2], xmm0 + movq [r2+r3*1], xmm1 + psrldq xmm0, 2 + psrldq xmm1, 2 + movq [r1+r3*2], xmm0 + movq [r1+r3*1], xmm1 + psrldq xmm0, 2 + psrldq xmm1, 2 + movq [r4+r3*2], xmm0 + movq [r4+r3*1], xmm1 + RET +%endmacro + +INIT_MMX sse2 +PRED8x8L_DOWN_RIGHT +INIT_MMX ssse3 +PRED8x8L_DOWN_RIGHT + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8L_VERTICAL_RIGHT 0 +cglobal pred8x8l_vertical_right_8, 4,5,7 + ; manually spill XMM registers for Win64 because + ; the code here is initialized with INIT_MMX + WIN64_SPILL_XMM 7 + sub r0, r3 + lea r4, [r0+r3*2] + movq mm0, [r0+r3*1-8] + punpckhbw mm0, [r0+r3*0-8] + movq mm1, [r4+r3*1-8] + punpckhbw mm1, [r0+r3*2-8] + mov r4, r0 + punpckhwd mm1, mm0 + lea r0, [r0+r3*4] + movq mm2, [r0+r3*1-8] + punpckhbw mm2, [r0+r3*0-8] + lea r0, [r0+r3*2] + movq mm3, [r0+r3*1-8] + punpckhbw mm3, [r0+r3*0-8] + punpckhwd mm3, mm2 + punpckhdq mm3, mm1 + lea r0, [r0+r3*2] + movq mm0, [r0+r3*0-8] + movq mm1, [r4] + mov r0, r4 + movq mm4, mm3 + movq mm2, mm3 + PALIGNR mm4, mm0, 7, mm0 + PALIGNR mm1, mm2, 1, mm2 + test r1d, r1d + jnz .do_left +.fix_lt_1: + movq mm5, mm3 + pxor mm5, mm4 + psrlq mm5, 56 + psllq mm5, 48 + pxor mm1, mm5 + jmp .do_left +.fix_lt_2: + movq mm5, mm3 + pxor mm5, mm2 + psllq mm5, 56 + psrlq mm5, 56 + pxor mm2, mm5 + test r2d, r2d + jnz .do_top +.fix_tr_1: + movq mm5, mm3 + pxor mm5, mm1 + psrlq mm5, 56 + psllq mm5, 56 + pxor mm1, mm5 + jmp .do_top +.do_left: + movq mm0, mm4 + PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 + movq2dq xmm0, mm2 + movq mm0, [r0-8] + movq mm3, [r0] + movq mm1, [r0+8] + movq mm2, mm3 + movq mm4, mm3 + PALIGNR mm2, mm0, 7, mm0 + PALIGNR mm1, mm4, 1, mm4 + test r1d, r1d + jz .fix_lt_2 + test r2d, r2d + jz .fix_tr_1 +.do_top: + PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 + lea r1, [r0+r3*2] + movq2dq xmm4, mm6 + pslldq xmm4, 8 + por xmm0, xmm4 + movdqa xmm6, [pw_ff00] + movdqa xmm1, xmm0 + lea r2, [r1+r3*2] + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + pslldq xmm0, 1 + pslldq xmm1, 2 + pavgb xmm2, xmm0 +INIT_XMM cpuname + PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5 + pandn xmm6, xmm4 + movdqa xmm5, xmm4 + psrlw xmm4, 8 + packuswb xmm6, xmm4 + movhlps xmm4, xmm6 + movhps [r0+r3*2], xmm5 + movhps [r0+r3*1], xmm2 + psrldq xmm5, 4 + movss xmm5, xmm6 + psrldq xmm2, 4 + movss xmm2, xmm4 + lea r0, [r2+r3*2] + psrldq xmm5, 1 + psrldq xmm2, 1 + movq [r0+r3*2], xmm5 + movq [r0+r3*1], xmm2 + psrldq xmm5, 1 + psrldq xmm2, 1 + movq [r2+r3*2], xmm5 + movq [r2+r3*1], xmm2 + psrldq xmm5, 1 + psrldq xmm2, 1 + movq [r1+r3*2], xmm5 + movq [r1+r3*1], xmm2 + RET +%endmacro + +INIT_MMX sse2 +PRED8x8L_VERTICAL_RIGHT +INIT_MMX ssse3 +PRED8x8L_VERTICAL_RIGHT + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8L_VERTICAL_LEFT 0 +cglobal pred8x8l_vertical_left_8, 4,4 + sub r0, r3 + movq mm0, [r0-8] + movq mm3, [r0] + movq mm1, [r0+8] + movq mm2, mm3 + movq mm4, mm3 + PALIGNR mm2, mm0, 7, mm0 + PALIGNR mm1, mm4, 1, mm4 + test r1d, r1d + jz .fix_lt_2 + test r2d, r2d + jz .fix_tr_1 + jmp .do_top +.fix_lt_2: + movq mm5, mm3 + pxor mm5, mm2 + psllq mm5, 56 + psrlq mm5, 56 + pxor mm2, mm5 + test r2d, r2d + jnz .do_top +.fix_tr_1: + movq mm5, mm3 + pxor mm5, mm1 + psrlq mm5, 56 + psllq mm5, 56 + pxor mm1, mm5 + jmp .do_top +.fix_tr_2: + punpckhbw mm3, mm3 + pshufw mm1, mm3, 0xFF + jmp .do_topright +.do_top: + PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 + movq2dq xmm4, mm4 + test r2d, r2d + jz .fix_tr_2 + movq mm0, [r0+8] + movq mm5, mm0 + movq mm2, mm0 + movq mm4, mm0 + psrlq mm5, 56 + PALIGNR mm2, mm3, 7, mm3 + PALIGNR mm5, mm4, 1, mm4 + PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 +.do_topright: + movq2dq xmm3, mm1 + lea r1, [r0+r3*2] + pslldq xmm3, 8 + por xmm4, xmm3 + movdqa xmm2, xmm4 + movdqa xmm1, xmm4 + movdqa xmm3, xmm4 + psrldq xmm2, 1 + pslldq xmm1, 1 + pavgb xmm3, xmm2 + lea r2, [r1+r3*2] +INIT_XMM cpuname + PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5 + psrldq xmm0, 1 + movq [r0+r3*1], xmm3 + movq [r0+r3*2], xmm0 + lea r0, [r2+r3*2] + psrldq xmm3, 1 + psrldq xmm0, 1 + movq [r1+r3*1], xmm3 + movq [r1+r3*2], xmm0 + psrldq xmm3, 1 + psrldq xmm0, 1 + movq [r2+r3*1], xmm3 + movq [r2+r3*2], xmm0 + psrldq xmm3, 1 + psrldq xmm0, 1 + movq [r0+r3*1], xmm3 + movq [r0+r3*2], xmm0 + RET +%endmacro + +INIT_MMX sse2 +PRED8x8L_VERTICAL_LEFT +INIT_MMX ssse3 +PRED8x8L_VERTICAL_LEFT + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8L_HORIZONTAL_UP 0 +cglobal pred8x8l_horizontal_up_8, 4,4 + sub r0, r3 + lea r2, [r0+r3*2] + movq mm0, [r0+r3*1-8] + test r1d, r1d + lea r1, [r0+r3] + cmovnz r1, r0 + punpckhbw mm0, [r1+r3*0-8] + movq mm1, [r2+r3*1-8] + punpckhbw mm1, [r0+r3*2-8] + mov r2, r0 + punpckhwd mm1, mm0 + lea r0, [r0+r3*4] + movq mm2, [r0+r3*1-8] + punpckhbw mm2, [r0+r3*0-8] + lea r0, [r0+r3*2] + movq mm3, [r0+r3*1-8] + punpckhbw mm3, [r0+r3*0-8] + punpckhwd mm3, mm2 + punpckhdq mm3, mm1 + lea r0, [r0+r3*2] + movq mm0, [r0+r3*0-8] + movq mm1, [r1+r3*0-8] + mov r0, r2 + movq mm4, mm3 + movq mm2, mm3 + PALIGNR mm4, mm0, 7, mm0 + PALIGNR mm1, mm2, 1, mm2 + movq mm0, mm4 + PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 + movq mm4, mm0 + movq mm7, mm2 + PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 + psllq mm1, 56 + PALIGNR mm7, mm1, 7, mm3 + lea r1, [r0+r3*2] + pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 + psllq mm7, 56 ; l7 .. .. .. .. .. .. .. + movq mm2, mm0 + psllw mm0, 8 + psrlw mm2, 8 + por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 + movq mm3, mm2 + movq mm4, mm2 + movq mm5, mm2 + psrlq mm2, 8 + psrlq mm3, 16 + lea r2, [r1+r3*2] + por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1 + punpckhbw mm7, mm7 + por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2 + pavgb mm4, mm2 + PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6 + movq mm5, mm4 + punpcklbw mm4, mm1 ; p4 p3 p2 p1 + punpckhbw mm5, mm1 ; p8 p7 p6 p5 + movq mm6, mm5 + movq mm7, mm5 + movq mm0, mm5 + PALIGNR mm5, mm4, 2, mm1 + pshufw mm1, mm6, 11111001b + PALIGNR mm6, mm4, 4, mm2 + pshufw mm2, mm7, 11111110b + PALIGNR mm7, mm4, 6, mm3 + pshufw mm3, mm0, 11111111b + movq [r0+r3*1], mm4 + movq [r0+r3*2], mm5 + lea r0, [r2+r3*2] + movq [r1+r3*1], mm6 + movq [r1+r3*2], mm7 + movq [r2+r3*1], mm0 + movq [r2+r3*2], mm1 + movq [r0+r3*1], mm2 + movq [r0+r3*2], mm3 + RET +%endmacro + +INIT_MMX mmxext +PRED8x8L_HORIZONTAL_UP +INIT_MMX ssse3 +PRED8x8L_HORIZONTAL_UP + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED8x8L_HORIZONTAL_DOWN 0 +cglobal pred8x8l_horizontal_down_8, 4,5 + sub r0, r3 + lea r4, [r0+r3*2] + movq mm0, [r0+r3*1-8] + punpckhbw mm0, [r0+r3*0-8] + movq mm1, [r4+r3*1-8] + punpckhbw mm1, [r0+r3*2-8] + mov r4, r0 + punpckhwd mm1, mm0 + lea r0, [r0+r3*4] + movq mm2, [r0+r3*1-8] + punpckhbw mm2, [r0+r3*0-8] + lea r0, [r0+r3*2] + movq mm3, [r0+r3*1-8] + punpckhbw mm3, [r0+r3*0-8] + punpckhwd mm3, mm2 + punpckhdq mm3, mm1 + lea r0, [r0+r3*2] + movq mm0, [r0+r3*0-8] + movq mm1, [r4] + mov r0, r4 + movq mm4, mm3 + movq mm2, mm3 + PALIGNR mm4, mm0, 7, mm0 + PALIGNR mm1, mm2, 1, mm2 + test r1d, r1d + jnz .do_left +.fix_lt_1: + movq mm5, mm3 + pxor mm5, mm4 + psrlq mm5, 56 + psllq mm5, 48 + pxor mm1, mm5 + jmp .do_left +.fix_lt_2: + movq mm5, mm3 + pxor mm5, mm2 + psllq mm5, 56 + psrlq mm5, 56 + pxor mm2, mm5 + test r2d, r2d + jnz .do_top +.fix_tr_1: + movq mm5, mm3 + pxor mm5, mm1 + psrlq mm5, 56 + psllq mm5, 56 + pxor mm1, mm5 + jmp .do_top +.fix_tr_2: + punpckhbw mm3, mm3 + pshufw mm1, mm3, 0xFF + jmp .do_topright +.do_left: + movq mm0, mm4 + PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 + movq2dq xmm0, mm2 + pslldq xmm0, 8 + movq mm4, mm0 + PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 + movq2dq xmm2, mm1 + pslldq xmm2, 15 + psrldq xmm2, 8 + por xmm0, xmm2 + movq mm0, [r0-8] + movq mm3, [r0] + movq mm1, [r0+8] + movq mm2, mm3 + movq mm4, mm3 + PALIGNR mm2, mm0, 7, mm0 + PALIGNR mm1, mm4, 1, mm4 + test r1d, r1d + jz .fix_lt_2 + test r2d, r2d + jz .fix_tr_1 +.do_top: + PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 + movq2dq xmm1, mm4 + test r2d, r2d + jz .fix_tr_2 + movq mm0, [r0+8] + movq mm5, mm0 + movq mm2, mm0 + movq mm4, mm0 + psrlq mm5, 56 + PALIGNR mm2, mm3, 7, mm3 + PALIGNR mm5, mm4, 1, mm4 + PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 +.do_topright: + movq2dq xmm5, mm1 + pslldq xmm5, 8 + por xmm1, xmm5 +INIT_XMM cpuname + lea r2, [r4+r3*2] + movdqa xmm2, xmm1 + movdqa xmm3, xmm1 + PALIGNR xmm1, xmm0, 7, xmm4 + PALIGNR xmm2, xmm0, 9, xmm5 + lea r1, [r2+r3*2] + PALIGNR xmm3, xmm0, 8, xmm0 + movdqa xmm4, xmm1 + pavgb xmm4, xmm3 + lea r0, [r1+r3*2] + PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5 + punpcklbw xmm4, xmm0 + movhlps xmm0, xmm4 + movq [r0+r3*2], xmm4 + movq [r2+r3*2], xmm0 + psrldq xmm4, 2 + psrldq xmm0, 2 + movq [r0+r3*1], xmm4 + movq [r2+r3*1], xmm0 + psrldq xmm4, 2 + psrldq xmm0, 2 + movq [r1+r3*2], xmm4 + movq [r4+r3*2], xmm0 + psrldq xmm4, 2 + psrldq xmm0, 2 + movq [r1+r3*1], xmm4 + movq [r4+r3*1], xmm0 + RET +%endmacro + +INIT_MMX sse2 +PRED8x8L_HORIZONTAL_DOWN +INIT_MMX ssse3 +PRED8x8L_HORIZONTAL_DOWN + +;------------------------------------------------------------------------------- +; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, +; ptrdiff_t stride) +;------------------------------------------------------------------------------- + +INIT_MMX mmxext +cglobal pred4x4_dc_8, 3,5 + pxor mm7, mm7 + mov r4, r0 + sub r0, r2 + movd mm0, [r0] + psadbw mm0, mm7 + movzx r1d, byte [r0+r2*1-1] + movd r3d, mm0 + add r3d, r1d + movzx r1d, byte [r0+r2*2-1] + lea r0, [r0+r2*2] + add r3d, r1d + movzx r1d, byte [r0+r2*1-1] + add r3d, r1d + movzx r1d, byte [r0+r2*2-1] + add r3d, r1d + add r3d, 4 + shr r3d, 3 + imul r3d, 0x01010101 + mov [r4+r2*0], r3d + mov [r0+r2*0], r3d + mov [r0+r2*1], r3d + mov [r0+r2*2], r3d + RET + +;----------------------------------------------------------------------------- +; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_MMX mmxext +cglobal pred4x4_tm_vp8_8, 3,6 + sub r0, r2 + pxor mm7, mm7 + movd mm0, [r0] + punpcklbw mm0, mm7 + movzx r4d, byte [r0-1] + mov r5d, 2 +.loop: + movzx r1d, byte [r0+r2*1-1] + movzx r3d, byte [r0+r2*2-1] + sub r1d, r4d + sub r3d, r4d + movd mm2, r1d + movd mm4, r3d + pshufw mm2, mm2, 0 + pshufw mm4, mm4, 0 + paddw mm2, mm0 + paddw mm4, mm0 + packuswb mm2, mm2 + packuswb mm4, mm4 + movd [r0+r2*1], mm2 + movd [r0+r2*2], mm4 + lea r0, [r0+r2*2] + dec r5d + jg .loop + RET + +INIT_XMM ssse3 +cglobal pred4x4_tm_vp8_8, 3,3 + sub r0, r2 + movq mm6, [tm_shuf] + pxor mm1, mm1 + movd mm0, [r0] + punpcklbw mm0, mm1 + movd mm7, [r0-4] + pshufb mm7, mm6 + lea r1, [r0+r2*2] + movd mm2, [r0+r2*1-4] + movd mm3, [r0+r2*2-4] + movd mm4, [r1+r2*1-4] + movd mm5, [r1+r2*2-4] + pshufb mm2, mm6 + pshufb mm3, mm6 + pshufb mm4, mm6 + pshufb mm5, mm6 + psubw mm0, mm7 + paddw mm2, mm0 + paddw mm3, mm0 + paddw mm4, mm0 + paddw mm5, mm0 + packuswb mm2, mm2 + packuswb mm3, mm3 + packuswb mm4, mm4 + packuswb mm5, mm5 + movd [r0+r2*1], mm2 + movd [r0+r2*2], mm3 + movd [r1+r2*1], mm4 + movd [r1+r2*2], mm5 + RET + +;----------------------------------------------------------------------------- +; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_MMX mmxext +cglobal pred4x4_vertical_vp8_8, 3,3 + sub r0, r2 + movd m1, [r0-1] + movd m0, [r0] + mova m2, m0 ;t0 t1 t2 t3 + punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7 + lea r1, [r0+r2*2] + psrlq m0, 8 ;t1 t2 t3 t4 + PRED4x4_LOWPASS m3, m1, m0, m2, m4 + movd [r0+r2*1], m3 + movd [r0+r2*2], m3 + movd [r1+r2*1], m3 + movd [r1+r2*2], m3 + RET + +;----------------------------------------------------------------------------- +; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_MMX mmxext +cglobal pred4x4_down_left_8, 3,3 + sub r0, r2 + movq m1, [r0] + punpckldq m1, [r1] + movq m2, m1 + movq m3, m1 + psllq m1, 8 + pxor m2, m1 + psrlq m2, 8 + pxor m2, m3 + PRED4x4_LOWPASS m0, m1, m2, m3, m4 + lea r1, [r0+r2*2] + psrlq m0, 8 + movd [r0+r2*1], m0 + psrlq m0, 8 + movd [r0+r2*2], m0 + psrlq m0, 8 + movd [r1+r2*1], m0 + psrlq m0, 8 + movd [r1+r2*2], m0 + RET + +;------------------------------------------------------------------------------ +; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, +; ptrdiff_t stride) +;------------------------------------------------------------------------------ + +INIT_MMX mmxext +cglobal pred4x4_vertical_left_8, 3,3 + sub r0, r2 + movq m1, [r0] + punpckldq m1, [r1] + movq m3, m1 + movq m2, m1 + psrlq m3, 8 + psrlq m2, 16 + movq m4, m3 + pavgb m4, m1 + PRED4x4_LOWPASS m0, m1, m2, m3, m5 + lea r1, [r0+r2*2] + movh [r0+r2*1], m4 + movh [r0+r2*2], m0 + psrlq m4, 8 + psrlq m0, 8 + movh [r1+r2*1], m4 + movh [r1+r2*2], m0 + RET + +;------------------------------------------------------------------------------ +; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, +; ptrdiff_t stride) +;------------------------------------------------------------------------------ + +INIT_MMX mmxext +cglobal pred4x4_horizontal_up_8, 3,3 + sub r0, r2 + lea r1, [r0+r2*2] + movd m0, [r0+r2*1-4] + punpcklbw m0, [r0+r2*2-4] + movd m1, [r1+r2*1-4] + punpcklbw m1, [r1+r2*2-4] + punpckhwd m0, m1 + movq m1, m0 + punpckhbw m1, m1 + pshufw m1, m1, 0xFF + punpckhdq m0, m1 + movq m2, m0 + movq m3, m0 + movq m7, m0 + psrlq m2, 16 + psrlq m3, 8 + pavgb m7, m3 + PRED4x4_LOWPASS m4, m0, m2, m3, m5 + punpcklbw m7, m4 + movd [r0+r2*1], m7 + psrlq m7, 16 + movd [r0+r2*2], m7 + psrlq m7, 16 + movd [r1+r2*1], m7 + movd [r1+r2*2], m1 + RET + +;------------------------------------------------------------------------------ +; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src, +; const uint8_t *topright, +; ptrdiff_t stride) +;------------------------------------------------------------------------------ + +INIT_MMX mmxext +cglobal pred4x4_horizontal_down_8, 3,3 + sub r0, r2 + lea r1, [r0+r2*2] + movh m0, [r0-4] ; lt .. + punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. .. + psllq m0, 8 ; t2 t1 t0 lt .. .. .. .. + movd m1, [r1+r2*2-4] ; l3 + punpcklbw m1, [r1+r2*1-4] ; l2 l3 + movd m2, [r0+r2*2-4] ; l1 + punpcklbw m2, [r0+r2*1-4] ; l0 l1 + punpckhwd m1, m2 ; l0 l1 l2 l3 + punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 + movq m0, m1 + movq m2, m1 + movq m5, m1 + psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1 + psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2 + pavgb m5, m2 + PRED4x4_LOWPASS m3, m1, m0, m2, m4 + punpcklbw m5, m3 + psrlq m3, 32 + PALIGNR m3, m5, 6, m4 + movh [r1+r2*2], m5 + psrlq m5, 16 + movh [r1+r2*1], m5 + psrlq m5, 16 + movh [r0+r2*2], m5 + movh [r0+r2*1], m3 + RET + +;----------------------------------------------------------------------------- +; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src, +; const uint8_t *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_MMX mmxext +cglobal pred4x4_vertical_right_8, 3,3 + sub r0, r2 + lea r1, [r0+r2*2] + movh m0, [r0] ; ........t3t2t1t0 + movq m5, m0 + PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt + pavgb m5, m0 + PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0 + movq m1, m0 + PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1 + movq m2, m0 + PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2 + PRED4x4_LOWPASS m3, m1, m0, m2, m4 + movq m1, m3 + psrlq m3, 16 + psllq m1, 48 + movh [r0+r2*1], m5 + movh [r0+r2*2], m3 + PALIGNR m5, m1, 7, m2 + psllq m1, 8 + movh [r1+r2*1], m5 + PALIGNR m3, m1, 7, m1 + movh [r1+r2*2], m3 + RET + +;----------------------------------------------------------------------------- +; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_MMX mmxext +cglobal pred4x4_down_right_8, 3,3 + sub r0, r2 + lea r1, [r0+r2*2] + movq m1, [r1-8] + movq m2, [r0+r2*1-8] + punpckhbw m2, [r0-8] + movh m3, [r0] + punpckhwd m1, m2 + PALIGNR m3, m1, 5, m1 + movq m1, m3 + PALIGNR m3, [r1+r2*1-8], 7, m4 + movq m2, m3 + PALIGNR m3, [r1+r2*2-8], 7, m4 + PRED4x4_LOWPASS m0, m3, m1, m2, m4 + movh [r1+r2*2], m0 + psrlq m0, 8 + movh [r1+r2*1], m0 + psrlq m0, 8 + movh [r0+r2*2], m0 + psrlq m0, 8 + movh [r0+r2*1], m0 + RET diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm b/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm new file mode 100644 index 0000000000..2f30807332 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm @@ -0,0 +1,1119 @@ +;***************************************************************************** +;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code +;***************************************************************************** +;* Copyright (C) 2005-2011 x264 project +;* +;* Authors: Daniel Kang <daniel.d.kang@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pw_1023 +%define pw_pixel_max pw_1023 +cextern pw_512 +cextern pw_16 +cextern pw_8 +cextern pw_4 +cextern pw_2 +cextern pw_1 +cextern pd_16 + +pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 +pw_m3: times 8 dw -3 +pd_17: times 4 dd 17 + +SECTION .text + +; dest, left, right, src +; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 +%macro PRED4x4_LOWPASS 4 + paddw %2, %3 + psrlw %2, 1 + pavgw %1, %4, %2 +%endmacro + +;----------------------------------------------------------------------------- +; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED4x4_DR 0 +cglobal pred4x4_down_right_10, 3, 3 + sub r0, r2 + lea r1, [r0+r2*2] + movhps m1, [r1-8] + movhps m2, [r0+r2*1-8] + movhps m4, [r0-8] + punpckhwd m2, m4 + movq m3, [r0] + punpckhdq m1, m2 + PALIGNR m3, m1, 10, m1 + movhps m4, [r1+r2*1-8] + PALIGNR m0, m3, m4, 14, m4 + movhps m4, [r1+r2*2-8] + PALIGNR m2, m0, m4, 14, m4 + PRED4x4_LOWPASS m0, m2, m3, m0 + movq [r1+r2*2], m0 + psrldq m0, 2 + movq [r1+r2*1], m0 + psrldq m0, 2 + movq [r0+r2*2], m0 + psrldq m0, 2 + movq [r0+r2*1], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED4x4_DR +INIT_XMM ssse3 +PRED4x4_DR +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED4x4_DR +%endif + +;------------------------------------------------------------------------------ +; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) +;------------------------------------------------------------------------------ +%macro PRED4x4_VR 0 +cglobal pred4x4_vertical_right_10, 3, 3, 6 + sub r0, r2 + lea r1, [r0+r2*2] + movq m5, [r0] ; ........t3t2t1t0 + movhps m1, [r0-8] + PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt + pavgw m5, m0 + movhps m1, [r0+r2*1-8] + PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0 + movhps m2, [r0+r2*2-8] + PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1 + movhps m3, [r1+r2*1-8] + PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2 + PRED4x4_LOWPASS m1, m0, m2, m1 + pslldq m0, m1, 12 + psrldq m1, 4 + movq [r0+r2*1], m5 + movq [r0+r2*2], m1 + PALIGNR m5, m0, 14, m2 + pslldq m0, 2 + movq [r1+r2*1], m5 + PALIGNR m1, m0, 14, m0 + movq [r1+r2*2], m1 + RET +%endmacro + +INIT_XMM sse2 +PRED4x4_VR +INIT_XMM ssse3 +PRED4x4_VR +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED4x4_VR +%endif + +;------------------------------------------------------------------------------- +; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) +;------------------------------------------------------------------------------- +%macro PRED4x4_HD 0 +cglobal pred4x4_horizontal_down_10, 3, 3 + sub r0, r2 + lea r1, [r0+r2*2] + movq m0, [r0-8] ; lt .. + movhps m0, [r0] + pslldq m0, 2 ; t2 t1 t0 lt .. .. .. .. + movq m1, [r1+r2*2-8] ; l3 + movq m3, [r1+r2*1-8] + punpcklwd m1, m3 ; l2 l3 + movq m2, [r0+r2*2-8] ; l1 + movq m3, [r0+r2*1-8] + punpcklwd m2, m3 ; l0 l1 + punpckhdq m1, m2 ; l0 l1 l2 l3 + punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 + psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1 + psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2 + pavgw m5, m1, m3 + PRED4x4_LOWPASS m3, m1, m0, m3 + punpcklwd m5, m3 + psrldq m3, 8 + PALIGNR m3, m5, 12, m4 + movq [r1+r2*2], m5 + movhps [r0+r2*2], m5 + psrldq m5, 4 + movq [r1+r2*1], m5 + movq [r0+r2*1], m3 + RET +%endmacro + +INIT_XMM sse2 +PRED4x4_HD +INIT_XMM ssse3 +PRED4x4_HD +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED4x4_HD +%endif + +;----------------------------------------------------------------------------- +; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_MMX mmxext +cglobal pred4x4_dc_10, 3, 3 + sub r0, r2 + lea r1, [r0+r2*2] + movq m2, [r0+r2*1-8] + paddw m2, [r0+r2*2-8] + paddw m2, [r1+r2*1-8] + paddw m2, [r1+r2*2-8] + psrlq m2, 48 + movq m0, [r0] + HADDW m0, m1 + paddw m0, [pw_4] + paddw m0, m2 + psrlw m0, 3 + SPLATW m0, m0, 0 + movq [r0+r2*1], m0 + movq [r0+r2*2], m0 + movq [r1+r2*1], m0 + movq [r1+r2*2], m0 + RET + +;----------------------------------------------------------------------------- +; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED4x4_DL 0 +cglobal pred4x4_down_left_10, 3, 3 + sub r0, r2 + movq m0, [r0] + movhps m0, [r1] + psrldq m2, m0, 2 + pslldq m3, m0, 2 + pshufhw m2, m2, 10100100b + PRED4x4_LOWPASS m0, m3, m2, m0 + lea r1, [r0+r2*2] + movhps [r1+r2*2], m0 + psrldq m0, 2 + movq [r0+r2*1], m0 + psrldq m0, 2 + movq [r0+r2*2], m0 + psrldq m0, 2 + movq [r1+r2*1], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED4x4_DL +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED4x4_DL +%endif + +;----------------------------------------------------------------------------- +; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED4x4_VL 0 +cglobal pred4x4_vertical_left_10, 3, 3 + sub r0, r2 + movu m1, [r0] + movhps m1, [r1] + psrldq m0, m1, 2 + psrldq m2, m1, 4 + pavgw m4, m0, m1 + PRED4x4_LOWPASS m0, m1, m2, m0 + lea r1, [r0+r2*2] + movq [r0+r2*1], m4 + movq [r0+r2*2], m0 + psrldq m4, 2 + psrldq m0, 2 + movq [r1+r2*1], m4 + movq [r1+r2*2], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED4x4_VL +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED4x4_VL +%endif + +;----------------------------------------------------------------------------- +; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_MMX mmxext +cglobal pred4x4_horizontal_up_10, 3, 3 + sub r0, r2 + lea r1, [r0+r2*2] + movq m0, [r0+r2*1-8] + punpckhwd m0, [r0+r2*2-8] + movq m1, [r1+r2*1-8] + punpckhwd m1, [r1+r2*2-8] + punpckhdq m0, m1 + pshufw m1, m1, 0xFF + movq [r1+r2*2], m1 + movd [r1+r2*1+4], m1 + pshufw m2, m0, 11111001b + movq m1, m2 + pavgw m2, m0 + + pshufw m5, m0, 11111110b + PRED4x4_LOWPASS m1, m0, m5, m1 + movq m6, m2 + punpcklwd m6, m1 + movq [r0+r2*1], m6 + psrlq m2, 16 + psrlq m1, 16 + punpcklwd m2, m1 + movq [r0+r2*2], m2 + psrlq m2, 32 + movd [r1+r2*1], m2 + RET + + + +;----------------------------------------------------------------------------- +; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred8x8_vertical_10, 2, 2 + sub r0, r1 + mova m0, [r0] +%rep 3 + mova [r0+r1*1], m0 + mova [r0+r1*2], m0 + lea r0, [r0+r1*2] +%endrep + mova [r0+r1*1], m0 + mova [r0+r1*2], m0 + RET + +;----------------------------------------------------------------------------- +; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred8x8_horizontal_10, 2, 3 + mov r2d, 4 +.loop: + movq m0, [r0+r1*0-8] + movq m1, [r0+r1*1-8] + pshuflw m0, m0, 0xff + pshuflw m1, m1, 0xff + punpcklqdq m0, m0 + punpcklqdq m1, m1 + mova [r0+r1*0], m0 + mova [r0+r1*1], m1 + lea r0, [r0+r1*2] + dec r2d + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_predict_8x8_dc_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro MOV8 2-3 +; sort of a hack, but it works + movdqa [%1], %2 +%endmacro + +%macro PRED8x8_DC 1 +cglobal pred8x8_dc_10, 2, 6 + sub r0, r1 + pxor m4, m4 + movq m0, [r0+0] + movq m1, [r0+8] + punpcklwd m0, m1 + movhlps m1, m0 + paddw m0, m1 + %1 m2, m0, 00001110b + paddw m0, m2 + + lea r5, [r1*3] + lea r4, [r0+r1*4] + movzx r2d, word [r0+r1*1-2] + movzx r3d, word [r0+r1*2-2] + add r2d, r3d + movzx r3d, word [r0+r5*1-2] + add r2d, r3d + movzx r3d, word [r4-2] + add r2d, r3d + movd m2, r2d ; s2 + + movzx r2d, word [r4+r1*1-2] + movzx r3d, word [r4+r1*2-2] + add r2d, r3d + movzx r3d, word [r4+r5*1-2] + add r2d, r3d + movzx r3d, word [r4+r1*4-2] + add r2d, r3d + movd m3, r2d ; s3 + + punpcklwd m2, m3 + punpckldq m0, m2 ; s0, s1, s2, s3 + %1 m3, m0, 11110110b ; s2, s1, s3, s3 + %1 m0, m0, 01110100b ; s0, s1, s3, s1 + paddw m0, m3 + psrlw m0, 2 + pavgw m0, m4 ; s0+s2, s1, s3, s1+s3 + punpcklwd m0, m0 + pshufd m3, m0, 11111010b + punpckldq m0, m0 + SWAP 0,1 + MOV8 r0+r1*1, m1, m2 + MOV8 r0+r1*2, m1, m2 + MOV8 r0+r5*1, m1, m2 + MOV8 r0+r1*4, m1, m2 + MOV8 r4+r1*1, m3, m4 + MOV8 r4+r1*2, m3, m4 + MOV8 r4+r5*1, m3, m4 + MOV8 r4+r1*4, m3, m4 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8_DC pshuflw + +;----------------------------------------------------------------------------- +; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred8x8_top_dc_10, 2, 4 + sub r0, r1 + mova m0, [r0] + pshuflw m1, m0, 0x4e + pshufhw m1, m1, 0x4e + paddw m0, m1 + pshuflw m1, m0, 0xb1 + pshufhw m1, m1, 0xb1 + paddw m0, m1 + lea r2, [r1*3] + lea r3, [r0+r1*4] + paddw m0, [pw_2] + psrlw m0, 2 + mova [r0+r1*1], m0 + mova [r0+r1*2], m0 + mova [r0+r2*1], m0 + mova [r0+r1*4], m0 + mova [r3+r1*1], m0 + mova [r3+r1*2], m0 + mova [r3+r2*1], m0 + mova [r3+r1*4], m0 + RET + +;----------------------------------------------------------------------------- +; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred8x8_plane_10, 2, 7, 7 + sub r0, r1 + lea r2, [r1*3] + lea r3, [r0+r1*4] + mova m2, [r0] + pmaddwd m2, [pw_m32101234] + HADDD m2, m1 + movd m0, [r0-4] + psrld m0, 14 + psubw m2, m0 ; H + movd m0, [r3+r1*4-4] + movd m1, [r0+12] + paddw m0, m1 + psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7]) + movzx r4d, word [r3+r1*1-2] ; src[4*stride-1] + movzx r5d, word [r0+r2*1-2] ; src[2*stride-1] + sub r4d, r5d + movzx r6d, word [r3+r1*2-2] ; src[5*stride-1] + movzx r5d, word [r0+r1*2-2] ; src[1*stride-1] + sub r6d, r5d + lea r4d, [r4+r6*2] + movzx r5d, word [r3+r2*1-2] ; src[6*stride-1] + movzx r6d, word [r0+r1*1-2] ; src[0*stride-1] + sub r5d, r6d + lea r5d, [r5*3] + add r4d, r5d + movzx r6d, word [r3+r1*4-2] ; src[7*stride-1] + movzx r5d, word [r0+r1*0-2] ; src[ -stride-1] + sub r6d, r5d + lea r4d, [r4+r6*4] + movd m3, r4d ; V + punpckldq m2, m3 + pmaddwd m2, [pd_17] + paddd m2, [pd_16] + psrad m2, 5 ; b, c + + mova m3, [pw_pixel_max] + pxor m1, m1 + SPLATW m0, m0, 1 + SPLATW m4, m2, 2 + SPLATW m2, m2, 0 + pmullw m2, [pw_m32101234] ; b + pmullw m5, m4, [pw_m3] ; c + paddw m5, [pw_16] + mov r2d, 8 + add r0, r1 +.loop: + paddsw m6, m2, m5 + paddsw m6, m0 + psraw m6, 5 + CLIPW m6, m1, m3 + mova [r0], m6 + paddw m5, m4 + add r0, r1 + dec r2d + jg .loop + RET + + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred8x8l_128_dc_10, 4, 4 + mova m0, [pw_512] ; (1<<(BIT_DEPTH-1)) + lea r1, [r3*3] + lea r2, [r0+r3*4] + MOV8 r0+r3*0, m0, m0 + MOV8 r0+r3*1, m0, m0 + MOV8 r0+r3*2, m0, m0 + MOV8 r0+r1*1, m0, m0 + MOV8 r2+r3*0, m0, m0 + MOV8 r2+r3*1, m0, m0 + MOV8 r2+r3*2, m0, m0 + MOV8 r2+r1*1, m0, m0 + RET + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_TOP_DC 0 +cglobal pred8x8l_top_dc_10, 4, 4, 6 + sub r0, r3 + mova m0, [r0] + shr r1d, 14 + shr r2d, 13 + neg r1 + pslldq m1, m0, 2 + psrldq m2, m0, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + lea r1, [r3*3] + lea r2, [r0+r3*4] + PRED4x4_LOWPASS m0, m2, m1, m0 + HADDW m0, m1 + paddw m0, [pw_4] + psrlw m0, 3 + SPLATW m0, m0, 0 + mova [r0+r3*1], m0 + mova [r0+r3*2], m0 + mova [r0+r1*1], m0 + mova [r0+r3*4], m0 + mova [r2+r3*1], m0 + mova [r2+r3*2], m0 + mova [r2+r1*1], m0 + mova [r2+r3*4], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_TOP_DC +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_TOP_DC +%endif + +;------------------------------------------------------------------------------- +; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) +;------------------------------------------------------------------------------- +;TODO: see if scalar is faster +%macro PRED8x8L_DC 0 +cglobal pred8x8l_dc_10, 4, 6, 6 + sub r0, r3 + lea r4, [r0+r3*4] + lea r5, [r3*3] + mova m0, [r0+r3*2-16] + punpckhwd m0, [r0+r3*1-16] + mova m1, [r4+r3*0-16] + punpckhwd m1, [r0+r5*1-16] + punpckhdq m1, m0 + mova m2, [r4+r3*2-16] + punpckhwd m2, [r4+r3*1-16] + mova m3, [r4+r3*4-16] + punpckhwd m3, [r4+r5*1-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + mova m0, [r0] + shr r1d, 14 + shr r2d, 13 + neg r1 + pslldq m1, m0, 2 + psrldq m2, m0, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + not r1 + and r1, r3 + pslldq m4, m3, 2 + psrldq m5, m3, 2 + pshuflw m4, m4, 11100101b + pinsrw m5, [r0+r1-2], 7 + PRED4x4_LOWPASS m3, m4, m5, m3 + PRED4x4_LOWPASS m0, m2, m1, m0 + paddw m0, m3 + HADDW m0, m1 + paddw m0, [pw_8] + psrlw m0, 4 + SPLATW m0, m0 + mova [r0+r3*1], m0 + mova [r0+r3*2], m0 + mova [r0+r5*1], m0 + mova [r0+r3*4], m0 + mova [r4+r3*1], m0 + mova [r4+r3*2], m0 + mova [r4+r5*1], m0 + mova [r4+r3*4], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_DC +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_DC +%endif + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_VERTICAL 0 +cglobal pred8x8l_vertical_10, 4, 4, 6 + sub r0, r3 + mova m0, [r0] + shr r1d, 14 + shr r2d, 13 + neg r1 + pslldq m1, m0, 2 + psrldq m2, m0, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + lea r1, [r3*3] + lea r2, [r0+r3*4] + PRED4x4_LOWPASS m0, m2, m1, m0 + mova [r0+r3*1], m0 + mova [r0+r3*2], m0 + mova [r0+r1*1], m0 + mova [r0+r3*4], m0 + mova [r2+r3*1], m0 + mova [r2+r3*2], m0 + mova [r2+r1*1], m0 + mova [r2+r3*4], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_VERTICAL +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_VERTICAL +%endif + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_HORIZONTAL 0 +cglobal pred8x8l_horizontal_10, 4, 4, 5 + mova m0, [r0-16] + shr r1d, 14 + dec r1 + and r1, r3 + sub r1, r3 + punpckhwd m0, [r0+r1-16] + mova m1, [r0+r3*2-16] + punpckhwd m1, [r0+r3*1-16] + lea r2, [r0+r3*4] + lea r1, [r3*3] + punpckhdq m1, m0 + mova m2, [r2+r3*0-16] + punpckhwd m2, [r0+r1-16] + mova m3, [r2+r3*2-16] + punpckhwd m3, [r2+r3*1-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + PALIGNR m4, m3, [r2+r1-16], 14, m0 + pslldq m0, m4, 2 + pshuflw m0, m0, 11100101b + PRED4x4_LOWPASS m4, m3, m0, m4 + punpckhwd m3, m4, m4 + punpcklwd m4, m4 + pshufd m0, m3, 0xff + pshufd m1, m3, 0xaa + pshufd m2, m3, 0x55 + pshufd m3, m3, 0x00 + mova [r0+r3*0], m0 + mova [r0+r3*1], m1 + mova [r0+r3*2], m2 + mova [r0+r1*1], m3 + pshufd m0, m4, 0xff + pshufd m1, m4, 0xaa + pshufd m2, m4, 0x55 + pshufd m3, m4, 0x00 + mova [r2+r3*0], m0 + mova [r2+r3*1], m1 + mova [r2+r3*2], m2 + mova [r2+r1*1], m3 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_HORIZONTAL +INIT_XMM ssse3 +PRED8x8L_HORIZONTAL +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_HORIZONTAL +%endif + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_DOWN_LEFT 0 +cglobal pred8x8l_down_left_10, 4, 4, 7 + sub r0, r3 + mova m3, [r0] + shr r1d, 14 + neg r1 + shr r2d, 13 + pslldq m1, m3, 2 + psrldq m2, m3, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + PRED4x4_LOWPASS m6, m2, m1, m3 + jz .fix_tr ; flags from shr r2d + mova m1, [r0+16] + psrldq m5, m1, 2 + PALIGNR m2, m1, m3, 14, m3 + pshufhw m5, m5, 10100100b + PRED4x4_LOWPASS m1, m2, m5, m1 +.do_topright: + lea r1, [r3*3] + psrldq m5, m1, 14 + lea r2, [r0+r3*4] + PALIGNR m2, m1, m6, 2, m0 + PALIGNR m3, m1, m6, 14, m0 + PALIGNR m5, m1, 2, m0 + pslldq m4, m6, 2 + PRED4x4_LOWPASS m6, m4, m2, m6 + PRED4x4_LOWPASS m1, m3, m5, m1 + mova [r2+r3*4], m1 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 + mova [r2+r1*1], m1 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 + mova [r2+r3*2], m1 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 + mova [r2+r3*1], m1 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*4], m1 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 + mova [r0+r1*1], m1 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*2], m1 + PALIGNR m1, m6, 14, m6 + mova [r0+r3*1], m1 + RET +.fix_tr: + punpckhwd m3, m3 + pshufd m1, m3, 0xFF + jmp .do_topright +%endmacro + +INIT_XMM sse2 +PRED8x8L_DOWN_LEFT +INIT_XMM ssse3 +PRED8x8L_DOWN_LEFT +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_DOWN_LEFT +%endif + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_DOWN_RIGHT 0 +; standard forbids this when has_topleft is false +; no need to check +cglobal pred8x8l_down_right_10, 4, 5, 8 + sub r0, r3 + lea r4, [r0+r3*4] + lea r1, [r3*3] + mova m0, [r0+r3*1-16] + punpckhwd m0, [r0+r3*0-16] + mova m1, [r0+r1*1-16] + punpckhwd m1, [r0+r3*2-16] + punpckhdq m1, m0 + mova m2, [r4+r3*1-16] + punpckhwd m2, [r4+r3*0-16] + mova m3, [r4+r1*1-16] + punpckhwd m3, [r4+r3*2-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + mova m0, [r4+r3*4-16] + mova m1, [r0] + PALIGNR m4, m3, m0, 14, m0 + PALIGNR m1, m3, 2, m2 + pslldq m0, m4, 2 + pshuflw m0, m0, 11100101b + PRED4x4_LOWPASS m6, m1, m4, m3 + PRED4x4_LOWPASS m4, m3, m0, m4 + mova m3, [r0] + shr r2d, 13 + pslldq m1, m3, 2 + psrldq m2, m3, 2 + pinsrw m1, [r0-2], 0 + pinsrw m2, [r0+r2+14], 7 + PRED4x4_LOWPASS m3, m2, m1, m3 + PALIGNR m2, m3, m6, 2, m0 + PALIGNR m5, m3, m6, 14, m0 + psrldq m7, m3, 2 + PRED4x4_LOWPASS m6, m4, m2, m6 + PRED4x4_LOWPASS m3, m5, m7, m3 + mova [r4+r3*4], m6 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*1], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*2], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r1*1], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*4], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r4+r3*1], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r4+r3*2], m3 + PALIGNR m3, m6, 14, m6 + mova [r4+r1*1], m3 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_DOWN_RIGHT +INIT_XMM ssse3 +PRED8x8L_DOWN_RIGHT +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_DOWN_RIGHT +%endif + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_VERTICAL_RIGHT 0 +; likewise with 8x8l_down_right +cglobal pred8x8l_vertical_right_10, 4, 5, 7 + sub r0, r3 + lea r4, [r0+r3*4] + lea r1, [r3*3] + mova m0, [r0+r3*1-16] + punpckhwd m0, [r0+r3*0-16] + mova m1, [r0+r1*1-16] + punpckhwd m1, [r0+r3*2-16] + punpckhdq m1, m0 + mova m2, [r4+r3*1-16] + punpckhwd m2, [r4+r3*0-16] + mova m3, [r4+r1*1-16] + punpckhwd m3, [r4+r3*2-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + mova m0, [r4+r3*4-16] + mova m1, [r0] + PALIGNR m4, m3, m0, 14, m0 + PALIGNR m1, m3, 2, m2 + PRED4x4_LOWPASS m3, m1, m4, m3 + mova m2, [r0] + shr r2d, 13 + pslldq m1, m2, 2 + psrldq m5, m2, 2 + pinsrw m1, [r0-2], 0 + pinsrw m5, [r0+r2+14], 7 + PRED4x4_LOWPASS m2, m5, m1, m2 + PALIGNR m6, m2, m3, 12, m1 + PALIGNR m5, m2, m3, 14, m0 + PRED4x4_LOWPASS m0, m6, m2, m5 + pavgw m2, m5 + mova [r0+r3*2], m0 + mova [r0+r3*1], m2 + pslldq m6, m3, 4 + pslldq m1, m3, 2 + PRED4x4_LOWPASS m1, m3, m6, m1 + PALIGNR m2, m1, 14, m4 + mova [r0+r1*1], m2 + pslldq m1, 2 + PALIGNR m0, m1, 14, m3 + mova [r0+r3*4], m0 + pslldq m1, 2 + PALIGNR m2, m1, 14, m4 + mova [r4+r3*1], m2 + pslldq m1, 2 + PALIGNR m0, m1, 14, m3 + mova [r4+r3*2], m0 + pslldq m1, 2 + PALIGNR m2, m1, 14, m4 + mova [r4+r1*1], m2 + pslldq m1, 2 + PALIGNR m0, m1, 14, m1 + mova [r4+r3*4], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_VERTICAL_RIGHT +INIT_XMM ssse3 +PRED8x8L_VERTICAL_RIGHT +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_VERTICAL_RIGHT +%endif + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_HORIZONTAL_UP 0 +cglobal pred8x8l_horizontal_up_10, 4, 4, 6 + mova m0, [r0+r3*0-16] + punpckhwd m0, [r0+r3*1-16] + shr r1d, 14 + dec r1 + and r1, r3 + sub r1, r3 + mova m4, [r0+r1*1-16] + lea r1, [r3*3] + lea r2, [r0+r3*4] + mova m1, [r0+r3*2-16] + punpckhwd m1, [r0+r1*1-16] + punpckhdq m0, m1 + mova m2, [r2+r3*0-16] + punpckhwd m2, [r2+r3*1-16] + mova m3, [r2+r3*2-16] + punpckhwd m3, [r2+r1*1-16] + punpckhdq m2, m3 + punpckhqdq m0, m2 + PALIGNR m1, m0, m4, 14, m4 + psrldq m2, m0, 2 + pshufhw m2, m2, 10100100b + PRED4x4_LOWPASS m0, m1, m2, m0 + psrldq m1, m0, 2 + psrldq m2, m0, 4 + pshufhw m1, m1, 10100100b + pshufhw m2, m2, 01010100b + pavgw m4, m0, m1 + PRED4x4_LOWPASS m1, m2, m0, m1 + punpckhwd m5, m4, m1 + punpcklwd m4, m1 + mova [r2+r3*0], m5 + mova [r0+r3*0], m4 + pshufd m0, m5, 11111001b + pshufd m1, m5, 11111110b + pshufd m2, m5, 11111111b + mova [r2+r3*1], m0 + mova [r2+r3*2], m1 + mova [r2+r1*1], m2 + PALIGNR m2, m5, m4, 4, m0 + PALIGNR m3, m5, m4, 8, m1 + PALIGNR m5, m5, m4, 12, m4 + mova [r0+r3*1], m2 + mova [r0+r3*2], m3 + mova [r0+r1*1], m5 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_HORIZONTAL_UP +INIT_XMM ssse3 +PRED8x8L_HORIZONTAL_UP +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_HORIZONTAL_UP +%endif + + +;----------------------------------------------------------------------------- +; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro MOV16 3-5 + mova [%1+ 0], %2 + mova [%1+mmsize], %3 +%endmacro + +INIT_XMM sse2 +cglobal pred16x16_vertical_10, 2, 3 + sub r0, r1 + mov r2d, 8 + mova m0, [r0+ 0] + mova m1, [r0+mmsize] +.loop: + MOV16 r0+r1*1, m0, m1, m2, m3 + MOV16 r0+r1*2, m0, m1, m2, m3 + lea r0, [r0+r1*2] + dec r2d + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred16x16_horizontal_10, 2, 3 + mov r2d, 8 +.vloop: + movd m0, [r0+r1*0-4] + movd m1, [r0+r1*1-4] + SPLATW m0, m0, 1 + SPLATW m1, m1, 1 + MOV16 r0+r1*0, m0, m0, m0, m0 + MOV16 r0+r1*1, m1, m1, m1, m1 + lea r0, [r0+r1*2] + dec r2d + jg .vloop + RET + +;----------------------------------------------------------------------------- +; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred16x16_dc_10, 2, 6 + mov r5, r0 + sub r0, r1 + mova m0, [r0+0] + paddw m0, [r0+mmsize] + HADDW m0, m2 + + lea r0, [r0+r1-2] + movzx r3d, word [r0] + movzx r4d, word [r0+r1] +%rep 7 + lea r0, [r0+r1*2] + movzx r2d, word [r0] + add r3d, r2d + movzx r2d, word [r0+r1] + add r4d, r2d +%endrep + lea r3d, [r3+r4+16] + + movd m1, r3d + paddw m0, m1 + psrlw m0, 5 + SPLATW m0, m0 + mov r3d, 8 +.loop: + MOV16 r5+r1*0, m0, m0, m0, m0 + MOV16 r5+r1*1, m0, m0, m0, m0 + lea r5, [r5+r1*2] + dec r3d + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred16x16_top_dc_10, 2, 3 + sub r0, r1 + mova m0, [r0+0] + paddw m0, [r0+mmsize] + HADDW m0, m2 + + SPLATW m0, m0 + paddw m0, [pw_8] + psrlw m0, 4 + mov r2d, 8 +.loop: + MOV16 r0+r1*1, m0, m0, m0, m0 + MOV16 r0+r1*2, m0, m0, m0, m0 + lea r0, [r0+r1*2] + dec r2d + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred16x16_left_dc_10, 2, 6 + mov r5, r0 + + sub r0, 2 + movzx r3d, word [r0] + movzx r4d, word [r0+r1] +%rep 7 + lea r0, [r0+r1*2] + movzx r2d, word [r0] + add r3d, r2d + movzx r2d, word [r0+r1] + add r4d, r2d +%endrep + lea r3d, [r3+r4+8] + shr r3d, 4 + + movd m0, r3d + SPLATW m0, m0 + mov r3d, 8 +.loop: + MOV16 r5+r1*0, m0, m0, m0, m0 + MOV16 r5+r1*1, m0, m0, m0, m0 + lea r5, [r5+r1*2] + dec r3d + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred16x16_128_dc_10, 2,3 + mova m0, [pw_512] + mov r2d, 8 +.loop: + MOV16 r0+r1*0, m0, m0, m0, m0 + MOV16 r0+r1*1, m0, m0, m0, m0 + lea r0, [r0+r1*2] + dec r2d + jg .loop + RET diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred_init.c b/media/ffvpx/libavcodec/x86/h264_intrapred_init.c new file mode 100644 index 0000000000..ee46927a24 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/h264_intrapred_init.c @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2010 Fiona Glaser <fiona@x264.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stddef.h> +#include <stdint.h> +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/codec_id.h" +#include "libavcodec/h264pred.h" + +#define PRED4x4(TYPE, DEPTH, OPT) \ +void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ + const uint8_t *topright, \ + ptrdiff_t stride); + +PRED4x4(dc, 10, mmxext) +PRED4x4(down_left, 10, sse2) +PRED4x4(down_left, 10, avx) +PRED4x4(down_right, 10, sse2) +PRED4x4(down_right, 10, ssse3) +PRED4x4(down_right, 10, avx) +PRED4x4(vertical_left, 10, sse2) +PRED4x4(vertical_left, 10, avx) +PRED4x4(vertical_right, 10, sse2) +PRED4x4(vertical_right, 10, ssse3) +PRED4x4(vertical_right, 10, avx) +PRED4x4(horizontal_up, 10, mmxext) +PRED4x4(horizontal_down, 10, sse2) +PRED4x4(horizontal_down, 10, ssse3) +PRED4x4(horizontal_down, 10, avx) + +#define PRED8x8(TYPE, DEPTH, OPT) \ +void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ + ptrdiff_t stride); + +PRED8x8(dc, 10, sse2) +PRED8x8(top_dc, 10, sse2) +PRED8x8(plane, 10, sse2) +PRED8x8(vertical, 10, sse2) +PRED8x8(horizontal, 10, sse2) + +#define PRED8x8L(TYPE, DEPTH, OPT)\ +void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ + int has_topleft, \ + int has_topright, \ + ptrdiff_t stride); + +PRED8x8L(dc, 10, sse2) +PRED8x8L(dc, 10, avx) +PRED8x8L(128_dc, 10, sse2) +PRED8x8L(top_dc, 10, sse2) +PRED8x8L(top_dc, 10, avx) +PRED8x8L(vertical, 10, sse2) +PRED8x8L(vertical, 10, avx) +PRED8x8L(horizontal, 10, sse2) +PRED8x8L(horizontal, 10, ssse3) +PRED8x8L(horizontal, 10, avx) +PRED8x8L(down_left, 10, sse2) +PRED8x8L(down_left, 10, ssse3) +PRED8x8L(down_left, 10, avx) +PRED8x8L(down_right, 10, sse2) +PRED8x8L(down_right, 10, ssse3) +PRED8x8L(down_right, 10, avx) +PRED8x8L(vertical_right, 10, sse2) +PRED8x8L(vertical_right, 10, ssse3) +PRED8x8L(vertical_right, 10, avx) +PRED8x8L(horizontal_up, 10, sse2) +PRED8x8L(horizontal_up, 10, ssse3) +PRED8x8L(horizontal_up, 10, avx) + +#define PRED16x16(TYPE, DEPTH, OPT)\ +void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ + ptrdiff_t stride); + +PRED16x16(dc, 10, sse2) +PRED16x16(top_dc, 10, sse2) +PRED16x16(128_dc, 10, sse2) +PRED16x16(left_dc, 10, sse2) +PRED16x16(vertical, 10, sse2) +PRED16x16(horizontal, 10, sse2) + +/* 8-bit versions */ +PRED16x16(vertical, 8, sse) +PRED16x16(horizontal, 8, mmxext) +PRED16x16(horizontal, 8, ssse3) +PRED16x16(dc, 8, sse2) +PRED16x16(dc, 8, ssse3) +PRED16x16(plane_h264, 8, sse2) +PRED16x16(plane_h264, 8, ssse3) +PRED16x16(plane_rv40, 8, sse2) +PRED16x16(plane_rv40, 8, ssse3) +PRED16x16(plane_svq3, 8, sse2) +PRED16x16(plane_svq3, 8, ssse3) +PRED16x16(tm_vp8, 8, sse2) +PRED16x16(tm_vp8, 8, avx2) + +PRED8x8(top_dc, 8, mmxext) +PRED8x8(dc_rv40, 8, mmxext) +PRED8x8(dc, 8, mmxext) +PRED8x8(vertical, 8, mmx) +PRED8x8(horizontal, 8, mmxext) +PRED8x8(horizontal, 8, ssse3) +PRED8x8(plane, 8, sse2) +PRED8x8(plane, 8, ssse3) +PRED8x8(tm_vp8, 8, sse2) +PRED8x8(tm_vp8, 8, ssse3) + +PRED8x8L(top_dc, 8, mmxext) +PRED8x8L(top_dc, 8, ssse3) +PRED8x8L(dc, 8, mmxext) +PRED8x8L(dc, 8, ssse3) +PRED8x8L(horizontal, 8, mmxext) +PRED8x8L(horizontal, 8, ssse3) +PRED8x8L(vertical, 8, mmxext) +PRED8x8L(vertical, 8, ssse3) +PRED8x8L(down_left, 8, sse2) +PRED8x8L(down_left, 8, ssse3) +PRED8x8L(down_right, 8, sse2) +PRED8x8L(down_right, 8, ssse3) +PRED8x8L(vertical_right, 8, sse2) +PRED8x8L(vertical_right, 8, ssse3) +PRED8x8L(vertical_left, 8, sse2) +PRED8x8L(vertical_left, 8, ssse3) +PRED8x8L(horizontal_up, 8, mmxext) +PRED8x8L(horizontal_up, 8, ssse3) +PRED8x8L(horizontal_down, 8, sse2) +PRED8x8L(horizontal_down, 8, ssse3) + +PRED4x4(dc, 8, mmxext) +PRED4x4(down_left, 8, mmxext) +PRED4x4(down_right, 8, mmxext) +PRED4x4(vertical_left, 8, mmxext) +PRED4x4(vertical_right, 8, mmxext) +PRED4x4(horizontal_up, 8, mmxext) +PRED4x4(horizontal_down, 8, mmxext) +PRED4x4(tm_vp8, 8, mmxext) +PRED4x4(tm_vp8, 8, ssse3) +PRED4x4(vertical_vp8, 8, mmxext) + +av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, + const int bit_depth, + const int chroma_format_idc) +{ + int cpu_flags = av_get_cpu_flags(); + + if (bit_depth == 8) { + if (EXTERNAL_MMX(cpu_flags)) { + if (chroma_format_idc <= 1) { + h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx; + } + } + + if (EXTERNAL_MMXEXT(cpu_flags)) { + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext; + if (chroma_format_idc <= 1) + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext; + h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmxext; + h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext; + h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext; + h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_mmxext; + h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext; + h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext; + h->pred4x4 [VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_8_mmxext; + h->pred4x4 [HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_8_mmxext; + h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_8_mmxext; + if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8 || + codec_id == AV_CODEC_ID_H264) { + h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext; + } + if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) { + h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext; + } + if (codec_id != AV_CODEC_ID_RV40) { + h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext; + } + if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) { + if (chroma_format_idc <= 1) { + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext; + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext; + } + } + if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) { + h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_8_mmxext; + h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext; + h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_8_mmxext; + } + } + + if (EXTERNAL_SSE(cpu_flags)) { + h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse; + } + + if (EXTERNAL_SSE2(cpu_flags)) { + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2; + h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2; + h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2; + h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2; + h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2; + h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2; + if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) { + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2; + h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2; + } else { + if (chroma_format_idc <= 1) + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2; + if (codec_id == AV_CODEC_ID_SVQ3) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2; + } else if (codec_id == AV_CODEC_ID_RV40) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2; + } else { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2; + } + } + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3; + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3; + if (chroma_format_idc <= 1) + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3; + h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_ssse3; + h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3; + h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3; + h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_ssse3; + h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3; + h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3; + h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_ssse3; + h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_ssse3; + h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_ssse3; + h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_ssse3; + if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) { + h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3; + h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_ssse3; + } else { + if (chroma_format_idc <= 1) + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3; + if (codec_id == AV_CODEC_ID_SVQ3) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3; + } else if (codec_id == AV_CODEC_ID_RV40) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3; + } else { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3; + } + } + } + + if(EXTERNAL_AVX2(cpu_flags)){ + if (codec_id == AV_CODEC_ID_VP8) { + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2; + } + } + } else if (bit_depth == 10) { + if (EXTERNAL_MMXEXT(cpu_flags)) { + h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext; + h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext; + } + if (EXTERNAL_SSE2(cpu_flags)) { + h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2; + h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2; + h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2; + h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2; + h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2; + + if (chroma_format_idc <= 1) { + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2; + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2; + h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2; + h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2; + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2; + } + + h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2; + h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2; + h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2; + h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2; + h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2; + h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2; + h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2; + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2; + h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2; + + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2; + h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2; + h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2; + h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2; + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2; + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2; + } + if (EXTERNAL_SSSE3(cpu_flags)) { + h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3; + h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3; + h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3; + + h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3; + h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3; + h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3; + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3; + h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3; + } + if (EXTERNAL_AVX(cpu_flags)) { + h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx; + h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx; + h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx; + h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx; + h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx; + + h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx; + h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx; + h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx; + h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx; + h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx; + h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx; + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx; + h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx; + } + } +} diff --git a/media/ffvpx/libavcodec/x86/idctdsp.asm b/media/ffvpx/libavcodec/x86/idctdsp.asm new file mode 100644 index 0000000000..1cfdb5419d --- /dev/null +++ b/media/ffvpx/libavcodec/x86/idctdsp.asm @@ -0,0 +1,112 @@ +;****************************************************************************** +;* SIMD-optimized IDCT-related routines +;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2003-2013 Michael Niedermayer +;* Copyright (c) 2013 Daniel Kang +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pb_80 + +SECTION .text + +;-------------------------------------------------------------------------- +;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels, +; ptrdiff_t line_size) +;-------------------------------------------------------------------------- + +%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1 + mova m1, [blockq+mmsize*0+%1] + mova m2, [blockq+mmsize*2+%1] + packsswb m1, [blockq+mmsize*1+%1] + packsswb m2, [blockq+mmsize*3+%1] + paddb m1, m0 + paddb m2, m0 + movq [pixelsq+lsizeq*0], m1 + movhps [pixelsq+lsizeq*1], m1 + movq [pixelsq+lsizeq*2], m2 + movhps [pixelsq+lsize3q ], m2 +%endmacro + +INIT_XMM sse2 +cglobal put_signed_pixels_clamped, 3, 4, 3, block, pixels, lsize, lsize3 + mova m0, [pb_80] + lea lsize3q, [lsizeq*3] + PUT_SIGNED_PIXELS_CLAMPED_HALF 0 + lea pixelsq, [pixelsq+lsizeq*4] + PUT_SIGNED_PIXELS_CLAMPED_HALF 64 + RET + +;-------------------------------------------------------------------------- +; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels, +; ptrdiff_t line_size); +;-------------------------------------------------------------------------- +; %1 = block offset +%macro PUT_PIXELS_CLAMPED_HALF 1 + mova m0, [blockq+mmsize*0+%1] + mova m1, [blockq+mmsize*2+%1] + packuswb m0, [blockq+mmsize*1+%1] + packuswb m1, [blockq+mmsize*3+%1] + movq [pixelsq], m0 + movhps [lsizeq+pixelsq], m0 + movq [2*lsizeq+pixelsq], m1 + movhps [lsize3q+pixelsq], m1 +%endmacro + +INIT_XMM sse2 +cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3 + lea lsize3q, [lsizeq*3] + PUT_PIXELS_CLAMPED_HALF 0 + lea pixelsq, [pixelsq+lsizeq*4] + PUT_PIXELS_CLAMPED_HALF 64 + RET + +;-------------------------------------------------------------------------- +; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels, +; ptrdiff_t line_size); +;-------------------------------------------------------------------------- +; %1 = block offset +%macro ADD_PIXELS_CLAMPED 1 + mova m0, [blockq+mmsize*0+%1] + mova m1, [blockq+mmsize*1+%1] + movq m2, [pixelsq] + movq m3, [pixelsq+lsizeq] + punpcklbw m2, m4 + punpcklbw m3, m4 + paddsw m0, m2 + paddsw m1, m3 + packuswb m0, m1 + movq [pixelsq], m0 + movhps [pixelsq+lsizeq], m0 +%endmacro + +INIT_XMM sse2 +cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize + pxor m4, m4 + ADD_PIXELS_CLAMPED 0 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 32 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 64 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 96 + RET diff --git a/media/ffvpx/libavcodec/x86/idctdsp.h b/media/ffvpx/libavcodec/x86/idctdsp.h new file mode 100644 index 0000000000..738e4e36e4 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/idctdsp.h @@ -0,0 +1,33 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_IDCTDSP_H +#define AVCODEC_X86_IDCTDSP_H + +#include <stddef.h> +#include <stdint.h> + +void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); +void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); +void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); + + +#endif /* AVCODEC_X86_IDCTDSP_H */ diff --git a/media/ffvpx/libavcodec/x86/idctdsp_init.c b/media/ffvpx/libavcodec/x86/idctdsp_init.c new file mode 100644 index 0000000000..f28a1ad744 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/idctdsp_init.c @@ -0,0 +1,159 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idctdsp.h" +#include "simple_idct.h" + +/* Input permutation for the simple_idct_mmx */ +static const uint8_t simple_mmx_permutation[64] = { + 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, + 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, + 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, + 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, + 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, + 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, + 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, + 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, +}; + +static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; + +av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation, + enum idct_permutation_type perm_type) +{ + int i; + + switch (perm_type) { + case FF_IDCT_PERM_SIMPLE: + for (i = 0; i < 64; i++) + idct_permutation[i] = simple_mmx_permutation[i]; + return 1; + case FF_IDCT_PERM_SSE2: + for (i = 0; i < 64; i++) + idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7]; + return 1; + } + + return 0; +} + +av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + +#if ARCH_X86_32 + if (EXTERNAL_MMX(cpu_flags)) { + if (!high_bit_depth && + avctx->lowres == 0 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { + c->idct = ff_simple_idct_mmx; + } + } +#endif + + if (EXTERNAL_SSE2(cpu_flags)) { + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2; + c->put_pixels_clamped = ff_put_pixels_clamped_sse2; + c->add_pixels_clamped = ff_add_pixels_clamped_sse2; + +#if ARCH_X86_32 + if (!high_bit_depth && + avctx->lowres == 0 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { + c->idct_put = ff_simple_idct_put_sse2; + c->idct_add = ff_simple_idct_add_sse2; + c->perm_type = FF_IDCT_PERM_SIMPLE; + } +#endif + + if (ARCH_X86_64 && + !high_bit_depth && + avctx->lowres == 0 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX || + avctx->idct_algo == FF_IDCT_SIMPLE)) { + c->idct = ff_simple_idct8_sse2; + c->idct_put = ff_simple_idct8_put_sse2; + c->idct_add = ff_simple_idct8_add_sse2; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + } + + if (ARCH_X86_64 && avctx->lowres == 0) { + if (EXTERNAL_AVX(cpu_flags) && + !high_bit_depth && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX || + avctx->idct_algo == FF_IDCT_SIMPLE)) { + c->idct = ff_simple_idct8_avx; + c->idct_put = ff_simple_idct8_put_avx; + c->idct_add = ff_simple_idct8_add_avx; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + + if (avctx->bits_per_raw_sample == 10 && + avctx->codec_id != AV_CODEC_ID_MPEG4 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLE)) { + if (EXTERNAL_SSE2(cpu_flags)) { + c->idct_put = ff_simple_idct10_put_sse2; + c->idct_add = NULL; + c->idct = ff_simple_idct10_sse2; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + + } + if (EXTERNAL_AVX(cpu_flags)) { + c->idct_put = ff_simple_idct10_put_avx; + c->idct_add = NULL; + c->idct = ff_simple_idct10_avx; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + } + + if (avctx->bits_per_raw_sample == 12 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { + if (EXTERNAL_SSE2(cpu_flags)) { + c->idct_put = ff_simple_idct12_put_sse2; + c->idct_add = NULL; + c->idct = ff_simple_idct12_sse2; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + if (EXTERNAL_AVX(cpu_flags)) { + c->idct_put = ff_simple_idct12_put_avx; + c->idct_add = NULL; + c->idct = ff_simple_idct12_avx; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + } + } +} diff --git a/media/ffvpx/libavcodec/x86/imdct36.asm b/media/ffvpx/libavcodec/x86/imdct36.asm new file mode 100644 index 0000000000..888c6bf4d6 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/imdct36.asm @@ -0,0 +1,736 @@ +;****************************************************************************** +;* 36 point SSE-optimized IMDCT transform +;* Copyright (c) 2011 Vitor Sessak +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +ps_mask: dd 0, ~0, ~0, ~0 +ps_mask2: dd 0, ~0, 0, ~0 +ps_mask3: dd 0, 0, 0, ~0 +ps_mask4: dd 0, ~0, 0, 0 + +ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038 +ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038 +ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433 +ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038 +ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530 +ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097 +ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097 + +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 +ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000 + +ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461 + dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349 + dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896 + dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991 + dd 1.0, 0.70710678118654752439, 0.0, 0.0 + +ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461 + dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349 + dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896 + dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991 + dd 1.0, -0.70710678118654752439, 0.0, 0.0 + +costabs: times 4 dd 0.98480773 + times 4 dd 0.93969262 + times 4 dd 0.86602539 + times 4 dd -0.76604444 + times 4 dd -0.64278764 + times 4 dd 0.50000000 + times 4 dd -0.50000000 + times 4 dd -0.34202015 + times 4 dd -0.17364818 + times 4 dd 0.50190992 + times 4 dd 0.51763808 + times 4 dd 0.55168896 + times 4 dd 0.61038726 + times 4 dd 0.70710677 + times 4 dd 0.87172341 + times 4 dd 1.18310082 + times 4 dd 1.93185163 + times 4 dd 5.73685646 + +%define SBLIMIT 32 +SECTION .text + +%macro PSHUFD 3 +%if cpuflag(sse2) && notcpuflag(avx) + pshufd %1, %2, %3 +%else + shufps %1, %2, %2, %3 +%endif +%endmacro + +; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} +; output %1={x3,x4,y1,y2} +%macro BUILDINVHIGHLOW 3 +%if cpuflag(avx) + shufps %1, %2, %3, 0x4e +%else + movlhps %1, %3 + movhlps %1, %2 +%endif +%endmacro + +; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} +; output %1={x4,y1,y2,y3} +%macro ROTLEFT 3 +%if cpuflag(ssse3) + palignr %1, %3, %2, 12 +%else + BUILDINVHIGHLOW %1, %2, %3 + shufps %1, %1, %3, 0x99 +%endif +%endmacro + +%macro INVERTHL 2 +%if cpuflag(sse2) + PSHUFD %1, %2, 0x4e +%else + movhlps %1, %2 + movlhps %1, %2 +%endif +%endmacro + +%macro BUTTERF 3 + INVERTHL %2, %1 + xorps %1, [ps_p1p1m1m1] + addps %1, %2 +%if cpuflag(sse3) + mulps %1, %1, [ps_cosh_sse3 + %3] + PSHUFD %2, %1, 0xb1 + addsubps %1, %1, %2 +%else + mulps %1, [ps_cosh + %3] + PSHUFD %2, %1, 0xb1 + xorps %1, [ps_p1m1p1m1] + addps %1, %2 +%endif +%endmacro + +%macro BUTTERF2 3 +%if cpuflag(sse3) + mulps %1, %1, [ps_cosh_sse3 + %3] + PSHUFD %2, %1, 0xe1 + addsubps %1, %1, %2 +%else + mulps %1, [ps_cosh + %3] + PSHUFD %2, %1, 0xe1 + xorps %1, [ps_p1m1p1m1] + addps %1, %2 +%endif +%endmacro + +%macro STORE 4 +%if cpuflag(sse4) + movss [%3 ], %1 + extractps dword [%3 + %4], %1, 1 + extractps dword [%3 + 2*%4], %1, 2 + extractps dword [%3 + 3*%4], %1, 3 +%else + movhlps %2, %1 + movss [%3 ], %1 + movss [%3 + 2*%4], %2 + shufps %1, %1, 0xb1 + movss [%3 + %4], %1 + movhlps %2, %1 + movss [%3 + 3*%4], %2 +%endif +%endmacro + +%macro LOAD 4 + movlps %1, [%3 ] + movhps %1, [%3 + %4] + movlps %2, [%3 + 2*%4] + movhps %2, [%3 + 3*%4] + shufps %1, %2, 0x88 +%endmacro + +%macro LOADA64 2 +%if cpuflag(avx) + movu %1, [%2] +%else + movlps %1, [%2] + movhps %1, [%2 + 8] +%endif +%endmacro + +%macro DEFINE_IMDCT 0 +cglobal imdct36_float, 4,4,9, out, buf, in, win + + ; for(i=17;i>=1;i--) in[i] += in[i-1]; + LOADA64 m0, inq + LOADA64 m1, inq + 16 + + ROTLEFT m5, m0, m1 + + PSHUFD m6, m0, 0x93 + andps m6, m6, [ps_mask] + addps m0, m0, m6 + + LOADA64 m2, inq + 32 + + ROTLEFT m7, m1, m2 + + addps m1, m1, m5 + LOADA64 m3, inq + 48 + + ROTLEFT m5, m2, m3 + + xorps m4, m4, m4 + movlps m4, [inq+64] + BUILDINVHIGHLOW m6, m3, m4 + shufps m6, m6, m4, 0xa9 + + addps m4, m4, m6 + addps m2, m2, m7 + addps m3, m3, m5 + + ; for(i=17;i>=3;i-=2) in[i] += in[i-2]; + movlhps m5, m5, m0 + andps m5, m5, [ps_mask3] + + BUILDINVHIGHLOW m7, m0, m1 + andps m7, m7, [ps_mask2] + + addps m0, m0, m5 + + BUILDINVHIGHLOW m6, m1, m2 + andps m6, m6, [ps_mask2] + + addps m1, m1, m7 + + BUILDINVHIGHLOW m7, m2, m3 + andps m7, m7, [ps_mask2] + + addps m2, m2, m6 + + movhlps m6, m6, m3 + andps m6, m6, [ps_mask4] + + addps m3, m3, m7 + addps m4, m4, m6 + + ; Populate tmp[] + movlhps m6, m1, m5 ; zero out high values + subps m6, m6, m4 + + subps m5, m0, m3 + +%if ARCH_X86_64 + SWAP m5, m8 +%endif + + mulps m7, m2, [ps_val1] + +%if ARCH_X86_64 + mulps m5, m8, [ps_val2] +%else + mulps m5, m5, [ps_val2] +%endif + addps m7, m7, m5 + + mulps m5, m6, [ps_val1] + subps m7, m7, m5 + +%if ARCH_X86_64 + SWAP m5, m8 +%else + subps m5, m0, m3 +%endif + + subps m5, m5, m6 + addps m5, m5, m2 + + shufps m6, m4, m3, 0xe4 + subps m6, m6, m2 + mulps m6, m6, [ps_val3] + + addps m4, m4, m1 + mulps m4, m4, [ps_val4] + + shufps m1, m1, m0, 0xe4 + addps m1, m1, m2 + mulps m1, m1, [ps_val5] + + mulps m3, m3, [ps_val6] + mulps m0, m0, [ps_val7] + addps m0, m0, m3 + + xorps m2, m1, [ps_p1p1m1m1] + subps m2, m2, m4 + addps m2, m2, m0 + + addps m3, m4, m0 + subps m3, m3, m6 + xorps m3, m3, [ps_p1p1m1m1] + + shufps m0, m0, m4, 0xe4 + subps m0, m0, m1 + addps m0, m0, m6 + + BUILDINVHIGHLOW m4, m2, m3 + shufps m3, m3, m2, 0x4e + + ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5} + + BUTTERF m0, m1, 0 + BUTTERF m7, m2, 16 + BUTTERF m3, m6, 32 + BUTTERF m4, m1, 48 + BUTTERF2 m5, m1, 64 + + ; permutates: + ; m0 0 1 2 3 => 2 6 10 14 m1 + ; m7 4 5 6 7 => 3 7 11 15 m2 + ; m3 8 9 10 11 => 17 13 9 5 m3 + ; m4 12 13 14 15 => 16 12 8 4 m5 + ; m5 16 17 xx xx => 0 1 xx xx m0 + + unpckhps m1, m0, m7 + unpckhps m6, m3, m4 + movhlps m2, m6, m1 + movlhps m1, m1, m6 + + unpcklps m5, m5, m4 + unpcklps m3, m3, m7 + movhlps m4, m3, m5 + movlhps m5, m5, m3 + SWAP m4, m3 + ; permutation done + + PSHUFD m6, m2, 0xb1 + movss m4, [bufq + 4*68] + movss m7, [bufq + 4*64] + unpcklps m7, m7, m4 + mulps m6, m6, [winq + 16*4] + addps m6, m6, m7 + movss [outq + 64*SBLIMIT], m6 + shufps m6, m6, m6, 0xb1 + movss [outq + 68*SBLIMIT], m6 + + mulps m6, m3, [winq + 4*4] + LOAD m4, m7, bufq + 4*16, 16 + addps m6, m6, m4 + STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT + + shufps m4, m0, m3, 0xb5 + mulps m4, m4, [winq + 8*4] + LOAD m7, m6, bufq + 4*32, 16 + addps m4, m4, m7 + STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT + + shufps m3, m3, m2, 0xb1 + mulps m3, m3, [winq + 12*4] + LOAD m7, m6, bufq + 4*48, 16 + addps m3, m3, m7 + STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT + + mulps m2, m2, [winq] + LOAD m6, m7, bufq, 16 + addps m2, m2, m6 + STORE m2, m7, outq, 4*SBLIMIT + + mulps m4, m1, [winq + 20*4] + STORE m4, m7, bufq, 16 + + mulps m3, m5, [winq + 24*4] + STORE m3, m7, bufq + 4*16, 16 + + shufps m0, m0, m5, 0xb0 + mulps m0, m0, [winq + 28*4] + STORE m0, m7, bufq + 4*32, 16 + + shufps m5, m5, m1, 0xb1 + mulps m5, m5, [winq + 32*4] + STORE m5, m7, bufq + 4*48, 16 + + shufps m1, m1, m1, 0xb1 + mulps m1, m1, [winq + 36*4] + movss [bufq + 4*64], m1 + shufps m1, m1, 0xb1 + movss [bufq + 4*68], m1 + RET +%endmacro + +INIT_XMM sse2 +DEFINE_IMDCT + +INIT_XMM sse3 +DEFINE_IMDCT + +INIT_XMM ssse3 +DEFINE_IMDCT + +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +DEFINE_IMDCT +%endif + +INIT_XMM sse + +%if ARCH_X86_64 +%define SPILL SWAP +%define UNSPILL SWAP +%define SPILLED(x) m %+ x +%else +%define SPILLED(x) [tmpq+(x-8)*16 + 32*4] +%macro SPILL 2 ; xmm#, mempos + movaps SPILLED(%2), m%1 +%endmacro +%macro UNSPILL 2 + movaps m%1, SPILLED(%2) +%endmacro +%endif + +%macro DEFINE_FOUR_IMDCT 0 +cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp + movlps m0, [inq+64] + movhps m0, [inq+64 + 72] + movlps m3, [inq+64 + 2*72] + movhps m3, [inq+64 + 3*72] + + shufps m5, m0, m3, 0xdd + shufps m0, m0, m3, 0x88 + + mova m1, [inq+48] + movu m6, [inq+48 + 72] + mova m7, [inq+48 + 2*72] + movu m3, [inq+48 + 3*72] + + TRANSPOSE4x4PS 1, 6, 7, 3, 4 + + addps m4, m6, m7 + mova [tmpq+4*28], m4 + + addps m7, m3 + addps m6, m1 + addps m3, m0 + addps m0, m5 + addps m0, m7 + addps m7, m6 + mova [tmpq+4*12], m7 + SPILL 3, 12 + + mova m4, [inq+32] + movu m5, [inq+32 + 72] + mova m2, [inq+32 + 2*72] + movu m7, [inq+32 + 3*72] + + TRANSPOSE4x4PS 4, 5, 2, 7, 3 + + addps m1, m7 + SPILL 1, 11 + + addps m3, m5, m2 + SPILL 3, 13 + + addps m7, m2 + addps m5, m4 + addps m6, m7 + mova [tmpq], m6 + addps m7, m5 + mova [tmpq+4*16], m7 + + mova m2, [inq+16] + movu m7, [inq+16 + 72] + mova m1, [inq+16 + 2*72] + movu m6, [inq+16 + 3*72] + + TRANSPOSE4x4PS 2, 7, 1, 6, 3 + + addps m4, m6 + addps m6, m1 + addps m1, m7 + addps m7, m2 + addps m5, m6 + SPILL 5, 15 + addps m6, m7 + mulps m6, [costabs + 16*2] + mova [tmpq+4*8], m6 + SPILL 1, 10 + SPILL 0, 14 + + mova m1, [inq] + movu m6, [inq + 72] + mova m3, [inq + 2*72] + movu m5, [inq + 3*72] + + TRANSPOSE4x4PS 1, 6, 3, 5, 0 + + addps m2, m5 + addps m5, m3 + addps m7, m5 + addps m3, m6 + addps m6, m1 + SPILL 7, 8 + addps m5, m6 + SPILL 6, 9 + addps m6, m4, SPILLED(12) + subps m6, m2 + UNSPILL 7, 11 + SPILL 5, 11 + subps m5, m1, m7 + mulps m7, [costabs + 16*5] + addps m7, m1 + mulps m0, m6, [costabs + 16*6] + addps m0, m5 + mova [tmpq+4*24], m0 + addps m6, m5 + mova [tmpq+4*4], m6 + addps m6, m4, m2 + mulps m6, [costabs + 16*1] + subps m4, SPILLED(12) + mulps m4, [costabs + 16*8] + addps m2, SPILLED(12) + mulps m2, [costabs + 16*3] + subps m5, m7, m6 + subps m5, m2 + addps m6, m7 + addps m6, m4 + addps m7, m2 + subps m7, m4 + mova [tmpq+4*20], m7 + mova m2, [tmpq+4*28] + mova [tmpq+4*28], m5 + UNSPILL 7, 13 + subps m5, m7, m2 + mulps m5, [costabs + 16*7] + UNSPILL 1, 10 + mulps m1, [costabs + 16*2] + addps m4, m3, m2 + mulps m4, [costabs + 16*4] + addps m2, m7 + addps m7, m3 + mulps m7, [costabs] + subps m3, m2 + mulps m3, [costabs + 16*2] + addps m2, m7, m5 + addps m2, m1 + SPILL 2, 10 + addps m7, m4 + subps m7, m1 + SPILL 7, 12 + subps m5, m4 + subps m5, m1 + UNSPILL 0, 14 + SPILL 5, 13 + addps m1, m0, SPILLED(15) + subps m1, SPILLED(8) + mova m4, [costabs + 16*5] + mulps m4, [tmpq] + UNSPILL 2, 9 + addps m4, m2 + subps m2, [tmpq] + mulps m5, m1, [costabs + 16*6] + addps m5, m2 + SPILL 5, 9 + addps m2, m1 + SPILL 2, 14 + UNSPILL 5, 15 + subps m7, m5, m0 + addps m5, SPILLED(8) + mulps m5, [costabs + 16*1] + mulps m7, [costabs + 16*8] + addps m0, SPILLED(8) + mulps m0, [costabs + 16*3] + subps m2, m4, m5 + subps m2, m0 + SPILL 2, 15 + addps m5, m4 + addps m5, m7 + addps m4, m0 + subps m4, m7 + SPILL 4, 8 + mova m7, [tmpq+4*16] + mova m2, [tmpq+4*12] + addps m0, m7, m2 + subps m0, SPILLED(11) + mulps m0, [costabs + 16*2] + addps m4, m7, SPILLED(11) + mulps m4, [costabs] + subps m7, m2 + mulps m7, [costabs + 16*7] + addps m2, SPILLED(11) + mulps m2, [costabs + 16*4] + addps m1, m7, [tmpq+4*8] + addps m1, m4 + addps m4, m2 + subps m4, [tmpq+4*8] + SPILL 4, 11 + subps m7, m2 + subps m7, [tmpq+4*8] + addps m4, m6, SPILLED(10) + subps m6, SPILLED(10) + addps m2, m5, m1 + mulps m2, [costabs + 16*9] + subps m5, m1 + mulps m5, [costabs + 16*17] + subps m1, m4, m2 + addps m4, m2 + mulps m2, m1, [winq+4*36] + addps m2, [bufq+4*36] + mova [outq+1152], m2 + mulps m1, [winq+4*32] + addps m1, [bufq+4*32] + mova [outq+1024], m1 + mulps m1, m4, [winq+4*116] + mova [bufq+4*36], m1 + mulps m4, [winq+4*112] + mova [bufq+4*32], m4 + addps m2, m6, m5 + subps m6, m5 + mulps m1, m6, [winq+4*68] + addps m1, [bufq+4*68] + mova [outq+2176], m1 + mulps m6, [winq] + addps m6, [bufq] + mova [outq], m6 + mulps m1, m2, [winq+4*148] + mova [bufq+4*68], m1 + mulps m2, [winq+4*80] + mova [bufq], m2 + addps m5, m3, [tmpq+4*24] + mova m2, [tmpq+4*24] + subps m2, m3 + mova m1, SPILLED(9) + subps m1, m0 + mulps m1, [costabs + 16*10] + addps m0, SPILLED(9) + mulps m0, [costabs + 16*16] + addps m6, m5, m1 + subps m5, m1 + mulps m3, m5, [winq+4*40] + addps m3, [bufq+4*40] + mova [outq+1280], m3 + mulps m5, [winq+4*28] + addps m5, [bufq+4*28] + mova [outq+896], m5 + mulps m1, m6, [winq+4*120] + mova [bufq+4*40], m1 + mulps m6, [winq+4*108] + mova [bufq+4*28], m6 + addps m1, m2, m0 + subps m2, m0 + mulps m5, m2, [winq+4*64] + addps m5, [bufq+4*64] + mova [outq+2048], m5 + mulps m2, [winq+4*4] + addps m2, [bufq+4*4] + mova [outq+128], m2 + mulps m0, m1, [winq+4*144] + mova [bufq+4*64], m0 + mulps m1, [winq+4*84] + mova [bufq+4*4], m1 + mova m1, [tmpq+4*28] + mova m5, m1 + addps m1, SPILLED(13) + subps m5, SPILLED(13) + UNSPILL 3, 15 + addps m2, m7, m3 + mulps m2, [costabs + 16*11] + subps m3, m7 + mulps m3, [costabs + 16*15] + addps m0, m2, m1 + subps m1, m2 + SWAP m0, m2 + mulps m6, m1, [winq+4*44] + addps m6, [bufq+4*44] + mova [outq+1408], m6 + mulps m1, [winq+4*24] + addps m1, [bufq+4*24] + mova [outq+768], m1 + mulps m0, m2, [winq+4*124] + mova [bufq+4*44], m0 + mulps m2, [winq+4*104] + mova [bufq+4*24], m2 + addps m0, m5, m3 + subps m5, m3 + mulps m1, m5, [winq+4*60] + addps m1, [bufq+4*60] + mova [outq+1920], m1 + mulps m5, [winq+4*8] + addps m5, [bufq+4*8] + mova [outq+256], m5 + mulps m1, m0, [winq+4*140] + mova [bufq+4*60], m1 + mulps m0, [winq+4*88] + mova [bufq+4*8], m0 + mova m1, [tmpq+4*20] + addps m1, SPILLED(12) + mova m2, [tmpq+4*20] + subps m2, SPILLED(12) + UNSPILL 7, 8 + subps m0, m7, SPILLED(11) + addps m7, SPILLED(11) + mulps m4, m7, [costabs + 16*12] + mulps m0, [costabs + 16*14] + addps m5, m1, m4 + subps m1, m4 + mulps m7, m1, [winq+4*48] + addps m7, [bufq+4*48] + mova [outq+1536], m7 + mulps m1, [winq+4*20] + addps m1, [bufq+4*20] + mova [outq+640], m1 + mulps m1, m5, [winq+4*128] + mova [bufq+4*48], m1 + mulps m5, [winq+4*100] + mova [bufq+4*20], m5 + addps m6, m2, m0 + subps m2, m0 + mulps m1, m2, [winq+4*56] + addps m1, [bufq+4*56] + mova [outq+1792], m1 + mulps m2, [winq+4*12] + addps m2, [bufq+4*12] + mova [outq+384], m2 + mulps m0, m6, [winq+4*136] + mova [bufq+4*56], m0 + mulps m6, [winq+4*92] + mova [bufq+4*12], m6 + UNSPILL 0, 14 + mulps m0, [costabs + 16*13] + mova m3, [tmpq+4*4] + addps m2, m0, m3 + subps m3, m0 + mulps m0, m3, [winq+4*52] + addps m0, [bufq+4*52] + mova [outq+1664], m0 + mulps m3, [winq+4*16] + addps m3, [bufq+4*16] + mova [outq+512], m3 + mulps m0, m2, [winq+4*132] + mova [bufq+4*52], m0 + mulps m2, [winq+4*96] + mova [bufq+4*16], m2 + RET +%endmacro + +INIT_XMM sse +DEFINE_FOUR_IMDCT + +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +DEFINE_FOUR_IMDCT +%endif diff --git a/media/ffvpx/libavcodec/x86/mathops.h b/media/ffvpx/libavcodec/x86/mathops.h new file mode 100644 index 0000000000..ca7e2dffc1 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/mathops.h @@ -0,0 +1,153 @@ +/* + * simple math operations + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_MATHOPS_H +#define AVCODEC_X86_MATHOPS_H + +#include "config.h" + +#include "libavutil/common.h" +#include "libavutil/x86/asm.h" + +#if HAVE_INLINE_ASM + +#if ARCH_X86_32 + +#define MULL MULL +static av_always_inline av_const int MULL(int a, int b, unsigned shift) +{ + int rt, dummy; + if (__builtin_constant_p(shift)) + __asm__ ( + "imull %3 \n\t" + "shrdl %4, %%edx, %%eax \n\t" + :"=a"(rt), "=d"(dummy) + :"a"(a), "rm"(b), "i"(shift & 0x1F) + ); + else + __asm__ ( + "imull %3 \n\t" + "shrdl %4, %%edx, %%eax \n\t" + :"=a"(rt), "=d"(dummy) + :"a"(a), "rm"(b), "c"((uint8_t)shift) + ); + return rt; +} + +#define MULH MULH +static av_always_inline av_const int MULH(int a, int b) +{ + int rt, dummy; + __asm__ ( + "imull %3" + :"=d"(rt), "=a"(dummy) + :"a"(a), "rm"(b) + ); + return rt; +} + +#define MUL64 MUL64 +static av_always_inline av_const int64_t MUL64(int a, int b) +{ + int64_t rt; + __asm__ ( + "imull %2" + :"=A"(rt) + :"a"(a), "rm"(b) + ); + return rt; +} + +#endif /* ARCH_X86_32 */ + +#if HAVE_I686 +/* median of 3 */ +#define mid_pred mid_pred +static inline av_const int mid_pred(int a, int b, int c) +{ + int i=b; + __asm__ ( + "cmp %2, %1 \n\t" + "cmovg %1, %0 \n\t" + "cmovg %2, %1 \n\t" + "cmp %3, %1 \n\t" + "cmovl %3, %1 \n\t" + "cmp %1, %0 \n\t" + "cmovg %1, %0 \n\t" + :"+&r"(i), "+&r"(a) + :"r"(b), "r"(c) + ); + return i; +} + +#if HAVE_6REGS +#define COPY3_IF_LT(x, y, a, b, c, d)\ +__asm__ volatile(\ + "cmpl %0, %3 \n\t"\ + "cmovl %3, %0 \n\t"\ + "cmovl %4, %1 \n\t"\ + "cmovl %5, %2 \n\t"\ + : "+&r" (x), "+&r" (a), "+r" (c)\ + : "r" (y), "r" (b), "r" (d)\ +); +#endif /* HAVE_6REGS */ + +#endif /* HAVE_I686 */ + +#define MASK_ABS(mask, level) \ + __asm__ ("cdq \n\t" \ + "xorl %1, %0 \n\t" \ + "subl %1, %0 \n\t" \ + : "+a"(level), "=&d"(mask)) + +// avoid +32 for shift optimization (gcc should do that ...) +#define NEG_SSR32 NEG_SSR32 +static inline int32_t NEG_SSR32( int32_t a, int8_t s){ + if (__builtin_constant_p(s)) + __asm__ ("sarl %1, %0\n\t" + : "+r" (a) + : "i" (-s & 0x1F) + ); + else + __asm__ ("sarl %1, %0\n\t" + : "+r" (a) + : "c" ((uint8_t)(-s)) + ); + return a; +} + +#define NEG_USR32 NEG_USR32 +static inline uint32_t NEG_USR32(uint32_t a, int8_t s){ + if (__builtin_constant_p(s)) + __asm__ ("shrl %1, %0\n\t" + : "+r" (a) + : "i" (-s & 0x1F) + ); + else + __asm__ ("shrl %1, %0\n\t" + : "+r" (a) + : "c" ((uint8_t)(-s)) + ); + return a; +} + +#endif /* HAVE_INLINE_ASM */ +#endif /* AVCODEC_X86_MATHOPS_H */ diff --git a/media/ffvpx/libavcodec/x86/moz.build b/media/ffvpx/libavcodec/x86/moz.build new file mode 100644 index 0000000000..693218099a --- /dev/null +++ b/media/ffvpx/libavcodec/x86/moz.build @@ -0,0 +1,55 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +SOURCES += [ + 'constants.c', + 'dct32.asm', + 'dct_init.c', + 'fdct.c', + 'fdctdsp_init.c', + 'flacdsp.asm', + 'flacdsp_init.c', + 'h264_intrapred.asm', + 'h264_intrapred_10bit.asm', + 'h264_intrapred_init.c', + 'idctdsp.asm', + 'idctdsp_init.c', + 'imdct36.asm', + 'mpegaudiodsp.c', + 'videodsp.asm', + 'videodsp_init.c', + 'vp8dsp.asm', + 'vp8dsp_init.c', + 'vp8dsp_loopfilter.asm', + 'vp9dsp_init.c', + 'vp9dsp_init_10bpp.c', + 'vp9dsp_init_12bpp.c', + 'vp9dsp_init_16bpp.c', + 'vp9intrapred.asm', + 'vp9intrapred_16bpp.asm', + 'vp9itxfm.asm', + 'vp9itxfm_16bpp.asm', + 'vp9lpf.asm', + 'vp9lpf_16bpp.asm', + 'vp9mc.asm', + 'vp9mc_16bpp.asm', +] + +if CONFIG['CPU_ARCH'] == 'x86': + SOURCES += [ 'simple_idct.asm' ] + +if CONFIG['CPU_ARCH'] == 'x86_64': + SOURCES += [ 'simple_idct10.asm' ] + +if CONFIG['MOZ_LIBAV_FFT']: + SOURCES += [ + 'fft.asm', + 'fft_init.c', + ] + +FINAL_LIBRARY = 'mozavcodec' + +include('/media/ffvpx/ffvpxcommon.mozbuild') diff --git a/media/ffvpx/libavcodec/x86/mpegaudiodsp.c b/media/ffvpx/libavcodec/x86/mpegaudiodsp.c new file mode 100644 index 0000000000..6586fe0726 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/mpegaudiodsp.c @@ -0,0 +1,284 @@ +/* + * SIMD-optimized MP3 decoding functions + * Copyright (c) 2010 Vitor Sessak + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stddef.h> + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem_internal.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/mpegaudiodsp.h" + +#define DECL(CPU)\ +static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\ +void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win); + +#if HAVE_X86ASM +DECL(sse2) +DECL(sse3) +DECL(ssse3) +DECL(avx) +#endif /* HAVE_X86ASM */ + +void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, + float *tmpbuf); +void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, + float *tmpbuf); + +DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; + +#if HAVE_6REGS && HAVE_SSE_INLINE + +#define MACS(rt, ra, rb) rt+=(ra)*(rb) +#define MLSS(rt, ra, rb) rt-=(ra)*(rb) + +#define SUM8(op, sum, w, p) \ +{ \ + op(sum, (w)[0 * 64], (p)[0 * 64]); \ + op(sum, (w)[1 * 64], (p)[1 * 64]); \ + op(sum, (w)[2 * 64], (p)[2 * 64]); \ + op(sum, (w)[3 * 64], (p)[3 * 64]); \ + op(sum, (w)[4 * 64], (p)[4 * 64]); \ + op(sum, (w)[5 * 64], (p)[5 * 64]); \ + op(sum, (w)[6 * 64], (p)[6 * 64]); \ + op(sum, (w)[7 * 64], (p)[7 * 64]); \ +} + +static void apply_window(const float *buf, const float *win1, + const float *win2, float *sum1, float *sum2, int len) +{ + x86_reg count = - 4*len; + const float *win1a = win1+len; + const float *win2a = win2+len; + const float *bufa = buf+len; + float *sum1a = sum1+len; + float *sum2a = sum2+len; + + +#define MULT(a, b) \ + "movaps " #a "(%1,%0), %%xmm1 \n\t" \ + "movaps " #a "(%3,%0), %%xmm2 \n\t" \ + "mulps %%xmm2, %%xmm1 \n\t" \ + "subps %%xmm1, %%xmm0 \n\t" \ + "mulps " #b "(%2,%0), %%xmm2 \n\t" \ + "subps %%xmm2, %%xmm4 \n\t" \ + + __asm__ volatile( + "1: \n\t" + "xorps %%xmm0, %%xmm0 \n\t" + "xorps %%xmm4, %%xmm4 \n\t" + + MULT( 0, 0) + MULT( 256, 64) + MULT( 512, 128) + MULT( 768, 192) + MULT(1024, 256) + MULT(1280, 320) + MULT(1536, 384) + MULT(1792, 448) + + "movaps %%xmm0, (%4,%0) \n\t" + "movaps %%xmm4, (%5,%0) \n\t" + "add $16, %0 \n\t" + "jl 1b \n\t" + :"+&r"(count) + :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a) + ); + +#undef MULT +} + +static void apply_window_mp3(float *in, float *win, int *unused, float *out, + ptrdiff_t incr) +{ + LOCAL_ALIGNED_16(float, suma, [17]); + LOCAL_ALIGNED_16(float, sumb, [17]); + LOCAL_ALIGNED_16(float, sumc, [17]); + LOCAL_ALIGNED_16(float, sumd, [17]); + + float sum; + + /* copy to avoid wrap */ + __asm__ volatile( + "movaps 0(%0), %%xmm0 \n\t" \ + "movaps 16(%0), %%xmm1 \n\t" \ + "movaps 32(%0), %%xmm2 \n\t" \ + "movaps 48(%0), %%xmm3 \n\t" \ + "movaps %%xmm0, 0(%1) \n\t" \ + "movaps %%xmm1, 16(%1) \n\t" \ + "movaps %%xmm2, 32(%1) \n\t" \ + "movaps %%xmm3, 48(%1) \n\t" \ + "movaps 64(%0), %%xmm0 \n\t" \ + "movaps 80(%0), %%xmm1 \n\t" \ + "movaps 96(%0), %%xmm2 \n\t" \ + "movaps 112(%0), %%xmm3 \n\t" \ + "movaps %%xmm0, 64(%1) \n\t" \ + "movaps %%xmm1, 80(%1) \n\t" \ + "movaps %%xmm2, 96(%1) \n\t" \ + "movaps %%xmm3, 112(%1) \n\t" + ::"r"(in), "r"(in+512) + :"memory" + ); + + apply_window(in + 16, win , win + 512, suma, sumc, 16); + apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); + + SUM8(MACS, suma[0], win + 32, in + 48); + + sumc[ 0] = 0; + sumb[16] = 0; + sumd[16] = 0; + +#define SUMS(suma, sumb, sumc, sumd, out1, out2) \ + "movups " #sumd "(%4), %%xmm0 \n\t" \ + "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ + "subps " #suma "(%1), %%xmm0 \n\t" \ + "movaps %%xmm0," #out1 "(%0) \n\t" \ +\ + "movups " #sumc "(%3), %%xmm0 \n\t" \ + "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ + "addps " #sumb "(%2), %%xmm0 \n\t" \ + "movaps %%xmm0," #out2 "(%0) \n\t" + + if (incr == 1) { + __asm__ volatile( + SUMS( 0, 48, 4, 52, 0, 112) + SUMS(16, 32, 20, 36, 16, 96) + SUMS(32, 16, 36, 20, 32, 80) + SUMS(48, 0, 52, 4, 48, 64) + + :"+&r"(out) + :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0]) + :"memory" + ); + out += 16*incr; + } else { + int j; + float *out2 = out + 32 * incr; + out[0 ] = -suma[ 0]; + out += incr; + out2 -= incr; + for(j=1;j<16;j++) { + *out = -suma[ j] + sumd[16-j]; + *out2 = sumb[16-j] + sumc[ j]; + out += incr; + out2 -= incr; + } + } + + sum = 0; + SUM8(MLSS, sum, win + 16 + 32, in + 32); + *out = sum; +} + +#endif /* HAVE_6REGS && HAVE_SSE_INLINE */ + +#if HAVE_X86ASM +#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ +static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ + int count, int switch_point, int block_type) \ +{ \ + int align_end = count - (count & 3); \ + int j; \ + for (j = 0; j < align_end; j+= 4) { \ + LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \ + float *win = mdct_win_sse[switch_point && j < 4][block_type]; \ + /* apply window & overlap with previous buffer */ \ + \ + /* select window */ \ + ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \ + in += 4*18; \ + buf += 4*18; \ + out += 4; \ + } \ + for (; j < count; j++) { \ + /* apply window & overlap with previous buffer */ \ + \ + /* select window */ \ + int win_idx = (switch_point && j < 2) ? 0 : block_type; \ + float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \ + \ + ff_imdct36_float_ ## CPU1(out, buf, in, win); \ + \ + in += 18; \ + buf++; \ + out++; \ + } \ +} + +#if HAVE_SSE +DECL_IMDCT_BLOCKS(sse2,sse) +DECL_IMDCT_BLOCKS(sse3,sse) +DECL_IMDCT_BLOCKS(ssse3,sse) +#endif +#if HAVE_AVX_EXTERNAL +DECL_IMDCT_BLOCKS(avx,avx) +#endif +#endif /* HAVE_X86ASM */ + +av_cold void ff_mpadsp_init_x86_tabs(void) +{ + int i, j; + for (j = 0; j < 4; j++) { + for (i = 0; i < 40; i ++) { + mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i]; + mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i]; + mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i]; + mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; + mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i]; + mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i]; + mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i]; + mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; + } + } +} + +av_cold void ff_mpadsp_init_x86(MPADSPContext *s) +{ + av_unused int cpu_flags = av_get_cpu_flags(); + +#if HAVE_6REGS && HAVE_SSE_INLINE + if (INLINE_SSE(cpu_flags)) { + s->apply_window_float = apply_window_mp3; + } +#endif /* HAVE_SSE_INLINE */ + +#if HAVE_X86ASM +#if HAVE_SSE + if (EXTERNAL_SSE2(cpu_flags)) { + s->imdct36_blocks_float = imdct36_blocks_sse2; + } + if (EXTERNAL_SSE3(cpu_flags)) { + s->imdct36_blocks_float = imdct36_blocks_sse3; + } + if (EXTERNAL_SSSE3(cpu_flags)) { + s->imdct36_blocks_float = imdct36_blocks_ssse3; + } +#endif +#if HAVE_AVX_EXTERNAL + if (EXTERNAL_AVX(cpu_flags)) { + s->imdct36_blocks_float = imdct36_blocks_avx; + } +#endif +#endif /* HAVE_X86ASM */ +} diff --git a/media/ffvpx/libavcodec/x86/simple_idct.asm b/media/ffvpx/libavcodec/x86/simple_idct.asm new file mode 100644 index 0000000000..982b2f0bbb --- /dev/null +++ b/media/ffvpx/libavcodec/x86/simple_idct.asm @@ -0,0 +1,871 @@ +; +; Simple IDCT MMX +; +; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> +; +; Conversion from gcc syntax to x264asm syntax with minimal modifications +; by James Darnley <jdarnley@obe.tv>. +; +; This file is part of FFmpeg. +; +; FFmpeg is free software; you can redistribute it and/or +; modify it under the terms of the GNU Lesser General Public +; License as published by the Free Software Foundation; either +; version 2.1 of the License, or (at your option) any later version. +; +; FFmpeg is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; Lesser General Public License for more details. +; +; You should have received a copy of the GNU Lesser General Public +; License along with FFmpeg; if not, write to the Free Software +; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;/ + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +%if ARCH_X86_32 +cextern pb_80 + +wm1010: dw 0, 0xffff, 0, 0xffff +d40000: dd 4 << 16, 0 + +; 23170.475006 +; 22725.260826 +; 21406.727617 +; 19265.545870 +; 16384.000000 +; 12872.826198 +; 8866.956905 +; 4520.335430 + +%define C0 23170 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +%define C1 22725 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +%define C2 21407 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +%define C3 19266 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +%define C4 16383 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 +%define C5 12873 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +%define C6 8867 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +%define C7 4520 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + +%define ROW_SHIFT 11 +%define COL_SHIFT 20 ; 6 + +coeffs: + dw 1 << (ROW_SHIFT - 1), 0 + dw 1 << (ROW_SHIFT - 1), 0 + dw 1 << (ROW_SHIFT - 1), 1 + dw 1 << (ROW_SHIFT - 1), 0 + + dw C4, C4, C4, C4 + dw C4, -C4, C4, -C4 + + dw C2, C6, C2, C6 + dw C6, -C2, C6, -C2 + + dw C1, C3, C1, C3 + dw C5, C7, C5, C7 + + dw C3, -C7, C3, -C7 + dw -C1, -C5, -C1, -C5 + + dw C5, -C1, C5, -C1 + dw C7, C3, C7, C3 + + dw C7, -C5, C7, -C5 + dw C3, -C1, C3, -C1 + +SECTION .text + +%macro DC_COND_IDCT 7 + movq mm0, [blockq + %1] ; R4 R0 r4 r0 + movq mm1, [blockq + %2] ; R6 R2 r6 r2 + movq mm2, [blockq + %3] ; R3 R1 r3 r1 + movq mm3, [blockq + %4] ; R7 R5 r7 r5 + movq mm4, [wm1010] + pand mm4, mm0 + por mm4, mm1 + por mm4, mm2 + por mm4, mm3 + packssdw mm4, mm4 + movd t0d, mm4 + or t0d, t0d + jz %%1 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm5, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2 + movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6 + pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2 + movq mm7, [coeffs + 48] ; C3 C1 C3 C1 + pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1 + paddd mm4, [coeffs + 8] + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + paddd mm4, mm5 ; A0 a0 + psubd mm6, mm5 ; A3 a3 + movq mm5, [coeffs + 56] ; C7 C5 C7 C5 + pmaddwd mm5, mm3 ; C7R7+C5R5 C7r7+C5r5 + paddd mm0, [coeffs + 8] + paddd mm1, mm0 ; A1 a1 + paddd mm0, mm0 + psubd mm0, mm1 ; A2 a2 + pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1 + paddd mm7, mm5 ; B0 b0 + movq mm5, [coeffs + 72] ; -C5 -C1 -C5 -C1 + pmaddwd mm5, mm3 ; -C5R7-C1R5 -C5r7-C1r5 + paddd mm7, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm7 ; A0-B0 a0-b0 + paddd mm5, mm2 ; B1 b1 + psrad mm7, %7 + psrad mm4, %7 + movq mm2, mm1 ; A1 a1 + paddd mm1, mm5 ; A1+B1 a1+b1 + psubd mm2, mm5 ; A1-B1 a1-b1 + psrad mm1, %7 + psrad mm2, %7 + packssdw mm7, mm1 ; A1+B1 a1+b1 A0+B0 a0+b0 + packssdw mm2, mm4 ; A0-B0 a0-b0 A1-B1 a1-b1 + movq [%5], mm7 + movq mm1, [blockq + %3] ; R3 R1 r3 r1 + movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5 + movq [24 + %5], mm2 + pmaddwd mm4, mm1 ; -C1R3+C5R1 -C1r3+C5r1 + movq mm7, [coeffs + 88] ; C3 C7 C3 C7 + pmaddwd mm1, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1 + pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5 + movq mm2, mm0 ; A2 a2 + pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5 + paddd mm4, mm7 ; B2 b2 + paddd mm2, mm4 ; A2+B2 a2+b2 + psubd mm0, mm4 ; a2-B2 a2-b2 + psrad mm2, %7 + psrad mm0, %7 + movq mm4, mm6 ; A3 a3 + paddd mm3, mm1 ; B3 b3 + paddd mm6, mm3 ; A3+B3 a3+b3 + psubd mm4, mm3 ; a3-B3 a3-b3 + psrad mm6, %7 + packssdw mm2, mm6 ; A3+B3 a3+b3 A2+B2 a2+b2 + movq [8 + %5], mm2 + psrad mm4, %7 + packssdw mm4, mm0 ; A2-B2 a2-b2 A3-B3 a3-b3 + movq [16 + %5], mm4 + jmp %%2 +%%1: + pslld mm0, 16 + paddd mm0, [d40000] + psrad mm0, 13 + packssdw mm0, mm0 + movq [%5], mm0 + movq [8 + %5], mm0 + movq [16 + %5], mm0 + movq [24 + %5], mm0 +%%2: +%endmacro + +%macro Z_COND_IDCT 8 + movq mm0, [blockq + %1] ; R4 R0 r4 r0 + movq mm1, [blockq + %2] ; R6 R2 r6 r2 + movq mm2, [blockq + %3] ; R3 R1 r3 r1 + movq mm3, [blockq + %4] ; R7 R5 r7 r5 + movq mm4, mm0 + por mm4, mm1 + por mm4, mm2 + por mm4, mm3 + packssdw mm4, mm4 + movd t0d, mm4 + or t0d, t0d + jz %8 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm5, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2 + movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6 + pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2 + movq mm7, [coeffs + 48] ; C3 C1 C3 C1 + pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1 + paddd mm4, [coeffs] + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + paddd mm4, mm5 ; A0 a0 + psubd mm6, mm5 ; A3 a3 + movq mm5, [coeffs + 56] ; C7 C5 C7 C5 + pmaddwd mm5, mm3 ; C7R7+C5R5 C7r7+C5r5 + paddd mm0, [coeffs] + paddd mm1, mm0 ; A1 a1 + paddd mm0, mm0 + psubd mm0, mm1 ; A2 a2 + pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1 + paddd mm7, mm5 ; B0 b0 + movq mm5, [coeffs + 72] ; -C5 -C1 -C5 -C1 + pmaddwd mm5, mm3 ; -C5R7-C1R5 -C5r7-C1r5 + paddd mm7, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm7 ; A0-B0 a0-b0 + paddd mm5, mm2 ; B1 b1 + psrad mm7, %7 + psrad mm4, %7 + movq mm2, mm1 ; A1 a1 + paddd mm1, mm5 ; A1+B1 a1+b1 + psubd mm2, mm5 ; A1-B1 a1-b1 + psrad mm1, %7 + psrad mm2, %7 + packssdw mm7, mm1 ; A1+B1 a1+b1 A0+B0 a0+b0 + packssdw mm2, mm4 ; A0-B0 a0-b0 A1-B1 a1-b1 + movq [%5], mm7 + movq mm1, [blockq + %3] ; R3 R1 r3 r1 + movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5 + movq [24 + %5], mm2 + pmaddwd mm4, mm1 ; -C1R3+C5R1 -C1r3+C5r1 + movq mm7, [coeffs + 88] ; C3 C7 C3 C7 + pmaddwd mm1, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1 + pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5 + movq mm2, mm0 ; A2 a2 + pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5 + paddd mm4, mm7 ; B2 b2 + paddd mm2, mm4 ; A2+B2 a2+b2 + psubd mm0, mm4 ; a2-B2 a2-b2 + psrad mm2, %7 + psrad mm0, %7 + movq mm4, mm6 ; A3 a3 + paddd mm3, mm1 ; B3 b3 + paddd mm6, mm3 ; A3+B3 a3+b3 + psubd mm4, mm3 ; a3-B3 a3-b3 + psrad mm6, %7 + packssdw mm2, mm6 ; A3+B3 a3+b3 A2+B2 a2+b2 + movq [8 + %5], mm2 + psrad mm4, %7 + packssdw mm4, mm0 ; A2-B2 a2-b2 A3-B3 a3-b3 + movq [16 + %5], mm4 +%endmacro + +%macro IDCT1 6 + movq mm0, %1 ; R4 R0 r4 r0 + movq mm1, %2 ; R6 R2 r6 r2 + movq mm2, %3 ; R3 R1 r3 r1 + movq mm3, %4 ; R7 R5 r7 r5 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm5, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2 + movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6 + pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + movq mm7, [coeffs + 48] ; C3 C1 C3 C1 + pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1 + paddd mm4, mm5 ; A0 a0 + psubd mm6, mm5 ; A3 a3 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + paddd mm0, mm1 ; A1 a1 + psubd mm5, mm1 ; A2 a2 + movq mm1, [coeffs + 56] ; C7 C5 C7 C5 + pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5 + pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1 + paddd mm7, mm1 ; B0 b0 + movq mm1, [coeffs + 72] ; -C5 -C1 -C5 -C1 + pmaddwd mm1, mm3 ; -C5R7-C1R5 -C5r7-C1r5 + paddd mm7, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm7 ; A0-B0 a0-b0 + paddd mm1, mm2 ; B1 b1 + psrad mm7, %6 + psrad mm4, %6 + movq mm2, mm0 ; A1 a1 + paddd mm0, mm1 ; A1+B1 a1+b1 + psubd mm2, mm1 ; A1-B1 a1-b1 + psrad mm0, %6 + psrad mm2, %6 + packssdw mm7, mm7 ; A0+B0 a0+b0 + movd [%5], mm7 + packssdw mm0, mm0 ; A1+B1 a1+b1 + movd [16 + %5], mm0 + packssdw mm2, mm2 ; A1-B1 a1-b1 + movd [96 + %5], mm2 + packssdw mm4, mm4 ; A0-B0 a0-b0 + movd [112 + %5], mm4 + movq mm0, %3 ; R3 R1 r3 r1 + movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5 + pmaddwd mm4, mm0 ; -C1R3+C5R1 -C1r3+C5r1 + movq mm7, [coeffs + 88] ; C3 C7 C3 C7 + pmaddwd mm0, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1 + pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5 + movq mm2, mm5 ; A2 a2 + pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5 + paddd mm4, mm7 ; B2 b2 + paddd mm2, mm4 ; A2+B2 a2+b2 + psubd mm5, mm4 ; a2-B2 a2-b2 + psrad mm2, %6 + psrad mm5, %6 + movq mm4, mm6 ; A3 a3 + paddd mm3, mm0 ; B3 b3 + paddd mm6, mm3 ; A3+B3 a3+b3 + psubd mm4, mm3 ; a3-B3 a3-b3 + psrad mm6, %6 + psrad mm4, %6 + packssdw mm2, mm2 ; A2+B2 a2+b2 + packssdw mm6, mm6 ; A3+B3 a3+b3 + movd [32 + %5], mm2 + packssdw mm4, mm4 ; A3-B3 a3-b3 + packssdw mm5, mm5 ; A2-B2 a2-b2 + movd [48 + %5], mm6 + movd [64 + %5], mm4 + movd [80 + %5], mm5 +%endmacro + +%macro IDCT2 6 + movq mm0, %1 ; R4 R0 r4 r0 + movq mm1, %2 ; R6 R2 r6 r2 + movq mm3, %4 ; R7 R5 r7 r5 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm5, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2 + movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6 + pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + paddd mm4, mm5 ; A0 a0 + psubd mm6, mm5 ; A3 a3 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + paddd mm0, mm1 ; A1 a1 + psubd mm5, mm1 ; A2 a2 + movq mm1, [coeffs + 56] ; C7 C5 C7 C5 + pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5 + movq mm7, [coeffs + 72] ; -C5 -C1 -C5 -C1 + pmaddwd mm7, mm3 ; -C5R7-C1R5 -C5r7-C1r5 + paddd mm1, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm1 ; A0-B0 a0-b0 + psrad mm1, %6 + psrad mm4, %6 + movq mm2, mm0 ; A1 a1 + paddd mm0, mm7 ; A1+B1 a1+b1 + psubd mm2, mm7 ; A1-B1 a1-b1 + psrad mm0, %6 + psrad mm2, %6 + packssdw mm1, mm1 ; A0+B0 a0+b0 + movd [%5], mm1 + packssdw mm0, mm0 ; A1+B1 a1+b1 + movd [16 + %5], mm0 + packssdw mm2, mm2 ; A1-B1 a1-b1 + movd [96 + %5], mm2 + packssdw mm4, mm4 ; A0-B0 a0-b0 + movd [112 + %5], mm4 + movq mm1, [coeffs + 88] ; C3 C7 C3 C7 + pmaddwd mm1, mm3 ; C3R7+C7R5 C3r7+C7r5 + movq mm2, mm5 ; A2 a2 + pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5 + paddd mm2, mm1 ; A2+B2 a2+b2 + psubd mm5, mm1 ; a2-B2 a2-b2 + psrad mm2, %6 + psrad mm5, %6 + movq mm1, mm6 ; A3 a3 + paddd mm6, mm3 ; A3+B3 a3+b3 + psubd mm1, mm3 ; a3-B3 a3-b3 + psrad mm6, %6 + psrad mm1, %6 + packssdw mm2, mm2 ; A2+B2 a2+b2 + packssdw mm6, mm6 ; A3+B3 a3+b3 + movd [32 + %5], mm2 + packssdw mm1, mm1 ; A3-B3 a3-b3 + packssdw mm5, mm5 ; A2-B2 a2-b2 + movd [48 + %5], mm6 + movd [64 + %5], mm1 + movd [80 + %5], mm5 +%endmacro + +%macro IDCT3 6 + movq mm0, %1 ; R4 R0 r4 r0 + movq mm3, %4 ; R7 R5 r7 r5 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm1, [coeffs + 56] ; C7 C5 C7 C5 + pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5 + movq mm7, [coeffs + 72] ; -C5 -C1 -C5 -C1 + pmaddwd mm7, mm3 ; -C5R7-C1R5 -C5r7-C1r5 + paddd mm1, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm1 ; A0-B0 a0-b0 + psrad mm1, %6 + psrad mm4, %6 + movq mm2, mm0 ; A1 a1 + paddd mm0, mm7 ; A1+B1 a1+b1 + psubd mm2, mm7 ; A1-B1 a1-b1 + psrad mm0, %6 + psrad mm2, %6 + packssdw mm1, mm1 ; A0+B0 a0+b0 + movd [%5], mm1 + packssdw mm0, mm0 ; A1+B1 a1+b1 + movd [16 + %5], mm0 + packssdw mm2, mm2 ; A1-B1 a1-b1 + movd [96 + %5], mm2 + packssdw mm4, mm4 ; A0-B0 a0-b0 + movd [112 + %5], mm4 + movq mm1, [coeffs + 88] ; C3 C7 C3 C7 + pmaddwd mm1, mm3 ; C3R7+C7R5 C3r7+C7r5 + movq mm2, mm5 ; A2 a2 + pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5 + paddd mm2, mm1 ; A2+B2 a2+b2 + psubd mm5, mm1 ; a2-B2 a2-b2 + psrad mm2, %6 + psrad mm5, %6 + movq mm1, mm6 ; A3 a3 + paddd mm6, mm3 ; A3+B3 a3+b3 + psubd mm1, mm3 ; a3-B3 a3-b3 + psrad mm6, %6 + psrad mm1, %6 + packssdw mm2, mm2 ; A2+B2 a2+b2 + packssdw mm6, mm6 ; A3+B3 a3+b3 + movd [32 + %5], mm2 + packssdw mm1, mm1 ; A3-B3 a3-b3 + packssdw mm5, mm5 ; A2-B2 a2-b2 + movd [48 + %5], mm6 + movd [64 + %5], mm1 + movd [80 + %5], mm5 +%endmacro + +%macro IDCT4 6 + movq mm0, %1 ; R4 R0 r4 r0 + movq mm2, %3 ; R3 R1 r3 r1 + movq mm3, %4 ; R7 R5 r7 r5 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + movq mm7, [coeffs + 48] ; C3 C1 C3 C1 + pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm1, [coeffs + 56] ; C7 C5 C7 C5 + pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5 + pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1 + paddd mm7, mm1 ; B0 b0 + movq mm1, [coeffs + 72] ; -C5 -C1 -C5 -C1 + pmaddwd mm1, mm3 ; -C5R7-C1R5 -C5r7-C1r5 + paddd mm7, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm7 ; A0-B0 a0-b0 + paddd mm1, mm2 ; B1 b1 + psrad mm7, %6 + psrad mm4, %6 + movq mm2, mm0 ; A1 a1 + paddd mm0, mm1 ; A1+B1 a1+b1 + psubd mm2, mm1 ; A1-B1 a1-b1 + psrad mm0, %6 + psrad mm2, %6 + packssdw mm7, mm7 ; A0+B0 a0+b0 + movd [%5], mm7 + packssdw mm0, mm0 ; A1+B1 a1+b1 + movd [16 + %5], mm0 + packssdw mm2, mm2 ; A1-B1 a1-b1 + movd [96 + %5], mm2 + packssdw mm4, mm4 ; A0-B0 a0-b0 + movd [112 + %5], mm4 + movq mm0, %3 ; R3 R1 r3 r1 + movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5 + pmaddwd mm4, mm0 ; -C1R3+C5R1 -C1r3+C5r1 + movq mm7, [coeffs + 88] ; C3 C7 C3 C7 + pmaddwd mm0, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1 + pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5 + movq mm2, mm5 ; A2 a2 + pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5 + paddd mm4, mm7 ; B2 b2 + paddd mm2, mm4 ; A2+B2 a2+b2 + psubd mm5, mm4 ; a2-B2 a2-b2 + psrad mm2, %6 + psrad mm5, %6 + movq mm4, mm6 ; A3 a3 + paddd mm3, mm0 ; B3 b3 + paddd mm6, mm3 ; A3+B3 a3+b3 + psubd mm4, mm3 ; a3-B3 a3-b3 + psrad mm6, %6 + psrad mm4, %6 + packssdw mm2, mm2 ; A2+B2 a2+b2 + packssdw mm6, mm6 ; A3+B3 a3+b3 + movd [32 + %5], mm2 + packssdw mm4, mm4 ; A3-B3 a3-b3 + packssdw mm5, mm5 ; A2-B2 a2-b2 + movd [48 + %5], mm6 + movd [64 + %5], mm4 + movd [80 + %5], mm5 +%endmacro + +%macro IDCT5 6 + movq mm0, %1 ; R4 R0 r4 r0 + movq mm2, %3 ; R3 R1 r3 r1 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + movq mm7, [coeffs + 48] ; C3 C1 C3 C1 + pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm3, [coeffs + 64] + pmaddwd mm3, mm2 ; -C7R3+C3R1 -C7r3+C3r1 + paddd mm7, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm7 ; A0-B0 a0-b0 + psrad mm7, %6 + psrad mm4, %6 + movq mm1, mm0 ; A1 a1 + paddd mm0, mm3 ; A1+B1 a1+b1 + psubd mm1, mm3 ; A1-B1 a1-b1 + psrad mm0, %6 + psrad mm1, %6 + packssdw mm7, mm7 ; A0+B0 a0+b0 + movd [%5], mm7 + packssdw mm0, mm0 ; A1+B1 a1+b1 + movd [16 + %5], mm0 + packssdw mm1, mm1 ; A1-B1 a1-b1 + movd [96 + %5], mm1 + packssdw mm4, mm4 ; A0-B0 a0-b0 + movd [112 + %5], mm4 + movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5 + pmaddwd mm4, mm2 ; -C1R3+C5R1 -C1r3+C5r1 + pmaddwd mm2, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1 + movq mm1, mm5 ; A2 a2 + paddd mm1, mm4 ; A2+B2 a2+b2 + psubd mm5, mm4 ; a2-B2 a2-b2 + psrad mm1, %6 + psrad mm5, %6 + movq mm4, mm6 ; A3 a3 + paddd mm6, mm2 ; A3+B3 a3+b3 + psubd mm4, mm2 ; a3-B3 a3-b3 + psrad mm6, %6 + psrad mm4, %6 + packssdw mm1, mm1 ; A2+B2 a2+b2 + packssdw mm6, mm6 ; A3+B3 a3+b3 + movd [32 + %5], mm1 + packssdw mm4, mm4 ; A3-B3 a3-b3 + packssdw mm5, mm5 ; A2-B2 a2-b2 + movd [48 + %5], mm6 + movd [64 + %5], mm4 + movd [80 + %5], mm5 +%endmacro + +%macro IDCT6 6 + movq mm0, [%1] ; R4 R0 r4 r0 + movq mm1, [%2] ; R6 R2 r6 r2 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm5, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2 + movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6 + pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + paddd mm4, mm5 ; A0 a0 + psubd mm6, mm5 ; A3 a3 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + paddd mm0, mm1 ; A1 a1 + psubd mm5, mm1 ; A2 a2 + movq mm2, [8 + %1] ; R4 R0 r4 r0 + movq mm3, [8 + %2] ; R6 R2 r6 r2 + movq mm1, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm1, mm2 ; C4R4+C4R0 C4r4+C4r0 + movq mm7, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm2, mm7 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm7, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm7, mm3 ; C6R6+C2R2 C6r6+C2r2 + pmaddwd mm3, [coeffs + 40] ; -C2R6+C6R2 -C2r6+C6r2 + paddd mm7, mm1 ; A0 a0 + paddd mm1, mm1 ; 2C0 2c0 + psubd mm1, mm7 ; A3 a3 + paddd mm3, mm2 ; A1 a1 + paddd mm2, mm2 ; 2C1 2c1 + psubd mm2, mm3 ; A2 a2 + psrad mm4, %6 + psrad mm7, %6 + psrad mm3, %6 + packssdw mm4, mm7 ; A0 a0 + movq [%5], mm4 + psrad mm0, %6 + packssdw mm0, mm3 ; A1 a1 + movq [16 + %5], mm0 + movq [96 + %5], mm0 + movq [112 + %5], mm4 + psrad mm5, %6 + psrad mm6, %6 + psrad mm2, %6 + packssdw mm5, mm2 ; A2-B2 a2-b2 + movq [32 + %5], mm5 + psrad mm1, %6 + packssdw mm6, mm1 ; A3+B3 a3+b3 + movq [48 + %5], mm6 + movq [64 + %5], mm6 + movq [80 + %5], mm5 +%endmacro + +%macro IDCT7 6 + movq mm0, %1 ; R4 R0 r4 r0 + movq mm1, %2 ; R6 R2 r6 r2 + movq mm2, %3 ; R3 R1 r3 r1 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm5, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2 + movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6 + pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + movq mm7, [coeffs + 48] ; C3 C1 C3 C1 + pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1 + paddd mm4, mm5 ; A0 a0 + psubd mm6, mm5 ; A3 a3 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + paddd mm0, mm1 ; A1 a1 + psubd mm5, mm1 ; A2 a2 + movq mm1, [coeffs + 64] + pmaddwd mm1, mm2 ; -C7R3+C3R1 -C7r3+C3r1 + paddd mm7, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm7 ; A0-B0 a0-b0 + psrad mm7, %6 + psrad mm4, %6 + movq mm3, mm0 ; A1 a1 + paddd mm0, mm1 ; A1+B1 a1+b1 + psubd mm3, mm1 ; A1-B1 a1-b1 + psrad mm0, %6 + psrad mm3, %6 + packssdw mm7, mm7 ; A0+B0 a0+b0 + movd [%5], mm7 + packssdw mm0, mm0 ; A1+B1 a1+b1 + movd [16 + %5], mm0 + packssdw mm3, mm3 ; A1-B1 a1-b1 + movd [96 + %5], mm3 + packssdw mm4, mm4 ; A0-B0 a0-b0 + movd [112 + %5], mm4 + movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5 + pmaddwd mm4, mm2 ; -C1R3+C5R1 -C1r3+C5r1 + pmaddwd mm2, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1 + movq mm3, mm5 ; A2 a2 + paddd mm3, mm4 ; A2+B2 a2+b2 + psubd mm5, mm4 ; a2-B2 a2-b2 + psrad mm3, %6 + psrad mm5, %6 + movq mm4, mm6 ; A3 a3 + paddd mm6, mm2 ; A3+B3 a3+b3 + psubd mm4, mm2 ; a3-B3 a3-b3 + psrad mm6, %6 + packssdw mm3, mm3 ; A2+B2 a2+b2 + movd [32 + %5], mm3 + psrad mm4, %6 + packssdw mm6, mm6 ; A3+B3 a3+b3 + movd [48 + %5], mm6 + packssdw mm4, mm4 ; A3-B3 a3-b3 + packssdw mm5, mm5 ; A2-B2 a2-b2 + movd [64 + %5], mm4 + movd [80 + %5], mm5 +%endmacro + +%macro IDCT8 6 + movq mm0, [%1] ; R4 R0 r4 r0 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + psrad mm4, %6 + psrad mm0, %6 + movq mm2, [8 + %1] ; R4 R0 r4 r0 + movq mm1, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm1, mm2 ; C4R4+C4R0 C4r4+C4r0 + movq mm7, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm2, mm7 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm7, [coeffs + 32] ; C6 C2 C6 C2 + psrad mm1, %6 + packssdw mm4, mm1 ; A0 a0 + movq [%5], mm4 + psrad mm2, %6 + packssdw mm0, mm2 ; A1 a1 + movq [16 + %5], mm0 + movq [96 + %5], mm0 + movq [112 + %5], mm4 + movq [32 + %5], mm0 + movq [48 + %5], mm4 + movq [64 + %5], mm4 + movq [80 + %5], mm0 +%endmacro + +%macro IDCT 0 + DC_COND_IDCT 0, 8, 16, 24, rsp + 0, null, 11 + Z_COND_IDCT 32, 40, 48, 56, rsp + 32, null, 11, %%4 + Z_COND_IDCT 64, 72, 80, 88, rsp + 64, null, 11, %%2 + Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%1 + + IDCT1 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20 + IDCT1 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20 + IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20 + IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20 + jmp %%9 + + ALIGN 16 + %%4: + Z_COND_IDCT 64, 72, 80, 88, rsp + 64, null, 11, %%6 + Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%5 + + IDCT2 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20 + IDCT2 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20 + IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20 + IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20 + jmp %%9 + + ALIGN 16 + %%6: + Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%7 + + IDCT3 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20 + IDCT3 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20 + IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20 + IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20 + jmp %%9 + + ALIGN 16 + %%2: + Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%3 + + IDCT4 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20 + IDCT4 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20 + IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20 + IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20 + jmp %%9 + + ALIGN 16 + %%3: + + IDCT5 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20 + IDCT5 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20 + IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20 + IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20 + jmp %%9 + + ALIGN 16 + %%5: + + IDCT6 rsp + 0, rsp + 64, rsp + 32, rsp + 96, blockq + 0, 20 + IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20 + jmp %%9 + + ALIGN 16 + %%1: + + IDCT7 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20 + IDCT7 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20 + IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20 + IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20 + jmp %%9 + + ALIGN 16 + %%7: + + IDCT8 rsp + 0, rsp + 64, rsp + 32, rsp + 96, blockq + 0, 20 + IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20 + + %%9: +%endmacro + +%macro PUT_PIXELS_CLAMPED_HALF 1 + mova m0, [blockq+mmsize*0+%1] + mova m1, [blockq+mmsize*2+%1] +%if mmsize == 8 + mova m2, [blockq+mmsize*4+%1] + mova m3, [blockq+mmsize*6+%1] +%endif + packuswb m0, [blockq+mmsize*1+%1] + packuswb m1, [blockq+mmsize*3+%1] +%if mmsize == 8 + packuswb m2, [blockq+mmsize*5+%1] + packuswb m3, [blockq+mmsize*7+%1] + movq [pixelsq], m0 + movq [lsizeq+pixelsq], m1 + movq [2*lsizeq+pixelsq], m2 + movq [lsize3q+pixelsq], m3 +%else + movq [pixelsq], m0 + movhps [lsizeq+pixelsq], m0 + movq [2*lsizeq+pixelsq], m1 + movhps [lsize3q+pixelsq], m1 +%endif +%endmacro + +%macro ADD_PIXELS_CLAMPED 1 + mova m0, [blockq+mmsize*0+%1] + mova m1, [blockq+mmsize*1+%1] +%if mmsize == 8 + mova m5, [blockq+mmsize*2+%1] + mova m6, [blockq+mmsize*3+%1] +%endif + movq m2, [pixelsq] + movq m3, [pixelsq+lsizeq] +%if mmsize == 8 + mova m7, m2 + punpcklbw m2, m4 + punpckhbw m7, m4 + paddsw m0, m2 + paddsw m1, m7 + mova m7, m3 + punpcklbw m3, m4 + punpckhbw m7, m4 + paddsw m5, m3 + paddsw m6, m7 +%else + punpcklbw m2, m4 + punpcklbw m3, m4 + paddsw m0, m2 + paddsw m1, m3 +%endif + packuswb m0, m1 +%if mmsize == 8 + packuswb m5, m6 + movq [pixelsq], m0 + movq [pixelsq+lsizeq], m5 +%else + movq [pixelsq], m0 + movhps [pixelsq+lsizeq], m0 +%endif +%endmacro + +INIT_MMX mmx + +cglobal simple_idct, 1, 2, 8, 128, block, t0 + IDCT +RET + +INIT_XMM sse2 + +cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0 + IDCT + lea lsize3q, [lsizeq*3] + PUT_PIXELS_CLAMPED_HALF 0 + lea pixelsq, [pixelsq+lsizeq*4] + PUT_PIXELS_CLAMPED_HALF 64 +RET + +cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0 + IDCT + pxor m4, m4 + ADD_PIXELS_CLAMPED 0 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 32 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 64 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 96 +RET +%endif diff --git a/media/ffvpx/libavcodec/x86/simple_idct.h b/media/ffvpx/libavcodec/x86/simple_idct.h new file mode 100644 index 0000000000..9b64cfe9bc --- /dev/null +++ b/media/ffvpx/libavcodec/x86/simple_idct.h @@ -0,0 +1,53 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_SIMPLE_IDCT_H +#define AVCODEC_X86_SIMPLE_IDCT_H + +#include <stddef.h> +#include <stdint.h> + +void ff_simple_idct_mmx(int16_t *block); +void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +void ff_simple_idct8_sse2(int16_t *block); +void ff_simple_idct8_avx(int16_t *block); + +void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +void ff_simple_idct10_sse2(int16_t *block); +void ff_simple_idct10_avx(int16_t *block); + +void ff_simple_idct10_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct10_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +void ff_simple_idct12_sse2(int16_t *block); +void ff_simple_idct12_avx(int16_t *block); + +void ff_simple_idct12_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct12_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +#endif /* AVCODEC_X86_SIMPLE_IDCT_H */ diff --git a/media/ffvpx/libavcodec/x86/simple_idct10.asm b/media/ffvpx/libavcodec/x86/simple_idct10.asm new file mode 100644 index 0000000000..069bb61378 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/simple_idct10.asm @@ -0,0 +1,205 @@ +;****************************************************************************** +;* x86-SIMD-optimized IDCT for prores +;* this is identical to "simple" IDCT written by Michael Niedermayer +;* except for the clip range +;* +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> +;* Copyright (c) 2015 Christophe Gisquet +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 + +SECTION_RODATA + +cextern pw_2 +cextern pw_16 +cextern pw_32 +cextern pw_1023 +cextern pw_4095 +pd_round_11: times 4 dd 1<<(11-1) +pd_round_12: times 4 dd 1<<(12-1) +pd_round_15: times 4 dd 1<<(15-1) +pd_round_19: times 4 dd 1<<(19-1) +pd_round_20: times 4 dd 1<<(20-1) + +%macro CONST_DEC 3 +const %1 +times 4 dw %2, %3 +%endmacro + +%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1 +%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1 +%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2 +%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1 +%define W3sh2_lo 19266 +%define W4sh2_lo 16383 +%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1 +%define W6sh2 8867 ; W6 = 35468 = 8867<<2 +%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1 + +CONST_DEC w4_plus_w2_hi, W4sh2, +W2sh2 +CONST_DEC w4_min_w2_hi, W4sh2, -W2sh2 +CONST_DEC w4_plus_w6_hi, W4sh2, +W6sh2 +CONST_DEC w4_min_w6_hi, W4sh2, -W6sh2 +CONST_DEC w1_plus_w3_hi, W1sh2, +W3sh2 +CONST_DEC w3_min_w1_hi, W3sh2, -W1sh2 +CONST_DEC w7_plus_w3_hi, W7sh2, +W3sh2 +CONST_DEC w3_min_w7_hi, W3sh2, -W7sh2 +CONST_DEC w1_plus_w5, W1sh2, +W5sh2 +CONST_DEC w5_min_w1, W5sh2, -W1sh2 +CONST_DEC w5_plus_w7, W5sh2, +W7sh2 +CONST_DEC w7_min_w5, W7sh2, -W5sh2 +CONST_DEC w4_plus_w2_lo, W4sh2_lo, +W2sh2 +CONST_DEC w4_min_w2_lo, W4sh2_lo, -W2sh2 +CONST_DEC w4_plus_w6_lo, W4sh2_lo, +W6sh2 +CONST_DEC w4_min_w6_lo, W4sh2_lo, -W6sh2 +CONST_DEC w1_plus_w3_lo, W1sh2, +W3sh2_lo +CONST_DEC w3_min_w1_lo, W3sh2_lo, -W1sh2 +CONST_DEC w7_plus_w3_lo, W7sh2, +W3sh2_lo +CONST_DEC w3_min_w7_lo, W3sh2_lo, -W7sh2 + +%include "libavcodec/x86/simple_idct10_template.asm" + +SECTION .text + +%macro STORE_HI_LO 12 + movq %1, %9 + movq %3, %10 + movq %5, %11 + movq %7, %12 + movhps %2, %9 + movhps %4, %10 + movhps %6, %11 + movhps %8, %12 +%endmacro + +%macro LOAD_ZXBW_8 16 + pmovzxbw %1, %9 + pmovzxbw %2, %10 + pmovzxbw %3, %11 + pmovzxbw %4, %12 + pmovzxbw %5, %13 + pmovzxbw %6, %14 + pmovzxbw %7, %15 + pmovzxbw %8, %16 +%endmacro + +%macro LOAD_ZXBW_4 9 + movh %1, %5 + movh %2, %6 + movh %3, %7 + movh %4, %8 + punpcklbw %1, %9 + punpcklbw %2, %9 + punpcklbw %3, %9 + punpcklbw %4, %9 +%endmacro + +%define PASS4ROWS(base, stride, stride3) \ + [base], [base + stride], [base + 2*stride], [base + stride3] + +%macro idct_fn 0 + +define_constants _lo + +cglobal simple_idct8, 1, 1, 16, 32, block + IDCT_FN "", 11, pw_32, 20, "store" +RET + +cglobal simple_idct8_put, 3, 4, 16, 32, pixels, lsize, block + IDCT_FN "", 11, pw_32, 20 + lea r3, [3*lsizeq] + lea r2, [pixelsq + r3] + packuswb m8, m0 + packuswb m1, m2 + packuswb m4, m11 + packuswb m9, m10 + STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9 +RET + +cglobal simple_idct8_add, 3, 4, 16, 32, pixels, lsize, block + IDCT_FN "", 11, pw_32, 20 + lea r2, [3*lsizeq] + %if cpuflag(sse4) + lea r3, [pixelsq + r2] + LOAD_ZXBW_8 m3, m5, m6, m7, m12, m13, m14, m15, PASS8ROWS(pixelsq, r3, lsizeq, r2) + paddsw m8, m3 + paddsw m0, m5 + paddsw m1, m6 + paddsw m2, m7 + paddsw m4, m12 + paddsw m11, m13 + paddsw m9, m14 + paddsw m10, m15 + %else + pxor m12, m12 + LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(pixelsq, lsizeq, r2), m12 + paddsw m8, m3 + paddsw m0, m5 + paddsw m1, m6 + paddsw m2, m7 + lea r3, [pixelsq + 4*lsizeq] + LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(r3, lsizeq, r2), m12 + paddsw m4, m3 + paddsw m11, m5 + paddsw m9, m6 + paddsw m10, m7 + lea r3, [pixelsq + r2] + %endif + packuswb m8, m0 + packuswb m1, m2 + packuswb m4, m11 + packuswb m9, m10 + STORE_HI_LO PASS8ROWS(pixelsq, r3, lsizeq, r2), m8, m1, m4, m9 +RET + +define_constants _hi + +cglobal simple_idct10, 1, 1, 16, block + IDCT_FN "", 12, "", 19, "store" + RET + +cglobal simple_idct10_put, 3, 3, 16, pixels, lsize, block + IDCT_FN "", 12, "", 19, "put", 0, pw_1023 + RET + +cglobal simple_idct12, 1, 1, 16, block + ; coeffs are already 15bits, adding the offset would cause + ; overflow in the input + IDCT_FN "", 15, pw_2, 16, "store" + RET + +cglobal simple_idct12_put, 3, 3, 16, pixels, lsize, block + ; range isn't known, so the C simple_idct range is used + ; Also, using a bias on input overflows, so use the bias + ; on output of the first butterfly instead + IDCT_FN "", 15, pw_2, 16, "put", 0, pw_4095 + RET +%endmacro + +INIT_XMM sse2 +idct_fn +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +idct_fn +%endif + +%endif diff --git a/media/ffvpx/libavcodec/x86/simple_idct10_template.asm b/media/ffvpx/libavcodec/x86/simple_idct10_template.asm new file mode 100644 index 0000000000..0d04a9818a --- /dev/null +++ b/media/ffvpx/libavcodec/x86/simple_idct10_template.asm @@ -0,0 +1,369 @@ +;****************************************************************************** +;* x86-SIMD-optimized IDCT for prores +;* this is identical to "simple" IDCT written by Michael Niedermayer +;* except for the clip range +;* +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +; add SECTION_RODATA and proper include before including this file! + +%if ARCH_X86_64 + +%macro define_constants 1 + %undef w4_plus_w2 + %undef w4_min_w2 + %undef w4_plus_w6 + %undef w4_min_w6 + %undef w1_plus_w3 + %undef w3_min_w1 + %undef w7_plus_w3 + %undef w3_min_w7 + %define w4_plus_w2 w4_plus_w2%1 + %define w4_min_w2 w4_min_w2%1 + %define w4_plus_w6 w4_plus_w6%1 + %define w4_min_w6 w4_min_w6%1 + %define w1_plus_w3 w1_plus_w3%1 + %define w3_min_w1 w3_min_w1%1 + %define w7_plus_w3 w7_plus_w3%1 + %define w3_min_w7 w3_min_w7%1 +%endmacro + +; interleave data while maintaining source +; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave +%macro SBUTTERFLY3 5 + punpckl%1 m%2, m%4, m%5 + punpckh%1 m%3, m%4, m%5 +%endmacro + +; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift +; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6 +; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3) +%macro SUMSUB_SHPK 7 + psubd %3, %1, %5 ; { a0 - b0 }[0-3] + psubd %4, %2, %6 ; { a0 - b0 }[4-7] + paddd %1, %5 ; { a0 + b0 }[0-3] + paddd %2, %6 ; { a0 + b0 }[4-7] + psrad %1, %7 + psrad %2, %7 + psrad %3, %7 + psrad %4, %7 + packssdw %1, %2 ; row[0] + packssdw %3, %4 ; row[7] +%endmacro + +; %1 = initial bias ("" if nop) +; %2 = number of bits to shift at the end +; %3 = qmat (for prores) +%macro IDCT_1D 2-3 + ; a0 = (W4 * row[0]) + (1 << (15 - 1)); + ; a1 = a0; + ; a2 = a0; + ; a3 = a0; + ; a0 += W2 * row[2]; + ; a1 += W6 * row[2]; + ; a2 -= W6 * row[2]; + ; a3 -= W2 * row[2]; +%ifstr %1 + mova m15, [pd_round_ %+ %2] +%else + paddw m10, [%1] +%endif + SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7] + pmaddwd m2, m0, [w4_plus_w6] + pmaddwd m3, m1, [w4_plus_w6] + pmaddwd m4, m0, [w4_min_w6] + pmaddwd m5, m1, [w4_min_w6] + pmaddwd m6, m0, [w4_min_w2] + pmaddwd m7, m1, [w4_min_w2] + pmaddwd m0, [w4_plus_w2] + pmaddwd m1, [w4_plus_w2] +%ifstr %1 + ; Adding 1<<(%2-1) for >=15 bits values + paddd m2, m15 + paddd m3, m15 + paddd m4, m15 + paddd m5, m15 + paddd m6, m15 + paddd m7, m15 + paddd m0, m15 + paddd m1, m15 +%endif + + ; a0: -1*row[0]-1*row[2] + ; a1: -1*row[0] + ; a2: -1*row[0] + ; a3: -1*row[0]+1*row[2] + + ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4] + ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6] + ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6] + ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4] + SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7] + pmaddwd m10, m8, [w4_plus_w6] + pmaddwd m11, m9, [w4_plus_w6] + paddd m0, m10 ; a0[0-3] + paddd m1, m11 ; a0[4-7] + pmaddwd m10, m8, [w4_min_w6] + pmaddwd m11, m9, [w4_min_w6] + paddd m6, m10 ; a3[0-3] + paddd m7, m11 ; a3[4-7] + pmaddwd m10, m8, [w4_min_w2] + pmaddwd m11, m9, [w4_min_w2] + pmaddwd m8, [w4_plus_w2] + pmaddwd m9, [w4_plus_w2] + psubd m4, m10 ; a2[0-3] intermediate + psubd m5, m11 ; a2[4-7] intermediate + psubd m2, m8 ; a1[0-3] intermediate + psubd m3, m9 ; a1[4-7] intermediate + + ; load/store + mova [blockq+ 0], m0 + mova [blockq+ 32], m2 + mova [blockq+ 64], m4 + mova [blockq+ 96], m6 + mova m10,[blockq+ 16] ; { row[1] }[0-7] + mova m8, [blockq+ 48] ; { row[3] }[0-7] + mova m13,[blockq+ 80] ; { row[5] }[0-7] + mova m14,[blockq+112] ; { row[7] }[0-7] + mova [blockq+ 16], m1 + mova [blockq+ 48], m3 + mova [blockq+ 80], m5 + mova [blockq+112], m7 +%if %0 == 3 + pmullw m10,[%3+ 16] + pmullw m8, [%3+ 48] + pmullw m13,[%3+ 80] + pmullw m14,[%3+112] +%endif + + ; b0 = MUL(W1, row[1]); + ; MAC(b0, W3, row[3]); + ; b1 = MUL(W3, row[1]); + ; MAC(b1, -W7, row[3]); + ; b2 = MUL(W5, row[1]); + ; MAC(b2, -W1, row[3]); + ; b3 = MUL(W7, row[1]); + ; MAC(b3, -W5, row[3]); + SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7] + pmaddwd m2, m0, [w3_min_w7] + pmaddwd m3, m1, [w3_min_w7] + pmaddwd m4, m0, [w5_min_w1] + pmaddwd m5, m1, [w5_min_w1] + pmaddwd m6, m0, [w7_min_w5] + pmaddwd m7, m1, [w7_min_w5] + pmaddwd m0, [w1_plus_w3] + pmaddwd m1, [w1_plus_w3] + + ; b0: +1*row[1]+2*row[3] + ; b1: +2*row[1]-1*row[3] + ; b2: -1*row[1]-1*row[3] + ; b3: +1*row[1]+1*row[3] + + ; MAC(b0, W5, row[5]); + ; MAC(b0, W7, row[7]); + ; MAC(b1, -W1, row[5]); + ; MAC(b1, -W5, row[7]); + ; MAC(b2, W7, row[5]); + ; MAC(b2, W3, row[7]); + ; MAC(b3, W3, row[5]); + ; MAC(b3, -W1, row[7]); + SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7] + + ; b0: -1*row[5]+1*row[7] + ; b1: -1*row[5]+1*row[7] + ; b2: +1*row[5]+2*row[7] + ; b3: +2*row[5]-1*row[7] + + pmaddwd m10, m8, [w1_plus_w5] + pmaddwd m11, m9, [w1_plus_w5] + pmaddwd m12, m8, [w5_plus_w7] + pmaddwd m13, m9, [w5_plus_w7] + psubd m2, m10 ; b1[0-3] + psubd m3, m11 ; b1[4-7] + paddd m0, m12 ; b0[0-3] + paddd m1, m13 ; b0[4-7] + pmaddwd m12, m8, [w7_plus_w3] + pmaddwd m13, m9, [w7_plus_w3] + pmaddwd m8, [w3_min_w1] + pmaddwd m9, [w3_min_w1] + paddd m4, m12 ; b2[0-3] + paddd m5, m13 ; b2[4-7] + paddd m6, m8 ; b3[0-3] + paddd m7, m9 ; b3[4-7] + + ; row[0] = (a0 + b0) >> 15; + ; row[7] = (a0 - b0) >> 15; + ; row[1] = (a1 + b1) >> 15; + ; row[6] = (a1 - b1) >> 15; + ; row[2] = (a2 + b2) >> 15; + ; row[5] = (a2 - b2) >> 15; + ; row[3] = (a3 + b3) >> 15; + ; row[4] = (a3 - b3) >> 15; + mova m8, [blockq+ 0] ; a0[0-3] + mova m9, [blockq+16] ; a0[4-7] + SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2 + mova m0, [blockq+32] ; a1[0-3] + mova m1, [blockq+48] ; a1[4-7] + SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2 + mova m1, [blockq+64] ; a2[0-3] + mova m2, [blockq+80] ; a2[4-7] + SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2 + mova m2, [blockq+96] ; a3[0-3] + mova m3, [blockq+112] ; a3[4-7] + SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2 +%endmacro + +; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t stride, +; int16_t *block, const int16_t *qmat); + +; %1 = row shift +; %2 = row bias macro +; %3 = column shift +; %4 = column bias macro +; %5 = final action (nothing, "store", "put", "add") +; %6 = min pixel value +; %7 = max pixel value +; %8 = qmat (for prores) + +%macro IDCT_FN 4-8 + ; for (i = 0; i < 8; i++) + ; idctRowCondDC(block + i*8); + mova m10,[blockq+ 0] ; { row[0] }[0-7] + mova m8, [blockq+32] ; { row[2] }[0-7] + mova m13,[blockq+64] ; { row[4] }[0-7] + mova m12,[blockq+96] ; { row[6] }[0-7] + +%if %0 == 8 + pmullw m10,[%8+ 0] + pmullw m8, [%8+32] + pmullw m13,[%8+64] + pmullw m12,[%8+96] + + IDCT_1D %1, %2, %8 +%elif %2 == 11 + ; This copies the DC-only shortcut. When there is only a DC coefficient the + ; C shifts the value and splats it to all coeffs rather than multiplying and + ; doing the full IDCT. This causes a difference on 8-bit because the + ; coefficient is 16383 rather than 16384 (which you can get with shifting). + por m1, m8, m13 + por m1, m12 + por m1, [blockq+ 16] ; { row[1] }[0-7] + por m1, [blockq+ 48] ; { row[3] }[0-7] + por m1, [blockq+ 80] ; { row[5] }[0-7] + por m1, [blockq+112] ; { row[7] }[0-7] + pxor m2, m2 + pcmpeqw m1, m2 + psllw m2, m10, 3 + pand m2, m1 + pcmpeqb m3, m3 + pxor m1, m3 + mova [rsp], m1 + mova [rsp+16], m2 + + IDCT_1D %1, %2 + + mova m5, [rsp] + mova m6, [rsp+16] + pand m8, m5 + por m8, m6 + pand m0, m5 + por m0, m6 + pand m1, m5 + por m1, m6 + pand m2, m5 + por m2, m6 + pand m4, m5 + por m4, m6 + pand m11, m5 + por m11, m6 + pand m9, m5 + por m9, m6 + pand m10, m5 + por m10, m6 +%else + IDCT_1D %1, %2 +%endif + + ; transpose for second part of IDCT + TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3 + mova [blockq+ 16], m0 + mova [blockq+ 48], m2 + mova [blockq+ 80], m11 + mova [blockq+112], m10 + SWAP 8, 10 + SWAP 1, 8 + SWAP 4, 13 + SWAP 9, 12 + + ; for (i = 0; i < 8; i++) + ; idctSparseColAdd(dest + i, line_size, block + i); + IDCT_1D %3, %4 + + ; clip/store +%if %0 >= 5 +%ifidn %5,"store" + ; No clamping, means pure idct + mova [blockq+ 0], m8 + mova [blockq+ 16], m0 + mova [blockq+ 32], m1 + mova [blockq+ 48], m2 + mova [blockq+ 64], m4 + mova [blockq+ 80], m11 + mova [blockq+ 96], m9 + mova [blockq+112], m10 +%elifidn %5,"put" +%ifidn %6, 0 + pxor m3, m3 +%else + mova m3, [%6] +%endif ; ifidn %6, 0 + mova m5, [%7] + pmaxsw m8, m3 + pmaxsw m0, m3 + pmaxsw m1, m3 + pmaxsw m2, m3 + pmaxsw m4, m3 + pmaxsw m11, m3 + pmaxsw m9, m3 + pmaxsw m10, m3 + pminsw m8, m5 + pminsw m0, m5 + pminsw m1, m5 + pminsw m2, m5 + pminsw m4, m5 + pminsw m11, m5 + pminsw m9, m5 + pminsw m10, m5 + + lea r2, [r1*3] + mova [r0 ], m8 + mova [r0+r1 ], m0 + mova [r0+r1*2], m1 + mova [r0+r2 ], m2 + lea r0, [r0+r1*4] + mova [r0 ], m4 + mova [r0+r1 ], m11 + mova [r0+r1*2], m9 + mova [r0+r2 ], m10 +%endif ; %5 action +%endif; if %0 >= 5 +%endmacro + +%endif diff --git a/media/ffvpx/libavcodec/x86/videodsp.asm b/media/ffvpx/libavcodec/x86/videodsp.asm new file mode 100644 index 0000000000..3cc07878d3 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/videodsp.asm @@ -0,0 +1,436 @@ +;****************************************************************************** +;* Core video DSP functions +;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +; slow vertical extension loop function. Works with variable-width, and +; does per-line reading/writing of source data + +%macro V_COPY_ROW 2 ; type (top/body/bottom), h +.%1_y_loop: ; do { + mov wq, r7mp ; initialize w (r7mp = wmp) +.%1_x_loop: ; do { + movu m0, [srcq+wq] ; m0 = read($mmsize) + movu [dstq+wq], m0 ; write(m0, $mmsize) + add wq, mmsize ; w -= $mmsize + cmp wq, -mmsize ; } while (w > $mmsize); + jl .%1_x_loop + movu m0, [srcq-mmsize] ; m0 = read($mmsize) + movu [dstq-mmsize], m0 ; write(m0, $mmsize) +%ifidn %1, body ; if ($type == body) { + add srcq, src_strideq ; src += src_stride +%endif ; } + add dstq, dst_strideq ; dst += dst_stride + dec %2 ; } while (--$h); + jnz .%1_y_loop +%endmacro + +; .----. <- zero +; | | <- top is copied from first line in body of source +; |----| <- start_y +; | | <- body is copied verbatim (line-by-line) from source +; |----| <- end_y +; | | <- bottom is copied from last line in body of source +; '----' <- bh +INIT_XMM sse +%if ARCH_X86_64 +cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \ + start_y, end_y, bh, w +%else ; x86-32 +cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w +%define src_strideq r3mp +%define dst_strideq r1mp + mov srcq, r2mp + mov start_yq, r4mp + mov end_yq, r5mp + mov bhq, r6mp +%endif + sub bhq, end_yq ; bh -= end_q + sub end_yq, start_yq ; end_q -= start_q + add srcq, r7mp ; (r7mp = wmp) + add dstq, r7mp ; (r7mp = wmp) + neg r7mp ; (r7mp = wmp) + test start_yq, start_yq ; if (start_q) { + jz .body + V_COPY_ROW top, start_yq ; v_copy_row(top, start_yq) +.body: ; } + V_COPY_ROW body, end_yq ; v_copy_row(body, end_yq) + test bhq, bhq ; if (bh) { + jz .end + sub srcq, src_strideq ; src -= src_stride + V_COPY_ROW bottom, bhq ; v_copy_row(bottom, bh) +.end: ; } + RET + +%macro hvar_fn 0 +cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w + lea dstq, [dstq+n_wordsq*2] + neg n_wordsq + lea start_xq, [start_xq+n_wordsq*2] +.y_loop: ; do { +%if cpuflag(avx2) + vpbroadcastb m0, [dstq+start_xq] + mov wq, n_wordsq ; initialize w +%else + movzx wd, byte [dstq+start_xq] ; w = read(1) + imul wd, 0x01010101 ; w *= 0x01010101 + movd m0, wd + mov wq, n_wordsq ; initialize w + pshufd m0, m0, q0000 ; splat +%endif ; avx2 +.x_loop: ; do { + movu [dstq+wq*2], m0 ; write($reg, $mmsize) + add wq, mmsize/2 ; w -= $mmsize/2 + cmp wq, -(mmsize/2) ; } while (w > $mmsize/2) + jl .x_loop + movu [dstq-mmsize], m0 ; write($reg, $mmsize) + add dstq, dst_strideq ; dst += dst_stride + dec hq ; } while (h--) + jnz .y_loop + RET +%endmacro + +INIT_XMM sse2 +hvar_fn + +%if HAVE_AVX2_EXTERNAL +INIT_XMM avx2 +hvar_fn +%endif + +; macro to read/write a horizontal number of pixels (%2) to/from registers +; on sse, - fills xmm0-15 for consecutive sets of 16 pixels +; - if (%2 & 8) fills 8 bytes into xmm$next +; - if (%2 & 4) fills 4 bytes into xmm$next +; - if (%2 & 3) fills 1, 2 or 4 bytes in eax +; on mmx, - fills mm0-7 for consecutive sets of 8 pixels +; - if (%2 & 4) fills 4 bytes into mm$next +; - if (%2 & 3) fills 1, 2 or 4 bytes in eax +; writing data out is in the same way +%macro READ_NUM_BYTES 2 +%assign %%off 0 ; offset in source buffer +%assign %%mmx_idx 0 ; mmx register index +%assign %%xmm_idx 0 ; xmm register index + +%rep %2/mmsize +%if mmsize == 16 + movu xmm %+ %%xmm_idx, [srcq+%%off] +%assign %%xmm_idx %%xmm_idx+1 +%else ; mmx + movu mm %+ %%mmx_idx, [srcq+%%off] +%assign %%mmx_idx %%mmx_idx+1 +%endif +%assign %%off %%off+mmsize +%endrep ; %2/mmsize + +%if mmsize == 16 +%if (%2-%%off) >= 8 +%if %2 > 16 && (%2-%%off) > 8 + movu xmm %+ %%xmm_idx, [srcq+%2-16] +%assign %%xmm_idx %%xmm_idx+1 +%assign %%off %2 +%else + movq mm %+ %%mmx_idx, [srcq+%%off] +%assign %%mmx_idx %%mmx_idx+1 +%assign %%off %%off+8 +%endif +%endif ; (%2-%%off) >= 8 +%endif + +%if (%2-%%off) >= 4 +%if %2 > 8 && (%2-%%off) > 4 + movq mm %+ %%mmx_idx, [srcq+%2-8] +%assign %%off %2 +%else + movd mm %+ %%mmx_idx, [srcq+%%off] +%assign %%off %%off+4 +%endif +%assign %%mmx_idx %%mmx_idx+1 +%endif ; (%2-%%off) >= 4 + +%if (%2-%%off) >= 1 +%if %2 >= 4 + movd mm %+ %%mmx_idx, [srcq+%2-4] +%elif (%2-%%off) == 1 + mov valb, [srcq+%2-1] +%elif (%2-%%off) == 2 + mov valw, [srcq+%2-2] +%else + mov valb, [srcq+%2-1] + ror vald, 16 + mov valw, [srcq+%2-3] +%endif +%endif ; (%2-%%off) >= 1 +%endmacro ; READ_NUM_BYTES + +%macro WRITE_NUM_BYTES 2 +%assign %%off 0 ; offset in destination buffer +%assign %%mmx_idx 0 ; mmx register index +%assign %%xmm_idx 0 ; xmm register index + +%rep %2/mmsize +%if mmsize == 16 + movu [dstq+%%off], xmm %+ %%xmm_idx +%assign %%xmm_idx %%xmm_idx+1 +%else ; mmx + movu [dstq+%%off], mm %+ %%mmx_idx +%assign %%mmx_idx %%mmx_idx+1 +%endif +%assign %%off %%off+mmsize +%endrep ; %2/mmsize + +%if mmsize == 16 +%if (%2-%%off) >= 8 +%if %2 > 16 && (%2-%%off) > 8 + movu [dstq+%2-16], xmm %+ %%xmm_idx +%assign %%xmm_idx %%xmm_idx+1 +%assign %%off %2 +%else + movq [dstq+%%off], mm %+ %%mmx_idx +%assign %%mmx_idx %%mmx_idx+1 +%assign %%off %%off+8 +%endif +%endif ; (%2-%%off) >= 8 +%endif + +%if (%2-%%off) >= 4 +%if %2 > 8 && (%2-%%off) > 4 + movq [dstq+%2-8], mm %+ %%mmx_idx +%assign %%off %2 +%else + movd [dstq+%%off], mm %+ %%mmx_idx +%assign %%off %%off+4 +%endif +%assign %%mmx_idx %%mmx_idx+1 +%endif ; (%2-%%off) >= 4 + +%if (%2-%%off) >= 1 +%if %2 >= 4 + movd [dstq+%2-4], mm %+ %%mmx_idx +%elif (%2-%%off) == 1 + mov [dstq+%2-1], valb +%elif (%2-%%off) == 2 + mov [dstq+%2-2], valw +%else + mov [dstq+%2-3], valw + ror vald, 16 + mov [dstq+%2-1], valb +%ifnidn %1, body + ror vald, 16 +%endif +%endif +%endif ; (%2-%%off) >= 1 +%endmacro ; WRITE_NUM_BYTES + +; vertical top/bottom extend and body copy fast loops +; these are function pointers to set-width line copy functions, i.e. +; they read a fixed number of pixels into set registers, and write +; those out into the destination buffer +%macro VERTICAL_EXTEND 2 +%assign %%n %1 +%rep 1+%2-%1 +%if %%n <= 3 +%if ARCH_X86_64 +cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \ + start_y, end_y, val, bh + mov bhq, r6mp ; r6mp = bhmp +%else ; x86-32 +cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh + mov dstq, r0mp + mov srcq, r2mp + mov start_yq, r4mp + mov end_yq, r5mp + mov bhq, r6mp +%define dst_strideq r1mp +%define src_strideq r3mp +%endif ; x86-64/32 +%else +%if ARCH_X86_64 +cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \ + start_y, end_y, bh +%else ; x86-32 +cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh + mov srcq, r2mp + mov start_yq, r4mp + mov end_yq, r5mp + mov bhq, r6mp +%define dst_strideq r1mp +%define src_strideq r3mp +%endif ; x86-64/32 +%endif + ; FIXME move this to c wrapper? + sub bhq, end_yq ; bh -= end_y + sub end_yq, start_yq ; end_y -= start_y + + ; extend pixels above body + test start_yq, start_yq ; if (start_y) { + jz .body_loop + READ_NUM_BYTES top, %%n ; $variable_regs = read($n) +.top_loop: ; do { + WRITE_NUM_BYTES top, %%n ; write($variable_regs, $n) + add dstq, dst_strideq ; dst += linesize + dec start_yq ; } while (--start_y) + jnz .top_loop ; } + + ; copy body pixels +.body_loop: ; do { + READ_NUM_BYTES body, %%n ; $variable_regs = read($n) + WRITE_NUM_BYTES body, %%n ; write($variable_regs, $n) + add dstq, dst_strideq ; dst += dst_stride + add srcq, src_strideq ; src += src_stride + dec end_yq ; } while (--end_y) + jnz .body_loop + + ; copy bottom pixels + test bhq, bhq ; if (block_h) { + jz .end + sub srcq, src_strideq ; src -= linesize + READ_NUM_BYTES bottom, %%n ; $variable_regs = read($n) +.bottom_loop: ; do { + WRITE_NUM_BYTES bottom, %%n ; write($variable_regs, $n) + add dstq, dst_strideq ; dst += linesize + dec bhq ; } while (--bh) + jnz .bottom_loop ; } + +.end: + RET +%assign %%n %%n+1 +%endrep ; 1+%2-%1 +%endmacro ; VERTICAL_EXTEND + +INIT_MMX mmx +VERTICAL_EXTEND 1, 15 + +INIT_XMM sse +VERTICAL_EXTEND 16, 22 + +; left/right (horizontal) fast extend functions +; these are essentially identical to the vertical extend ones above, +; just left/right separated because number of pixels to extend is +; obviously not the same on both sides. + +%macro READ_V_PIXEL 2 +%if cpuflag(avx2) + vpbroadcastb m0, %2 +%else + movzx vald, byte %2 + imul vald, 0x01010101 +%if %1 >= 8 + movd m0, vald +%if mmsize == 16 + pshufd m0, m0, q0000 +%else + punpckldq m0, m0 +%endif ; mmsize == 16 +%endif ; %1 > 16 +%endif ; avx2 +%endmacro ; READ_V_PIXEL + +%macro WRITE_V_PIXEL 2 +%assign %%off 0 + +%if %1 >= 8 + +%rep %1/mmsize + movu [%2+%%off], m0 +%assign %%off %%off+mmsize +%endrep ; %1/mmsize + +%if mmsize == 16 +%if %1-%%off >= 8 +%if %1 > 16 && %1-%%off > 8 + movu [%2+%1-16], m0 +%assign %%off %1 +%else + movq [%2+%%off], m0 +%assign %%off %%off+8 +%endif +%endif ; %1-%%off >= 8 +%endif ; mmsize == 16 + +%if %1-%%off >= 4 +%if %1 > 8 && %1-%%off > 4 + movq [%2+%1-8], m0 +%assign %%off %1 +%else + movd [%2+%%off], m0 +%assign %%off %%off+4 +%endif +%endif ; %1-%%off >= 4 + +%else ; %1 < 8 + +%rep %1/4 + mov [%2+%%off], vald +%assign %%off %%off+4 +%endrep ; %1/4 + +%endif ; %1 >=/< 8 + +%if %1-%%off == 2 +%if cpuflag(avx2) + movd [%2+%%off-2], m0 +%else + mov [%2+%%off], valw +%endif ; avx2 +%endif ; (%1-%%off)/2 +%endmacro ; WRITE_V_PIXEL + +%macro H_EXTEND 2 +%assign %%n %1 +%rep 1+(%2-%1)/2 +%if cpuflag(avx2) +cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh +%else +cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val +%endif +.loop_y: ; do { + READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n) + WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) + add dstq, dst_strideq ; dst += dst_stride + dec bhq ; } while (--bh) + jnz .loop_y + RET +%assign %%n %%n+2 +%endrep ; 1+(%2-%1)/2 +%endmacro ; H_EXTEND + +INIT_MMX mmx +H_EXTEND 2, 14 + +INIT_XMM sse2 +H_EXTEND 16, 22 + +%if HAVE_AVX2_EXTERNAL +INIT_XMM avx2 +H_EXTEND 8, 22 +%endif + +INIT_MMX mmxext +cglobal prefetch, 3, 3, 0, buf, stride, h +.loop: + prefetcht0 [bufq] + add bufq, strideq + dec hd + jg .loop + RET diff --git a/media/ffvpx/libavcodec/x86/videodsp_init.c b/media/ffvpx/libavcodec/x86/videodsp_init.c new file mode 100644 index 0000000000..ae9db95624 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/videodsp_init.c @@ -0,0 +1,237 @@ +/* + * Copyright (C) 2002-2012 Michael Niedermayer + * Copyright (C) 2012 Ronald S. Bultje + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavutil/common.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/videodsp.h" + +#if HAVE_X86ASM +typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride, + const uint8_t *src, x86_reg src_stride, + x86_reg start_y, x86_reg end_y, x86_reg bh); +typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride, + const uint8_t *src, x86_reg src_stride, + x86_reg start_y, x86_reg end_y, x86_reg bh, + x86_reg w); + +extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix16_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix17_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix18_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix19_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix20_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix21_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix22_sse; +static emu_edge_vfix_func * const vfixtbl_sse[22] = { + ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx, + ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx, + ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx, + ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx, + ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx, + ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse, + ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse, + ff_emu_edge_vfix22_sse +}; +extern emu_edge_vvar_func ff_emu_edge_vvar_sse; + +typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride, + x86_reg start_x, x86_reg bh); +typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride, + x86_reg start_x, x86_reg n_words, x86_reg bh); + +extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2; +extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2; +extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2; +extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2; +static emu_edge_hfix_func * const hfixtbl_sse2[11] = { + ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, + ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, + ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2, + ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 +}; +extern emu_edge_hvar_func ff_emu_edge_hvar_sse2; +#if HAVE_AVX2_EXTERNAL +extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2; +static emu_edge_hfix_func * const hfixtbl_avx2[11] = { + ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, + ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2, + ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2, + ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2 +}; +extern emu_edge_hvar_func ff_emu_edge_hvar_avx2; +#endif + +static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride, + ptrdiff_t src_stride, + x86_reg block_w, x86_reg block_h, + x86_reg src_x, x86_reg src_y, + x86_reg w, x86_reg h, + emu_edge_vfix_func * const *vfix_tbl, + emu_edge_vvar_func *v_extend_var, + emu_edge_hfix_func * const *hfix_tbl, + emu_edge_hvar_func *h_extend_var) +{ + x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p; + + if (!w || !h) + return; + + av_assert2(block_w <= FFABS(dst_stride)); + + if (src_y >= h) { + src -= src_y*src_stride; + src_y_add = h - 1; + src_y = h - 1; + } else if (src_y <= -block_h) { + src -= src_y*src_stride; + src_y_add = 1 - block_h; + src_y = 1 - block_h; + } + if (src_x >= w) { + src += w - 1 - src_x; + src_x = w - 1; + } else if (src_x <= -block_w) { + src += 1 - block_w - src_x; + src_x = 1 - block_w; + } + + start_y = FFMAX(0, -src_y); + start_x = FFMAX(0, -src_x); + end_y = FFMIN(block_h, h-src_y); + end_x = FFMIN(block_w, w-src_x); + av_assert2(start_x < end_x && block_w > 0); + av_assert2(start_y < end_y && block_h > 0); + + // fill in the to-be-copied part plus all above/below + src += (src_y_add + start_y) * src_stride + start_x; + w = end_x - start_x; + if (w <= 22) { + vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride, + start_y, end_y, block_h); + } else { + v_extend_var(dst + start_x, dst_stride, src, src_stride, + start_y, end_y, block_h, w); + } + + // fill left + if (start_x) { + if (start_x <= 22) { + hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h); + } else { + h_extend_var(dst, dst_stride, + start_x, (start_x + 1) >> 1, block_h); + } + } + + // fill right + p = block_w - end_x; + if (p) { + if (p <= 22) { + hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride, + -!(p & 1), block_h); + } else { + h_extend_var(dst + end_x - (p & 1), dst_stride, + -!(p & 1), (p + 1) >> 1, block_h); + } + } +} + +static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src, + ptrdiff_t buf_stride, + ptrdiff_t src_stride, + int block_w, int block_h, + int src_x, int src_y, int w, + int h) +{ + emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, + src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, + hfixtbl_sse2, &ff_emu_edge_hvar_sse2); +} + +#if HAVE_AVX2_EXTERNAL +static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src, + ptrdiff_t buf_stride, + ptrdiff_t src_stride, + int block_w, int block_h, + int src_x, int src_y, int w, + int h) +{ + emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, + src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, + hfixtbl_avx2, &ff_emu_edge_hvar_avx2); +} +#endif /* HAVE_AVX2_EXTERNAL */ +#endif /* HAVE_X86ASM */ + +void ff_prefetch_mmxext(const uint8_t *buf, ptrdiff_t stride, int h); + +av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMXEXT(cpu_flags)) { + ctx->prefetch = ff_prefetch_mmxext; + } + if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) { + ctx->emulated_edge_mc = emulated_edge_mc_sse2; + } +#if HAVE_AVX2_EXTERNAL + if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) { + ctx->emulated_edge_mc = emulated_edge_mc_avx2; + } +#endif +#endif /* HAVE_X86ASM */ +} diff --git a/media/ffvpx/libavcodec/x86/vp56_arith.h b/media/ffvpx/libavcodec/x86/vp56_arith.h new file mode 100644 index 0000000000..9f7639980c --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp56_arith.h @@ -0,0 +1,53 @@ +/** + * VP5 and VP6 compatible video decoder (arith decoder) + * + * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org> + * Copyright (C) 2010 Eli Friedman + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_VP56_ARITH_H +#define AVCODEC_X86_VP56_ARITH_H + +#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS +#include "libavutil/attributes.h" + +#define vp56_rac_get_prob vp56_rac_get_prob +static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob) +{ + unsigned int code_word = vp56_rac_renorm(c); + unsigned int low = 1 + (((c->high - 1) * prob) >> 8); + unsigned int low_shift = low << 16; + int bit = 0; + c->code_word = code_word; + + __asm__( + "subl %4, %1 \n\t" + "subl %3, %2 \n\t" + "setae %b0 \n\t" + "cmovb %4, %1 \n\t" + "cmovb %5, %2 \n\t" + : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word) + : "r"(low_shift), "r"(low), "r"(code_word) + ); + + return bit; +} +#endif + +#endif /* AVCODEC_X86_VP56_ARITH_H */ diff --git a/media/ffvpx/libavcodec/x86/vp8dsp.asm b/media/ffvpx/libavcodec/x86/vp8dsp.asm new file mode 100644 index 0000000000..6ac5a7721b --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp8dsp.asm @@ -0,0 +1,1116 @@ +;****************************************************************************** +;* VP8 MMXEXT optimizations +;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> +;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +fourtap_filter_hw_m: times 4 dw -6, 123 + times 4 dw 12, -1 + times 4 dw -9, 93 + times 4 dw 50, -6 + times 4 dw -6, 50 + times 4 dw 93, -9 + times 4 dw -1, 12 + times 4 dw 123, -6 + +sixtap_filter_hw_m: times 4 dw 2, -11 + times 4 dw 108, 36 + times 4 dw -8, 1 + times 4 dw 3, -16 + times 4 dw 77, 77 + times 4 dw -16, 3 + times 4 dw 1, -8 + times 4 dw 36, 108 + times 4 dw -11, 2 + +fourtap_filter_hb_m: times 8 db -6, 123 + times 8 db 12, -1 + times 8 db -9, 93 + times 8 db 50, -6 + times 8 db -6, 50 + times 8 db 93, -9 + times 8 db -1, 12 + times 8 db 123, -6 + +sixtap_filter_hb_m: times 8 db 2, 1 + times 8 db -11, 108 + times 8 db 36, -8 + times 8 db 3, 3 + times 8 db -16, 77 + times 8 db 77, -16 + times 8 db 1, 2 + times 8 db -8, 36 + times 8 db 108, -11 + +fourtap_filter_v_m: times 8 dw -6 + times 8 dw 123 + times 8 dw 12 + times 8 dw -1 + times 8 dw -9 + times 8 dw 93 + times 8 dw 50 + times 8 dw -6 + times 8 dw -6 + times 8 dw 50 + times 8 dw 93 + times 8 dw -9 + times 8 dw -1 + times 8 dw 12 + times 8 dw 123 + times 8 dw -6 + +sixtap_filter_v_m: times 8 dw 2 + times 8 dw -11 + times 8 dw 108 + times 8 dw 36 + times 8 dw -8 + times 8 dw 1 + times 8 dw 3 + times 8 dw -16 + times 8 dw 77 + times 8 dw 77 + times 8 dw -16 + times 8 dw 3 + times 8 dw 1 + times 8 dw -8 + times 8 dw 36 + times 8 dw 108 + times 8 dw -11 + times 8 dw 2 + +bilinear_filter_vw_m: times 8 dw 1 + times 8 dw 2 + times 8 dw 3 + times 8 dw 4 + times 8 dw 5 + times 8 dw 6 + times 8 dw 7 + +bilinear_filter_vb_m: times 8 db 7, 1 + times 8 db 6, 2 + times 8 db 5, 3 + times 8 db 4, 4 + times 8 db 3, 5 + times 8 db 2, 6 + times 8 db 1, 7 + +%ifdef PIC +%define fourtap_filter_hw picregq +%define sixtap_filter_hw picregq +%define fourtap_filter_hb picregq +%define sixtap_filter_hb picregq +%define fourtap_filter_v picregq +%define sixtap_filter_v picregq +%define bilinear_filter_vw picregq +%define bilinear_filter_vb picregq +%define npicregs 1 +%else +%define fourtap_filter_hw fourtap_filter_hw_m +%define sixtap_filter_hw sixtap_filter_hw_m +%define fourtap_filter_hb fourtap_filter_hb_m +%define sixtap_filter_hb sixtap_filter_hb_m +%define fourtap_filter_v fourtap_filter_v_m +%define sixtap_filter_v sixtap_filter_v_m +%define bilinear_filter_vw bilinear_filter_vw_m +%define bilinear_filter_vb bilinear_filter_vb_m +%define npicregs 0 +%endif + +filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 + +filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 +filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 +filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 + +pw_20091: times 4 dw 20091 +pw_17734: times 4 dw 17734 + +cextern pw_3 +cextern pw_4 +cextern pw_64 +cextern pw_256 + +SECTION .text + +;------------------------------------------------------------------------------- +; subpel MC functions: +; +; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, ptrdiff_t deststride, +; const uint8_t *src, ptrdiff_t srcstride, +; int height, int mx, int my); +;------------------------------------------------------------------------------- + +%macro FILTER_SSSE3 1 +cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] + mova m3, [filter_h6_shuf2] + mova m4, [filter_h6_shuf3] +%ifdef PIC + lea picregq, [sixtap_filter_hb_m] +%endif + mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes + mova m6, [sixtap_filter_hb+mxq*8-32] + mova m7, [sixtap_filter_hb+mxq*8-16] + +.nextrow: + movu m0, [srcq-2] + mova m1, m0 + mova m2, m0 +%if mmsize == 8 +; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the +; shuffle with a memory operand + punpcklbw m0, [srcq+3] +%else + pshufb m0, [filter_h6_shuf1] +%endif + pshufb m1, m3 + pshufb m2, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m6 + pmaddubsw m2, m7 + paddsw m0, m1 + paddsw m0, m2 + pmulhrsw m0, [pw_256] + packuswb m0, m0 + movh [dstq], m0 ; store + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET + +cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 + mova m2, [pw_256] + mova m3, [filter_h2_shuf] + mova m4, [filter_h4_shuf] +%ifdef PIC + lea picregq, [fourtap_filter_hb_m] +%endif + mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes + mova m6, [fourtap_filter_hb+mxq] + +.nextrow: + movu m0, [srcq-1] + mova m1, m0 + pshufb m0, m3 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m6 + paddsw m0, m1 + pmulhrsw m0, m2 + packuswb m0, m0 + movh [dstq], m0 ; store + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET + +cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 +%ifdef PIC + lea picregq, [fourtap_filter_hb_m] +%endif + mova m5, [fourtap_filter_hb+myq-16] + mova m6, [fourtap_filter_hb+myq] + mova m7, [pw_256] + + ; read 3 lines + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+ srcstrideq] + movh m2, [srcq+2*srcstrideq] + add srcq, srcstrideq + +.nextrow: + movh m3, [srcq+2*srcstrideq] ; read new row + mova m4, m0 + mova m0, m1 + punpcklbw m4, m1 + mova m1, m2 + punpcklbw m2, m3 + pmaddubsw m4, m5 + pmaddubsw m2, m6 + paddsw m4, m2 + mova m2, m3 + pmulhrsw m4, m7 + packuswb m4, m4 + movh [dstq], m4 + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET + +cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + lea myd, [myq*3] +%ifdef PIC + lea picregq, [sixtap_filter_hb_m] +%endif + lea myq, [sixtap_filter_hb+myq*8] + + ; read 5 lines + sub srcq, srcstrideq + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+srcstrideq] + movh m2, [srcq+srcstrideq*2] + lea srcq, [srcq+srcstrideq*2] + add srcq, srcstrideq + movh m3, [srcq] + movh m4, [srcq+srcstrideq] + +.nextrow: + movh m5, [srcq+2*srcstrideq] ; read new row + mova m6, m0 + punpcklbw m6, m5 + mova m0, m1 + punpcklbw m1, m2 + mova m7, m3 + punpcklbw m7, m4 + pmaddubsw m6, [myq-48] + pmaddubsw m1, [myq-32] + pmaddubsw m7, [myq-16] + paddsw m6, m1 + paddsw m6, m7 + mova m1, m2 + mova m2, m3 + pmulhrsw m6, [pw_256] + mova m3, m4 + packuswb m6, m6 + mova m4, m5 + movh [dstq], m6 + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET +%endmacro + +INIT_MMX ssse3 +FILTER_SSSE3 4 +INIT_XMM ssse3 +FILTER_SSSE3 8 + +; 4x4 block, H-only 4-tap filter +INIT_MMX mmxext +cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 +%ifdef PIC + lea picregq, [fourtap_filter_hw_m] +%endif + movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words + movq mm5, [fourtap_filter_hw+mxq] + movq mm7, [pw_64] + pxor mm6, mm6 + +.nextrow: + movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels + + ; first set of 2 pixels + movq mm2, mm1 ; byte ABCD.. + punpcklbw mm1, mm6 ; byte->word ABCD + pshufw mm0, mm2, 9 ; byte CDEF.. + punpcklbw mm0, mm6 ; byte->word CDEF + pshufw mm3, mm1, 0x94 ; word ABBC + pshufw mm1, mm0, 0x94 ; word CDDE + pmaddwd mm3, mm4 ; multiply 2px with F0/F1 + movq mm0, mm1 ; backup for second set of pixels + pmaddwd mm1, mm5 ; multiply 2px with F2/F3 + paddd mm3, mm1 ; finish 1st 2px + + ; second set of 2 pixels, use backup of above + punpckhbw mm2, mm6 ; byte->word EFGH + pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 + pshufw mm1, mm2, 0x94 ; word EFFG + pmaddwd mm1, mm5 ; multiply 2px with F2/F3 + paddd mm0, mm1 ; finish 2nd 2px + + ; merge two sets of 2 pixels into one set of 4, round/clip/store + packssdw mm3, mm0 ; merge dword->word (4px) + paddsw mm3, mm7 ; rounding + psraw mm3, 7 + packuswb mm3, mm6 ; clip and word->bytes + movd [dstq], mm3 ; store + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET + +; 4x4 block, H-only 6-tap filter +INIT_MMX mmxext +cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] +%ifdef PIC + lea picregq, [sixtap_filter_hw_m] +%endif + movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words + movq mm5, [sixtap_filter_hw+mxq*8-32] + movq mm6, [sixtap_filter_hw+mxq*8-16] + movq mm7, [pw_64] + pxor mm3, mm3 + +.nextrow: + movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels + + ; first set of 2 pixels + movq mm2, mm1 ; byte ABCD.. + punpcklbw mm1, mm3 ; byte->word ABCD + pshufw mm0, mm2, 0x9 ; byte CDEF.. + punpckhbw mm2, mm3 ; byte->word EFGH + punpcklbw mm0, mm3 ; byte->word CDEF + pshufw mm1, mm1, 0x94 ; word ABBC + pshufw mm2, mm2, 0x94 ; word EFFG + pmaddwd mm1, mm4 ; multiply 2px with F0/F1 + pshufw mm3, mm0, 0x94 ; word CDDE + movq mm0, mm3 ; backup for second set of pixels + pmaddwd mm3, mm5 ; multiply 2px with F2/F3 + paddd mm1, mm3 ; add to 1st 2px cache + movq mm3, mm2 ; backup for second set of pixels + pmaddwd mm2, mm6 ; multiply 2px with F4/F5 + paddd mm1, mm2 ; finish 1st 2px + + ; second set of 2 pixels, use backup of above + movd mm2, [srcq+3] ; byte FGHI (prevent overreads) + pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 + pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 + paddd mm0, mm3 ; add to 2nd 2px cache + pxor mm3, mm3 + punpcklbw mm2, mm3 ; byte->word FGHI + pshufw mm2, mm2, 0xE9 ; word GHHI + pmaddwd mm2, mm6 ; multiply 2px with F4/F5 + paddd mm0, mm2 ; finish 2nd 2px + + ; merge two sets of 2 pixels into one set of 4, round/clip/store + packssdw mm1, mm0 ; merge dword->word (4px) + paddsw mm1, mm7 ; rounding + psraw mm1, 7 + packuswb mm1, mm3 ; clip and word->bytes + movd [dstq], mm1 ; store + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET + +INIT_XMM sse2 +cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 5 +%ifdef PIC + lea picregq, [fourtap_filter_v_m] +%endif + lea mxq, [fourtap_filter_v+mxq-32] + pxor m7, m7 + mova m4, [pw_64] + mova m5, [mxq+ 0] + mova m6, [mxq+16] +%ifdef m8 + mova m8, [mxq+32] + mova m9, [mxq+48] +%endif +.nextrow: + movq m0, [srcq-1] + movq m1, [srcq-0] + movq m2, [srcq+1] + movq m3, [srcq+2] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + pmullw m0, m5 + pmullw m1, m6 +%ifdef m8 + pmullw m2, m8 + pmullw m3, m9 +%else + pmullw m2, [mxq+32] + pmullw m3, [mxq+48] +%endif + paddsw m0, m1 + paddsw m2, m3 + paddsw m0, m2 + paddsw m0, m4 + psraw m0, 7 + packuswb m0, m7 + movh [dstq], m0 ; store + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET + +INIT_XMM sse2 +cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] + shl mxd, 4 +%ifdef PIC + lea picregq, [sixtap_filter_v_m] +%endif + lea mxq, [sixtap_filter_v+mxq-96] + pxor m7, m7 + mova m6, [pw_64] +%ifdef m8 + mova m8, [mxq+ 0] + mova m9, [mxq+16] + mova m10, [mxq+32] + mova m11, [mxq+48] + mova m12, [mxq+64] + mova m13, [mxq+80] +%endif +.nextrow: + movq m0, [srcq-2] + movq m1, [srcq-1] + movq m2, [srcq-0] + movq m3, [srcq+1] + movq m4, [srcq+2] + movq m5, [srcq+3] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + punpcklbw m4, m7 + punpcklbw m5, m7 +%ifdef m8 + pmullw m0, m8 + pmullw m1, m9 + pmullw m2, m10 + pmullw m3, m11 + pmullw m4, m12 + pmullw m5, m13 +%else + pmullw m0, [mxq+ 0] + pmullw m1, [mxq+16] + pmullw m2, [mxq+32] + pmullw m3, [mxq+48] + pmullw m4, [mxq+64] + pmullw m5, [mxq+80] +%endif + paddsw m1, m4 + paddsw m0, m5 + paddsw m1, m2 + paddsw m0, m3 + paddsw m0, m1 + paddsw m0, m6 + psraw m0, 7 + packuswb m0, m7 + movh [dstq], m0 ; store + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET + +%macro FILTER_V 1 +; 4x4 block, V-only 4-tap filter +cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 5 +%ifdef PIC + lea picregq, [fourtap_filter_v_m] +%endif + lea myq, [fourtap_filter_v+myq-32] + mova m6, [pw_64] + pxor m7, m7 + mova m5, [myq+48] + + ; read 3 lines + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+ srcstrideq] + movh m2, [srcq+2*srcstrideq] + add srcq, srcstrideq + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + +.nextrow: + ; first calculate negative taps (to prevent losing positive overflows) + movh m4, [srcq+2*srcstrideq] ; read new row + punpcklbw m4, m7 + mova m3, m4 + pmullw m0, [myq+0] + pmullw m4, m5 + paddsw m4, m0 + + ; then calculate positive taps + mova m0, m1 + pmullw m1, [myq+16] + paddsw m4, m1 + mova m1, m2 + pmullw m2, [myq+32] + paddsw m4, m2 + mova m2, m3 + + ; round/clip/store + paddsw m4, m6 + psraw m4, 7 + packuswb m4, m7 + movh [dstq], m4 + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET + + +; 4x4 block, V-only 6-tap filter +cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 + lea myq, [myq*3] +%ifdef PIC + lea picregq, [sixtap_filter_v_m] +%endif + lea myq, [sixtap_filter_v+myq-96] + pxor m7, m7 + + ; read 5 lines + sub srcq, srcstrideq + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+srcstrideq] + movh m2, [srcq+srcstrideq*2] + lea srcq, [srcq+srcstrideq*2] + add srcq, srcstrideq + movh m3, [srcq] + movh m4, [srcq+srcstrideq] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + punpcklbw m4, m7 + +.nextrow: + ; first calculate negative taps (to prevent losing positive overflows) + mova m5, m1 + pmullw m5, [myq+16] + mova m6, m4 + pmullw m6, [myq+64] + paddsw m6, m5 + + ; then calculate positive taps + movh m5, [srcq+2*srcstrideq] ; read new row + punpcklbw m5, m7 + pmullw m0, [myq+0] + paddsw m6, m0 + mova m0, m1 + mova m1, m2 + pmullw m2, [myq+32] + paddsw m6, m2 + mova m2, m3 + pmullw m3, [myq+48] + paddsw m6, m3 + mova m3, m4 + mova m4, m5 + pmullw m5, [myq+80] + paddsw m6, m5 + + ; round/clip/store + paddsw m6, [pw_64] + psraw m6, 7 + packuswb m6, m7 + movh [dstq], m6 + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET +%endmacro + +INIT_MMX mmxext +FILTER_V 4 +INIT_XMM sse2 +FILTER_V 8 + +%macro FILTER_BILINEAR 1 +%if cpuflag(ssse3) +cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 +%ifdef PIC + lea picregq, [bilinear_filter_vb_m] +%endif + pxor m4, m4 + mova m3, [bilinear_filter_vb+myq-16] +.nextrow: + movh m0, [srcq+srcstrideq*0] + movh m1, [srcq+srcstrideq*1] + movh m2, [srcq+srcstrideq*2] + punpcklbw m0, m1 + punpcklbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + psraw m0, 2 + psraw m1, 2 + pavgw m0, m4 + pavgw m1, m4 +%if mmsize==8 + packuswb m0, m0 + packuswb m1, m1 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m1 +%else + packuswb m0, m1 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 +%endif +%else ; cpuflag(ssse3) +cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 +%ifdef PIC + lea picregq, [bilinear_filter_vw_m] +%endif + pxor m6, m6 + mova m5, [bilinear_filter_vw+myq-1*16] + neg myq + mova m4, [bilinear_filter_vw+myq+7*16] +.nextrow: + movh m0, [srcq+srcstrideq*0] + movh m1, [srcq+srcstrideq*1] + movh m3, [srcq+srcstrideq*2] + punpcklbw m0, m6 + punpcklbw m1, m6 + punpcklbw m3, m6 + mova m2, m1 + pmullw m0, m4 + pmullw m1, m5 + pmullw m2, m4 + pmullw m3, m5 + paddsw m0, m1 + paddsw m2, m3 + psraw m0, 2 + psraw m2, 2 + pavgw m0, m6 + pavgw m2, m6 +%if mmsize == 8 + packuswb m0, m0 + packuswb m2, m2 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m2 +%else + packuswb m0, m2 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 +%endif +%endif ; cpuflag(ssse3) + + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 + jg .nextrow + RET + +%if cpuflag(ssse3) +cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 +%ifdef PIC + lea picregq, [bilinear_filter_vb_m] +%endif + pxor m4, m4 + mova m2, [filter_h2_shuf] + mova m3, [bilinear_filter_vb+mxq-16] +.nextrow: + movu m0, [srcq+srcstrideq*0] + movu m1, [srcq+srcstrideq*1] + pshufb m0, m2 + pshufb m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + psraw m0, 2 + psraw m1, 2 + pavgw m0, m4 + pavgw m1, m4 +%if mmsize==8 + packuswb m0, m0 + packuswb m1, m1 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m1 +%else + packuswb m0, m1 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 +%endif +%else ; cpuflag(ssse3) +cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 +%ifdef PIC + lea picregq, [bilinear_filter_vw_m] +%endif + pxor m6, m6 + mova m5, [bilinear_filter_vw+mxq-1*16] + neg mxq + mova m4, [bilinear_filter_vw+mxq+7*16] +.nextrow: + movh m0, [srcq+srcstrideq*0+0] + movh m1, [srcq+srcstrideq*0+1] + movh m2, [srcq+srcstrideq*1+0] + movh m3, [srcq+srcstrideq*1+1] + punpcklbw m0, m6 + punpcklbw m1, m6 + punpcklbw m2, m6 + punpcklbw m3, m6 + pmullw m0, m4 + pmullw m1, m5 + pmullw m2, m4 + pmullw m3, m5 + paddsw m0, m1 + paddsw m2, m3 + psraw m0, 2 + psraw m2, 2 + pavgw m0, m6 + pavgw m2, m6 +%if mmsize == 8 + packuswb m0, m0 + packuswb m2, m2 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m2 +%else + packuswb m0, m2 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 +%endif +%endif ; cpuflag(ssse3) + + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 + jg .nextrow + RET +%endmacro + +INIT_MMX mmxext +FILTER_BILINEAR 4 +INIT_XMM sse2 +FILTER_BILINEAR 8 +INIT_MMX ssse3 +FILTER_BILINEAR 4 +INIT_XMM ssse3 +FILTER_BILINEAR 8 + +INIT_MMX mmx +cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height +.nextrow: + movq mm0, [srcq+srcstrideq*0] + movq mm1, [srcq+srcstrideq*1] + lea srcq, [srcq+srcstrideq*2] + movq [dstq+dststrideq*0], mm0 + movq [dstq+dststrideq*1], mm1 + lea dstq, [dstq+dststrideq*2] + sub heightd, 2 + jg .nextrow + RET + +INIT_XMM sse +cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height +.nextrow: + movups xmm0, [srcq+srcstrideq*0] + movups xmm1, [srcq+srcstrideq*1] + lea srcq, [srcq+srcstrideq*2] + movaps [dstq+dststrideq*0], xmm0 + movaps [dstq+dststrideq*1], xmm1 + lea dstq, [dstq+dststrideq*2] + sub heightd, 2 + jg .nextrow + RET + +;----------------------------------------------------------------------------- +; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +;----------------------------------------------------------------------------- + +%macro ADD_DC 4 + %4 m2, [dst1q+%3] + %4 m3, [dst1q+strideq+%3] + %4 m4, [dst2q+%3] + %4 m5, [dst2q+strideq+%3] + paddusb m2, %1 + paddusb m3, %1 + paddusb m4, %1 + paddusb m5, %1 + psubusb m2, %2 + psubusb m3, %2 + psubusb m4, %2 + psubusb m5, %2 + %4 [dst1q+%3], m2 + %4 [dst1q+strideq+%3], m3 + %4 [dst2q+%3], m4 + %4 [dst2q+strideq+%3], m5 +%endmacro + +%macro VP8_IDCT_DC_ADD 0 +cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride + ; load data + movd m0, [blockq] + pxor m1, m1 + + ; calculate DC + paddw m0, [pw_4] + movd [blockq], m1 + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] + movd m2, [dst1q] + movd m3, [dst1q+strideq] + movd m4, [dst2q] + movd m5, [dst2q+strideq] + psraw m0, 3 + pshuflw m0, m0, 0 + punpcklqdq m0, m0 + punpckldq m2, m3 + punpckldq m4, m5 + punpcklbw m2, m1 + punpcklbw m4, m1 + paddw m2, m0 + paddw m4, m0 + packuswb m2, m4 + movd [dst1q], m2 +%if cpuflag(sse4) + pextrd [dst1q+strideq], m2, 1 + pextrd [dst2q], m2, 2 + pextrd [dst2q+strideq], m2, 3 +%else + psrldq m2, 4 + movd [dst1q+strideq], m2 + psrldq m2, 4 + movd [dst2q], m2 + psrldq m2, 4 + movd [dst2q+strideq], m2 +%endif + RET +%endmacro + +INIT_XMM sse2 +VP8_IDCT_DC_ADD +INIT_XMM sse4 +VP8_IDCT_DC_ADD + +;----------------------------------------------------------------------------- +; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); +;----------------------------------------------------------------------------- + +INIT_XMM sse2 +cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride + ; load data + movd m0, [blockq+32*0] ; A + movd m1, [blockq+32*2] ; C + punpcklwd m0, [blockq+32*1] ; A B + punpcklwd m1, [blockq+32*3] ; C D + punpckldq m0, m1 ; A B C D + pxor m1, m1 + + ; calculate DC + paddw m0, [pw_4] + movd [blockq+32*0], m1 + movd [blockq+32*1], m1 + movd [blockq+32*2], m1 + movd [blockq+32*3], m1 + psraw m0, 3 + psubw m1, m0 + packuswb m0, m0 + packuswb m1, m1 + punpcklbw m0, m0 + punpcklbw m1, m1 + punpcklbw m0, m0 + punpcklbw m1, m1 + + ; add DC + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] + ADD_DC m0, m1, 0, mova + RET + +;----------------------------------------------------------------------------- +; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); +;----------------------------------------------------------------------------- + +INIT_MMX mmx +cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride + ; load data + movd m0, [blockq+32*0] ; A + movd m1, [blockq+32*2] ; C + punpcklwd m0, [blockq+32*1] ; A B + punpcklwd m1, [blockq+32*3] ; C D + punpckldq m0, m1 ; A B C D + pxor m6, m6 + + ; calculate DC + paddw m0, [pw_4] + movd [blockq+32*0], m6 + movd [blockq+32*1], m6 + movd [blockq+32*2], m6 + movd [blockq+32*3], m6 + psraw m0, 3 + psubw m6, m0 + packuswb m0, m0 + packuswb m6, m6 + punpcklbw m0, m0 ; AABBCCDD + punpcklbw m6, m6 ; AABBCCDD + movq m1, m0 + movq m7, m6 + punpcklbw m0, m0 ; AAAABBBB + punpckhbw m1, m1 ; CCCCDDDD + punpcklbw m6, m6 ; AAAABBBB + punpckhbw m7, m7 ; CCCCDDDD + + ; add DC + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] + ADD_DC m0, m6, 0, mova + lea dst1q, [dst1q+strideq*4] + lea dst2q, [dst2q+strideq*4] + ADD_DC m1, m7, 0, mova + RET + +;----------------------------------------------------------------------------- +; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +;----------------------------------------------------------------------------- + +; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) +; this macro assumes that m6/m7 have words for 20091/17734 loaded +%macro VP8_MULTIPLY_SUMSUB 4 + mova %3, %1 + mova %4, %2 + pmulhw %3, m6 ;20091(1) + pmulhw %4, m6 ;20091(2) + paddw %3, %1 + paddw %4, %2 + paddw %1, %1 + paddw %2, %2 + pmulhw %1, m7 ;35468(1) + pmulhw %2, m7 ;35468(2) + psubw %1, %4 + paddw %2, %3 +%endmacro + +; calculate x0=%1+%3; x1=%1-%3 +; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) +; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) +; %5/%6 are temporary registers +; we assume m6/m7 have constant words 20091/17734 loaded in them +%macro VP8_IDCT_TRANSFORM4x4_1D 6 + SUMSUB_BA w, %3, %1, %5 ;t0, t1 + VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 + SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3 + SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2 + SWAP %4, %1 + SWAP %4, %3 +%endmacro + +INIT_MMX sse +cglobal vp8_idct_add, 3, 3, 0, dst, block, stride + ; load block data + movq m0, [blockq+ 0] + movq m1, [blockq+ 8] + movq m2, [blockq+16] + movq m3, [blockq+24] + movq m6, [pw_20091] + movq m7, [pw_17734] + xorps xmm0, xmm0 + movaps [blockq+ 0], xmm0 + movaps [blockq+16], xmm0 + + ; actual IDCT + VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 + TRANSPOSE4x4W 0, 1, 2, 3, 4 + paddw m0, [pw_4] + VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 + TRANSPOSE4x4W 0, 1, 2, 3, 4 + + ; store + pxor m4, m4 + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+2*strideq] + STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq + STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq + + RET + +;----------------------------------------------------------------------------- +; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16]) +;----------------------------------------------------------------------------- + +%macro SCATTER_WHT 3 + movd dc1d, m%1 + movd dc2d, m%2 + mov [blockq+2*16*(0+%3)], dc1w + mov [blockq+2*16*(1+%3)], dc2w + shr dc1d, 16 + shr dc2d, 16 + psrlq m%1, 32 + psrlq m%2, 32 + mov [blockq+2*16*(4+%3)], dc1w + mov [blockq+2*16*(5+%3)], dc2w + movd dc1d, m%1 + movd dc2d, m%2 + mov [blockq+2*16*(8+%3)], dc1w + mov [blockq+2*16*(9+%3)], dc2w + shr dc1d, 16 + shr dc2d, 16 + mov [blockq+2*16*(12+%3)], dc1w + mov [blockq+2*16*(13+%3)], dc2w +%endmacro + +%macro HADAMARD4_1D 4 + SUMSUB_BADC w, %2, %1, %4, %3 + SUMSUB_BADC w, %4, %2, %3, %1 + SWAP %1, %4, %3 +%endmacro + +INIT_MMX sse +cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2 + movq m0, [dc1q] + movq m1, [dc1q+8] + movq m2, [dc1q+16] + movq m3, [dc1q+24] + xorps xmm0, xmm0 + movaps [dc1q+ 0], xmm0 + movaps [dc1q+16], xmm0 + HADAMARD4_1D 0, 1, 2, 3 + TRANSPOSE4x4W 0, 1, 2, 3, 4 + paddw m0, [pw_3] + HADAMARD4_1D 0, 1, 2, 3 + psraw m0, 3 + psraw m1, 3 + psraw m2, 3 + psraw m3, 3 + SCATTER_WHT 0, 1, 0 + SCATTER_WHT 2, 3, 2 + RET diff --git a/media/ffvpx/libavcodec/x86/vp8dsp_init.c b/media/ffvpx/libavcodec/x86/vp8dsp_init.c new file mode 100644 index 0000000000..bd20da1fc9 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp8dsp_init.c @@ -0,0 +1,383 @@ +/* + * VP8 DSP functions x86-optimized + * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> + * Copyright (c) 2010 Fiona Glaser <fiona@x264.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem_internal.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vp8dsp.h" + +#if HAVE_X86ASM + +/* + * MC functions + */ +void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + + +void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \ +static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \ + uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride, int height, int mx, int my) \ +{ \ + ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ + dst, dststride, src, srcstride, height, mx, my); \ + ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ + dst + 8, dststride, src + 8, srcstride, height, mx, my); \ +} +#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \ +static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ + uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ + ptrdiff_t srcstride, int height, int mx, int my) \ +{ \ + ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \ + dst, dststride, src, srcstride, height, mx, my); \ + ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \ + dst + 4, dststride, src + 4, srcstride, height, mx, my); \ +} + +TAP_W16(sse2, epel, h6) +TAP_W16(sse2, epel, v6) +TAP_W16(sse2, bilinear, h) +TAP_W16(sse2, bilinear, v) + +TAP_W16(ssse3, epel, h6) +TAP_W16(ssse3, epel, v6) +TAP_W16(ssse3, bilinear, h) +TAP_W16(ssse3, bilinear, v) + +#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \ +static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \ + uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride, int height, int mx, int my) \ +{ \ + LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \ + uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \ + src -= srcstride * (TAPNUMY / 2 - 1); \ + ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \ + tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \ + ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \ + dst, dststride, tmpptr, SIZE, height, mx, my); \ +} + +#define HVTAPMMX(x, y) \ +HVTAP(mmxext, 8, x, y, 4, 8) + +HVTAPMMX(4, 4) +HVTAPMMX(4, 6) +HVTAPMMX(6, 4) +HVTAPMMX(6, 6) + +#define HVTAPSSE2(x, y, w) \ +HVTAP(sse2, 16, x, y, w, 16) \ +HVTAP(ssse3, 16, x, y, w, 16) + +HVTAPSSE2(4, 4, 8) +HVTAPSSE2(4, 6, 8) +HVTAPSSE2(6, 4, 8) +HVTAPSSE2(6, 6, 8) +HVTAPSSE2(6, 6, 16) + +HVTAP(ssse3, 16, 4, 4, 4, 8) +HVTAP(ssse3, 16, 4, 6, 4, 8) +HVTAP(ssse3, 16, 6, 4, 4, 8) +HVTAP(ssse3, 16, 6, 6, 4, 8) + +#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \ +static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ + uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride, int height, int mx, int my) \ +{ \ + LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \ + ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \ + tmp, SIZE, src, srcstride, height + 1, mx, my); \ + ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \ + dst, dststride, tmp, SIZE, height, mx, my); \ +} + +HVBILIN(mmxext, 8, 4, 8) +HVBILIN(sse2, 8, 8, 16) +HVBILIN(sse2, 8, 16, 16) +HVBILIN(ssse3, 8, 4, 8) +HVBILIN(ssse3, 8, 8, 16) +HVBILIN(ssse3, 8, 16, 16) + +void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16], + ptrdiff_t stride); +void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16], + ptrdiff_t stride); +void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16], + ptrdiff_t stride); +void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16], + ptrdiff_t stride); +void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]); +void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride); + +#define DECLARE_LOOP_FILTER(NAME) \ +void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim); \ +void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim); \ +void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ + ptrdiff_t stride, \ + int e, int i, int hvt); \ +void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ + ptrdiff_t stride, \ + int e, int i, int hvt); \ +void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t s, \ + int e, int i, int hvt); \ +void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t s, \ + int e, int i, int hvt); \ +void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \ + ptrdiff_t stride, \ + int e, int i, int hvt); \ +void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \ + ptrdiff_t stride, \ + int e, int i, int hvt); \ +void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t s, \ + int e, int i, int hvt); \ +void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t s, \ + int e, int i, int hvt); + +DECLARE_LOOP_FILTER(sse2) +DECLARE_LOOP_FILTER(ssse3) +DECLARE_LOOP_FILTER(sse4) + +#endif /* HAVE_X86ASM */ + +#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ + c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \ + c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \ + c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT + +#define VP8_MC_FUNC(IDX, SIZE, OPT) \ + c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \ + c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \ + c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \ + c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \ + c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \ + VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) + +#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \ + c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT + + +av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + c->put_vp8_epel_pixels_tab[1][0][0] = + c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; + } + + /* note that 4-tap width=16 functions are missing because w=16 + * is only used for luma, and luma is always a copy or sixtap. */ + if (EXTERNAL_MMXEXT(cpu_flags)) { + VP8_MC_FUNC(2, 4, mmxext); + VP8_BILINEAR_MC_FUNC(2, 4, mmxext); + } + + if (EXTERNAL_SSE(cpu_flags)) { + c->put_vp8_epel_pixels_tab[0][0][0] = + c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; + } + + if (EXTERNAL_SSE2_SLOW(cpu_flags)) { + VP8_LUMA_MC_FUNC(0, 16, sse2); + VP8_MC_FUNC(1, 8, sse2); + VP8_BILINEAR_MC_FUNC(0, 16, sse2); + VP8_BILINEAR_MC_FUNC(1, 8, sse2); + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + VP8_LUMA_MC_FUNC(0, 16, ssse3); + VP8_MC_FUNC(1, 8, ssse3); + VP8_MC_FUNC(2, 4, ssse3); + VP8_BILINEAR_MC_FUNC(0, 16, ssse3); + VP8_BILINEAR_MC_FUNC(1, 8, ssse3); + VP8_BILINEAR_MC_FUNC(2, 4, ssse3); + } +#endif /* HAVE_X86ASM */ +} + +av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx; + } + + if (EXTERNAL_SSE(cpu_flags)) { + c->vp8_idct_add = ff_vp8_idct_add_sse; + c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse; + } + + if (EXTERNAL_SSE2_SLOW(cpu_flags)) { + c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; + + c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; + c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; + + c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; + c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; + } + + if (EXTERNAL_SSE2(cpu_flags)) { + c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse2; + c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2; + + c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; + + c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; + c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; + + c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; + c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2; + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3; + c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3; + + c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3; + c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3; + c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3; + c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3; + + c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3; + c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3; + c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3; + c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3; + } + + if (EXTERNAL_SSE4(cpu_flags)) { + c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; + + c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4; + c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4; + c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4; + } +#endif /* HAVE_X86ASM */ +} diff --git a/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm b/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm new file mode 100644 index 0000000000..ef397efd3e --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm @@ -0,0 +1,1234 @@ +;****************************************************************************** +;* VP8 MMXEXT optimizations +;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> +;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_27: times 8 dw 27 +pw_63: times 8 dw 63 + +pb_4: times 16 db 4 +pb_F8: times 16 db 0xF8 +pb_FE: times 16 db 0xFE +pb_27_63: times 8 db 27, 63 +pb_18_63: times 8 db 18, 63 +pb_9_63: times 8 db 9, 63 + +cextern pb_1 +cextern pb_3 +cextern pw_9 +cextern pw_18 +cextern pb_80 + +SECTION .text + +;----------------------------------------------------------------------------- +; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, ptrdiff_t stride, int flim); +;----------------------------------------------------------------------------- + +; macro called with 7 mm register indexes as argument, and 5 regular registers +; first 11 mean the same as READ_8x4_TRANSPOSED above +; fifth regular register is scratchspace to reach the bottom 8 rows, it +; will be set to second regular register + 8*stride at the end +%macro READ_16x4_INTERLEAVED 12 + ; transpose 16 (A-P) rows of 4 pixels each + lea %12, [r0+8*r2] + + ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M + movd m%1, [%8+%10*4] ; A0-3 + movd m%3, [%12+%10*4] ; I0-3 + movd m%2, [%8+%10*2] ; C0-3 + movd m%4, [%12+%10*2] ; K0-3 + movd m%6, [%8+%10] ; D0-3 + movd m%5, [%12+%10] ; L0-3 + movd m%7, [%12] ; M0-3 + add %12, %11 + punpcklbw m%1, m%3 ; A/I + movd m%3, [%8] ; E0-3 + punpcklbw m%2, m%4 ; C/K + punpcklbw m%6, m%5 ; D/L + punpcklbw m%3, m%7 ; E/M + punpcklbw m%2, m%6 ; C/D/K/L interleaved + + ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P + movd m%5, [%9+%10*4] ; B0-3 + movd m%4, [%12+%10*4] ; J0-3 + movd m%7, [%9] ; F0-3 + movd m%6, [%12] ; N0-3 + punpcklbw m%5, m%4 ; B/J + punpcklbw m%7, m%6 ; F/N + punpcklbw m%1, m%5 ; A/B/I/J interleaved + punpcklbw m%3, m%7 ; E/F/M/N interleaved + movd m%4, [%9+%11] ; G0-3 + movd m%6, [%12+%11] ; O0-3 + movd m%5, [%9+%11*2] ; H0-3 + movd m%7, [%12+%11*2] ; P0-3 + punpcklbw m%4, m%6 ; G/O + punpcklbw m%5, m%7 ; H/P + punpcklbw m%4, m%5 ; G/H/O/P interleaved +%endmacro + +; write 4 xmm registers of 4 dwords each +; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular +; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride +; we add 1*stride to the third regular registry in the process +; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the +; same memory region), or 8 if they cover two separate buffers (third one points to +; a different memory region than the first two), allowing for more optimal code for +; the 16-width case +%macro WRITE_4x4D 10 + ; write out (4 dwords per register), start with dwords zero + movd [%5+%8*4], m%1 + movd [%5], m%2 + movd [%7+%8*4], m%3 + movd [%7], m%4 + + ; store dwords 1 + psrldq m%1, 4 + psrldq m%2, 4 + psrldq m%3, 4 + psrldq m%4, 4 + movd [%6+%8*4], m%1 + movd [%6], m%2 +%if %10 == 16 + movd [%6+%9*4], m%3 +%endif + movd [%7+%9], m%4 + + ; write dwords 2 + psrldq m%1, 4 + psrldq m%2, 4 +%if %10 == 8 + movd [%5+%8*2], m%1 + movd %5d, m%3 +%endif + psrldq m%3, 4 + psrldq m%4, 4 +%if %10 == 16 + movd [%5+%8*2], m%1 +%endif + movd [%6+%9], m%2 + movd [%7+%8*2], m%3 + movd [%7+%9*2], m%4 + add %7, %9 + + ; store dwords 3 + psrldq m%1, 4 + psrldq m%2, 4 + psrldq m%3, 4 + psrldq m%4, 4 +%if %10 == 8 + mov [%7+%8*4], %5d + movd [%6+%8*2], m%1 +%else + movd [%5+%8], m%1 +%endif + movd [%6+%9*2], m%2 + movd [%7+%8*2], m%3 + movd [%7+%9*2], m%4 +%endmacro + +%macro WRITE_8W 5 +%if cpuflag(sse4) + pextrw [%3+%4*4], %1, 0 + pextrw [%2+%4*4], %1, 1 + pextrw [%3+%4*2], %1, 2 + pextrw [%3+%4 ], %1, 3 + pextrw [%3 ], %1, 4 + pextrw [%2 ], %1, 5 + pextrw [%2+%5 ], %1, 6 + pextrw [%2+%5*2], %1, 7 +%else + movd %2d, %1 + psrldq %1, 4 + mov [%3+%4*4], %2w + shr %2, 16 + add %3, %5 + mov [%3+%4*4], %2w + + movd %2d, %1 + psrldq %1, 4 + add %3, %4 + mov [%3+%4*2], %2w + shr %2, 16 + mov [%3+%4 ], %2w + + movd %2d, %1 + psrldq %1, 4 + mov [%3 ], %2w + shr %2, 16 + mov [%3+%5 ], %2w + + movd %2d, %1 + add %3, %5 + mov [%3+%5 ], %2w + shr %2, 16 + mov [%3+%5*2], %2w +%endif +%endmacro + +%macro SIMPLE_LOOPFILTER 2 +cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr +%if cpuflag(ssse3) + pxor m0, m0 +%endif + SPLATB_REG m7, flim, m0 ; splat "flim" into register + + ; set up indexes to address 4 rows + DEFINE_ARGS dst1, mstride, stride, dst3, dst2 + mov strideq, mstrideq + neg mstrideq +%ifidn %1, h + lea dst1q, [dst1q+4*strideq-2] +%endif + +%ifidn %1, v + ; read 4 half/full rows of pixels + mova m0, [dst1q+mstrideq*2] ; p1 + mova m1, [dst1q+mstrideq] ; p0 + mova m2, [dst1q] ; q0 + mova m3, [dst1q+ strideq] ; q1 +%else ; h + lea dst2q, [dst1q+ strideq] + + READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q + TRANSPOSE4x4W 0, 1, 2, 3, 4 +%endif + + ; simple_limit + mova m5, m2 ; m5=backup of q0 + mova m6, m1 ; m6=backup of p0 + psubusb m1, m2 ; p0-q0 + psubusb m2, m6 ; q0-p0 + por m1, m2 ; FFABS(p0-q0) + paddusb m1, m1 ; m1=FFABS(p0-q0)*2 + + mova m4, m3 + mova m2, m0 + psubusb m3, m0 ; q1-p1 + psubusb m0, m4 ; p1-q1 + por m3, m0 ; FFABS(p1-q1) + mova m0, [pb_80] + pxor m2, m0 + pxor m4, m0 + psubsb m2, m4 ; m2=p1-q1 (signed) backup for below + pand m3, [pb_FE] + psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed + paddusb m3, m1 + psubusb m3, m7 + pxor m1, m1 + pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) + + ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) + mova m4, m5 + pxor m5, m0 + pxor m0, m6 + psubsb m5, m0 ; q0-p0 (signed) + paddsb m2, m5 + paddsb m2, m5 + paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) + pand m2, m3 ; apply filter mask (m3) + + mova m3, [pb_F8] + mova m1, m2 + paddsb m2, [pb_4] ; f1<<3=a+4 + paddsb m1, [pb_3] ; f2<<3=a+3 + pand m2, m3 + pand m1, m3 ; cache f2<<3 + + pxor m0, m0 + pxor m3, m3 + pcmpgtb m0, m2 ; which values are <0? + psubb m3, m2 ; -f1<<3 + psrlq m2, 3 ; +f1 + psrlq m3, 3 ; -f1 + pand m3, m0 + pandn m0, m2 + psubusb m4, m0 + paddusb m4, m3 ; q0-f1 + + pxor m0, m0 + pxor m3, m3 + pcmpgtb m0, m1 ; which values are <0? + psubb m3, m1 ; -f2<<3 + psrlq m1, 3 ; +f2 + psrlq m3, 3 ; -f2 + pand m3, m0 + pandn m0, m1 + paddusb m6, m0 + psubusb m6, m3 ; p0+f2 + + ; store +%ifidn %1, v + mova [dst1q], m4 + mova [dst1q+mstrideq], m6 +%else ; h + inc dst1q + SBUTTERFLY bw, 6, 4, 0 + +%if cpuflag(sse4) + inc dst2q +%endif + WRITE_8W m6, dst2q, dst1q, mstrideq, strideq + lea dst2q, [dst3q+mstrideq+1] +%if cpuflag(sse4) + inc dst3q +%endif + WRITE_8W m4, dst3q, dst2q, mstrideq, strideq +%endif + + RET +%endmacro + +INIT_XMM sse2 +SIMPLE_LOOPFILTER v, 3 +SIMPLE_LOOPFILTER h, 5 +INIT_XMM ssse3 +SIMPLE_LOOPFILTER v, 3 +SIMPLE_LOOPFILTER h, 5 +INIT_XMM sse4 +SIMPLE_LOOPFILTER h, 5 + +;----------------------------------------------------------------------------- +; void ff_vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] ptrdiff_t stride, +; int flimE, int flimI, int hev_thr); +;----------------------------------------------------------------------------- + +%macro INNER_LOOPFILTER 2 +%define stack_size 0 +%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr +%ifidn %1, v ; [3]=hev() result +%define stack_size mmsize * -4 +%else ; h ; extra storage space for transposes +%define stack_size mmsize * -5 +%endif +%endif + +%if %2 == 8 ; chroma +cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr +%else ; luma +cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr +%endif + +%if cpuflag(ssse3) + pxor m7, m7 +%endif + +%ifndef m8 + ; splat function arguments + SPLATB_REG m0, flimEq, m7 ; E + SPLATB_REG m1, flimIq, m7 ; I + SPLATB_REG m2, hevthrq, m7 ; hev_thresh + +%define m_flimE [rsp] +%define m_flimI [rsp+mmsize] +%define m_hevthr [rsp+mmsize*2] +%define m_maskres [rsp+mmsize*3] +%define m_p0backup [rsp+mmsize*3] +%define m_q0backup [rsp+mmsize*4] + + mova m_flimE, m0 + mova m_flimI, m1 + mova m_hevthr, m2 +%else +%define m_flimE m9 +%define m_flimI m10 +%define m_hevthr m11 +%define m_maskres m12 +%define m_p0backup m12 +%define m_q0backup m8 + + ; splat function arguments + SPLATB_REG m_flimE, flimEq, m7 ; E + SPLATB_REG m_flimI, flimIq, m7 ; I + SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh +%endif + +%if %2 == 8 ; chroma + DEFINE_ARGS dst1, dst8, mstride, stride, dst2 +%else + DEFINE_ARGS dst1, mstride, stride, dst2, dst8 +%endif + mov strideq, mstrideq + neg mstrideq +%ifidn %1, h + lea dst1q, [dst1q+strideq*4-4] +%if %2 == 8 ; chroma + lea dst8q, [dst8q+strideq*4-4] +%endif +%endif + + ; read + lea dst2q, [dst1q+strideq] +%ifidn %1, v +%if %2 == 8 && mmsize == 16 +%define movrow movh +%else +%define movrow mova +%endif + movrow m0, [dst1q+mstrideq*4] ; p3 + movrow m1, [dst2q+mstrideq*4] ; p2 + movrow m2, [dst1q+mstrideq*2] ; p1 + movrow m5, [dst2q] ; q1 + movrow m6, [dst2q+ strideq*1] ; q2 + movrow m7, [dst2q+ strideq*2] ; q3 +%if mmsize == 16 && %2 == 8 + movhps m0, [dst8q+mstrideq*4] + movhps m2, [dst8q+mstrideq*2] + add dst8q, strideq + movhps m1, [dst8q+mstrideq*4] + movhps m5, [dst8q] + movhps m6, [dst8q+ strideq ] + movhps m7, [dst8q+ strideq*2] + add dst8q, mstrideq +%endif +%else ; h +%if %2 == 16 + lea dst8q, [dst1q+ strideq*8] +%endif + + ; read 16 rows of 8px each, interleave + movh m0, [dst1q+mstrideq*4] + movh m1, [dst8q+mstrideq*4] + movh m2, [dst1q+mstrideq*2] + movh m5, [dst8q+mstrideq*2] + movh m3, [dst1q+mstrideq ] + movh m6, [dst8q+mstrideq ] + movh m4, [dst1q] + movh m7, [dst8q] + punpcklbw m0, m1 ; A/I + punpcklbw m2, m5 ; C/K + punpcklbw m3, m6 ; D/L + punpcklbw m4, m7 ; E/M + + add dst8q, strideq + movh m1, [dst2q+mstrideq*4] + movh m6, [dst8q+mstrideq*4] + movh m5, [dst2q] + movh m7, [dst8q] + punpcklbw m1, m6 ; B/J + punpcklbw m5, m7 ; F/N + movh m6, [dst2q+ strideq ] + movh m7, [dst8q+ strideq ] + punpcklbw m6, m7 ; G/O + + ; 8x16 transpose + TRANSPOSE4x4B 0, 1, 2, 3, 7 +%ifdef m8 + SWAP 1, 8 +%else + mova m_q0backup, m1 +%endif + movh m7, [dst2q+ strideq*2] + movh m1, [dst8q+ strideq*2] + punpcklbw m7, m1 ; H/P + TRANSPOSE4x4B 4, 5, 6, 7, 1 + SBUTTERFLY dq, 0, 4, 1 ; p3/p2 + SBUTTERFLY dq, 2, 6, 1 ; q0/q1 + SBUTTERFLY dq, 3, 7, 1 ; q2/q3 +%ifdef m8 + SWAP 1, 8 + SWAP 2, 8 +%else + mova m1, m_q0backup + mova m_q0backup, m2 ; store q0 +%endif + SBUTTERFLY dq, 1, 5, 2 ; p1/p0 +%ifdef m12 + SWAP 5, 12 +%else + mova m_p0backup, m5 ; store p0 +%endif + SWAP 1, 4 + SWAP 2, 4 + SWAP 6, 3 + SWAP 5, 3 +%endif + + ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 + mova m4, m1 + SWAP 4, 1 + psubusb m4, m0 ; p2-p3 + psubusb m0, m1 ; p3-p2 + por m0, m4 ; abs(p3-p2) + + mova m4, m2 + SWAP 4, 2 + psubusb m4, m1 ; p1-p2 + psubusb m1, m2 ; p2-p1 + por m1, m4 ; abs(p2-p1) + + mova m4, m6 + SWAP 4, 6 + psubusb m4, m7 ; q2-q3 + psubusb m7, m6 ; q3-q2 + por m7, m4 ; abs(q3-q2) + + mova m4, m5 + SWAP 4, 5 + psubusb m4, m6 ; q1-q2 + psubusb m6, m5 ; q2-q1 + por m6, m4 ; abs(q2-q1) + + pmaxub m0, m1 + pmaxub m6, m7 + pmaxub m0, m6 + + ; normal_limit and high_edge_variance for p1-p0, q1-q0 + SWAP 7, 3 ; now m7 is zero +%ifidn %1, v + movrow m3, [dst1q+mstrideq ] ; p0 +%if mmsize == 16 && %2 == 8 + movhps m3, [dst8q+mstrideq ] +%endif +%elifdef m12 + SWAP 3, 12 +%else + mova m3, m_p0backup +%endif + + mova m1, m2 + SWAP 1, 2 + mova m6, m3 + SWAP 3, 6 + psubusb m1, m3 ; p1-p0 + psubusb m6, m2 ; p0-p1 + por m1, m6 ; abs(p1-p0) + pmaxub m0, m1 ; max_I + SWAP 1, 4 ; max_hev_thresh + + SWAP 6, 4 ; now m6 is I +%ifidn %1, v + movrow m4, [dst1q] ; q0 +%if mmsize == 16 && %2 == 8 + movhps m4, [dst8q] +%endif +%elifdef m8 + SWAP 4, 8 +%else + mova m4, m_q0backup +%endif + mova m1, m4 + SWAP 1, 4 + mova m7, m5 + SWAP 7, 5 + psubusb m1, m5 ; q0-q1 + psubusb m7, m4 ; q1-q0 + por m1, m7 ; abs(q1-q0) + pxor m7, m7 + pmaxub m0, m1 + pmaxub m6, m1 + psubusb m0, m_flimI + psubusb m6, m_hevthr + pcmpeqb m0, m7 ; max(abs(..)) <= I + pcmpeqb m6, m7 ; !(max(abs..) > thresh) +%ifdef m12 + SWAP 6, 12 +%else + mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) +%endif + + ; simple_limit + mova m1, m3 + SWAP 1, 3 + mova m6, m4 ; keep copies of p0/q0 around for later use + SWAP 6, 4 + psubusb m1, m4 ; p0-q0 + psubusb m6, m3 ; q0-p0 + por m1, m6 ; abs(q0-p0) + paddusb m1, m1 ; m1=2*abs(q0-p0) + + mova m7, m2 + SWAP 7, 2 + mova m6, m5 + SWAP 6, 5 + psubusb m7, m5 ; p1-q1 + psubusb m6, m2 ; q1-p1 + por m7, m6 ; abs(q1-p1) + pxor m6, m6 + pand m7, [pb_FE] + psrlq m7, 1 ; abs(q1-p1)/2 + paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 + psubusb m7, m_flimE + pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E + pand m0, m7 ; normal_limit result + + ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask +%ifdef m8 ; x86-64 && sse2 + mova m8, [pb_80] +%define m_pb_80 m8 +%else ; x86-32 or mmx/mmxext +%define m_pb_80 [pb_80] +%endif + mova m1, m4 + mova m7, m3 + pxor m1, m_pb_80 + pxor m7, m_pb_80 + psubsb m1, m7 ; (signed) q0-p0 + mova m6, m2 + mova m7, m5 + pxor m6, m_pb_80 + pxor m7, m_pb_80 + psubsb m6, m7 ; (signed) p1-q1 + mova m7, m_maskres + pandn m7, m6 + paddsb m7, m1 + paddsb m7, m1 + paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) + + pand m7, m0 + mova m1, [pb_F8] + mova m6, m7 + paddsb m7, [pb_3] + paddsb m6, [pb_4] + pand m7, m1 + pand m6, m1 + + pxor m1, m1 + pxor m0, m0 + pcmpgtb m1, m7 + psubb m0, m7 + psrlq m7, 3 ; +f2 + psrlq m0, 3 ; -f2 + pand m0, m1 + pandn m1, m7 + psubusb m3, m0 + paddusb m3, m1 ; p0+f2 + + pxor m1, m1 + pxor m0, m0 + pcmpgtb m0, m6 + psubb m1, m6 + psrlq m6, 3 ; +f1 + psrlq m1, 3 ; -f1 + pand m1, m0 + pandn m0, m6 + psubusb m4, m0 + paddusb m4, m1 ; q0-f1 + +%ifdef m12 + SWAP 6, 12 +%else + mova m6, m_maskres +%endif + pxor m7, m7 + pand m0, m6 + pand m1, m6 + psubusb m1, [pb_1] + pavgb m0, m7 ; a + pavgb m1, m7 ; -a + psubusb m5, m0 + psubusb m2, m1 + paddusb m5, m1 ; q1-a + paddusb m2, m0 ; p1+a + + ; store +%ifidn %1, v + movrow [dst1q+mstrideq*2], m2 + movrow [dst1q+mstrideq ], m3 + movrow [dst1q], m4 + movrow [dst1q+ strideq ], m5 +%if mmsize == 16 && %2 == 8 + movhps [dst8q+mstrideq*2], m2 + movhps [dst8q+mstrideq ], m3 + movhps [dst8q], m4 + movhps [dst8q+ strideq ], m5 +%endif +%else ; h + add dst1q, 2 + add dst2q, 2 + + ; 4x8/16 transpose + TRANSPOSE4x4B 2, 3, 4, 5, 6 + + lea dst8q, [dst8q+mstrideq +2] + WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2 +%endif + + RET +%endmacro + +INIT_XMM sse2 +INNER_LOOPFILTER v, 16 +INNER_LOOPFILTER h, 16 +INNER_LOOPFILTER v, 8 +INNER_LOOPFILTER h, 8 + +INIT_XMM ssse3 +INNER_LOOPFILTER v, 16 +INNER_LOOPFILTER h, 16 +INNER_LOOPFILTER v, 8 +INNER_LOOPFILTER h, 8 + +;----------------------------------------------------------------------------- +; void ff_vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] ptrdiff_t stride, +; int flimE, int flimI, int hev_thr); +;----------------------------------------------------------------------------- + +%macro MBEDGE_LOOPFILTER 2 +%define stack_size 0 +%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr +%if mmsize == 16 ; [3]=hev() result + ; [4]=filter tmp result + ; [5]/[6] = p2/q2 backup + ; [7]=lim_res sign result +%define stack_size mmsize * -7 +%else ; 8 ; extra storage space for transposes +%define stack_size mmsize * -8 +%endif +%endif + +%if %2 == 8 ; chroma +cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr +%else ; luma +cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr +%endif + +%if cpuflag(ssse3) + pxor m7, m7 +%endif + +%ifndef m8 + ; splat function arguments + SPLATB_REG m0, flimEq, m7 ; E + SPLATB_REG m1, flimIq, m7 ; I + SPLATB_REG m2, hevthrq, m7 ; hev_thresh + +%define m_flimE [rsp] +%define m_flimI [rsp+mmsize] +%define m_hevthr [rsp+mmsize*2] +%define m_maskres [rsp+mmsize*3] +%define m_limres [rsp+mmsize*4] +%define m_p0backup [rsp+mmsize*3] +%define m_q0backup [rsp+mmsize*4] +%define m_p2backup [rsp+mmsize*5] +%define m_q2backup [rsp+mmsize*6] +%if mmsize == 16 +%define m_limsign [rsp] +%else +%define m_limsign [rsp+mmsize*7] +%endif + + mova m_flimE, m0 + mova m_flimI, m1 + mova m_hevthr, m2 +%else ; sse2 on x86-64 +%define m_flimE m9 +%define m_flimI m10 +%define m_hevthr m11 +%define m_maskres m12 +%define m_limres m8 +%define m_p0backup m12 +%define m_q0backup m8 +%define m_p2backup m13 +%define m_q2backup m14 +%define m_limsign m9 + + ; splat function arguments + SPLATB_REG m_flimE, flimEq, m7 ; E + SPLATB_REG m_flimI, flimIq, m7 ; I + SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh +%endif + +%if %2 == 8 ; chroma + DEFINE_ARGS dst1, dst8, mstride, stride, dst2 +%else + DEFINE_ARGS dst1, mstride, stride, dst2, dst8 +%endif + mov strideq, mstrideq + neg mstrideq +%ifidn %1, h + lea dst1q, [dst1q+strideq*4-4] +%if %2 == 8 ; chroma + lea dst8q, [dst8q+strideq*4-4] +%endif +%endif + + ; read + lea dst2q, [dst1q+ strideq ] +%ifidn %1, v +%if %2 == 8 && mmsize == 16 +%define movrow movh +%else +%define movrow mova +%endif + movrow m0, [dst1q+mstrideq*4] ; p3 + movrow m1, [dst2q+mstrideq*4] ; p2 + movrow m2, [dst1q+mstrideq*2] ; p1 + movrow m5, [dst2q] ; q1 + movrow m6, [dst2q+ strideq ] ; q2 + movrow m7, [dst2q+ strideq*2] ; q3 +%if mmsize == 16 && %2 == 8 + movhps m0, [dst8q+mstrideq*4] + movhps m2, [dst8q+mstrideq*2] + add dst8q, strideq + movhps m1, [dst8q+mstrideq*4] + movhps m5, [dst8q] + movhps m6, [dst8q+ strideq ] + movhps m7, [dst8q+ strideq*2] + add dst8q, mstrideq +%endif +%else ; h +%if %2 == 16 + lea dst8q, [dst1q+ strideq*8 ] +%endif + + ; read 16 rows of 8px each, interleave + movh m0, [dst1q+mstrideq*4] + movh m1, [dst8q+mstrideq*4] + movh m2, [dst1q+mstrideq*2] + movh m5, [dst8q+mstrideq*2] + movh m3, [dst1q+mstrideq ] + movh m6, [dst8q+mstrideq ] + movh m4, [dst1q] + movh m7, [dst8q] + punpcklbw m0, m1 ; A/I + punpcklbw m2, m5 ; C/K + punpcklbw m3, m6 ; D/L + punpcklbw m4, m7 ; E/M + + add dst8q, strideq + movh m1, [dst2q+mstrideq*4] + movh m6, [dst8q+mstrideq*4] + movh m5, [dst2q] + movh m7, [dst8q] + punpcklbw m1, m6 ; B/J + punpcklbw m5, m7 ; F/N + movh m6, [dst2q+ strideq ] + movh m7, [dst8q+ strideq ] + punpcklbw m6, m7 ; G/O + + ; 8x16 transpose + TRANSPOSE4x4B 0, 1, 2, 3, 7 +%ifdef m8 + SWAP 1, 8 +%else + mova m_q0backup, m1 +%endif + movh m7, [dst2q+ strideq*2] + movh m1, [dst8q+ strideq*2] + punpcklbw m7, m1 ; H/P + TRANSPOSE4x4B 4, 5, 6, 7, 1 + SBUTTERFLY dq, 0, 4, 1 ; p3/p2 + SBUTTERFLY dq, 2, 6, 1 ; q0/q1 + SBUTTERFLY dq, 3, 7, 1 ; q2/q3 +%ifdef m8 + SWAP 1, 8 + SWAP 2, 8 +%else + mova m1, m_q0backup + mova m_q0backup, m2 ; store q0 +%endif + SBUTTERFLY dq, 1, 5, 2 ; p1/p0 +%ifdef m12 + SWAP 5, 12 +%else + mova m_p0backup, m5 ; store p0 +%endif + SWAP 1, 4 + SWAP 2, 4 + SWAP 6, 3 + SWAP 5, 3 +%endif + + ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 + mova m4, m1 + SWAP 4, 1 + psubusb m4, m0 ; p2-p3 + psubusb m0, m1 ; p3-p2 + por m0, m4 ; abs(p3-p2) + + mova m4, m2 + SWAP 4, 2 + psubusb m4, m1 ; p1-p2 + mova m_p2backup, m1 + psubusb m1, m2 ; p2-p1 + por m1, m4 ; abs(p2-p1) + + mova m4, m6 + SWAP 4, 6 + psubusb m4, m7 ; q2-q3 + psubusb m7, m6 ; q3-q2 + por m7, m4 ; abs(q3-q2) + + mova m4, m5 + SWAP 4, 5 + psubusb m4, m6 ; q1-q2 + mova m_q2backup, m6 + psubusb m6, m5 ; q2-q1 + por m6, m4 ; abs(q2-q1) + + pmaxub m0, m1 + pmaxub m6, m7 + pmaxub m0, m6 + + ; normal_limit and high_edge_variance for p1-p0, q1-q0 + SWAP 7, 3 ; now m7 is zero +%ifidn %1, v + movrow m3, [dst1q+mstrideq ] ; p0 +%if mmsize == 16 && %2 == 8 + movhps m3, [dst8q+mstrideq ] +%endif +%elifdef m12 + SWAP 3, 12 +%else + mova m3, m_p0backup +%endif + + mova m1, m2 + SWAP 1, 2 + mova m6, m3 + SWAP 3, 6 + psubusb m1, m3 ; p1-p0 + psubusb m6, m2 ; p0-p1 + por m1, m6 ; abs(p1-p0) + pmaxub m0, m1 ; max_I + SWAP 1, 4 ; max_hev_thresh + + SWAP 6, 4 ; now m6 is I +%ifidn %1, v + movrow m4, [dst1q] ; q0 +%if mmsize == 16 && %2 == 8 + movhps m4, [dst8q] +%endif +%elifdef m8 + SWAP 4, 8 +%else + mova m4, m_q0backup +%endif + mova m1, m4 + SWAP 1, 4 + mova m7, m5 + SWAP 7, 5 + psubusb m1, m5 ; q0-q1 + psubusb m7, m4 ; q1-q0 + por m1, m7 ; abs(q1-q0) + pxor m7, m7 + pmaxub m0, m1 + pmaxub m6, m1 + psubusb m0, m_flimI + psubusb m6, m_hevthr + pcmpeqb m0, m7 ; max(abs(..)) <= I + pcmpeqb m6, m7 ; !(max(abs..) > thresh) +%ifdef m12 + SWAP 6, 12 +%else + mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) +%endif + + ; simple_limit + mova m1, m3 + SWAP 1, 3 + mova m6, m4 ; keep copies of p0/q0 around for later use + SWAP 6, 4 + psubusb m1, m4 ; p0-q0 + psubusb m6, m3 ; q0-p0 + por m1, m6 ; abs(q0-p0) + paddusb m1, m1 ; m1=2*abs(q0-p0) + + mova m7, m2 + SWAP 7, 2 + mova m6, m5 + SWAP 6, 5 + psubusb m7, m5 ; p1-q1 + psubusb m6, m2 ; q1-p1 + por m7, m6 ; abs(q1-p1) + pxor m6, m6 + pand m7, [pb_FE] + psrlq m7, 1 ; abs(q1-p1)/2 + paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 + psubusb m7, m_flimE + pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E + pand m0, m7 ; normal_limit result + + ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask +%ifdef m8 ; x86-64 && sse2 + mova m8, [pb_80] +%define m_pb_80 m8 +%else ; x86-32 or mmx/mmxext +%define m_pb_80 [pb_80] +%endif + mova m1, m4 + mova m7, m3 + pxor m1, m_pb_80 + pxor m7, m_pb_80 + psubsb m1, m7 ; (signed) q0-p0 + mova m6, m2 + mova m7, m5 + pxor m6, m_pb_80 + pxor m7, m_pb_80 + psubsb m6, m7 ; (signed) p1-q1 + mova m7, m_maskres + paddsb m6, m1 + paddsb m6, m1 + paddsb m6, m1 + pand m6, m0 +%ifdef m8 + mova m_limres, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge + pand m_limres, m7 +%else + mova m0, m6 + pand m0, m7 + mova m_limres, m0 +%endif + pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common + + mova m1, [pb_F8] + mova m6, m7 + paddsb m7, [pb_3] + paddsb m6, [pb_4] + pand m7, m1 + pand m6, m1 + + pxor m1, m1 + pxor m0, m0 + pcmpgtb m1, m7 + psubb m0, m7 + psrlq m7, 3 ; +f2 + psrlq m0, 3 ; -f2 + pand m0, m1 + pandn m1, m7 + psubusb m3, m0 + paddusb m3, m1 ; p0+f2 + + pxor m1, m1 + pxor m0, m0 + pcmpgtb m0, m6 + psubb m1, m6 + psrlq m6, 3 ; +f1 + psrlq m1, 3 ; -f1 + pand m1, m0 + pandn m0, m6 + psubusb m4, m0 + paddusb m4, m1 ; q0-f1 + + ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) +%if cpuflag(ssse3) + mova m7, [pb_1] +%else + mova m7, [pw_63] +%endif +%ifdef m8 + SWAP 1, 8 +%else + mova m1, m_limres +%endif + pxor m0, m0 + mova m6, m1 + pcmpgtb m0, m1 ; which are negative +%if cpuflag(ssse3) + punpcklbw m6, m7 ; interleave with "1" for rounding + punpckhbw m1, m7 +%else + punpcklbw m6, m0 ; signed byte->word + punpckhbw m1, m0 +%endif + mova m_limsign, m0 +%if cpuflag(ssse3) + mova m7, [pb_27_63] +%ifndef m8 + mova m_limres, m1 +%endif +%ifdef m10 + SWAP 0, 10 ; don't lose lim_sign copy +%endif + mova m0, m7 + pmaddubsw m7, m6 + SWAP 6, 7 + pmaddubsw m0, m1 + SWAP 1, 0 +%ifdef m10 + SWAP 0, 10 +%else + mova m0, m_limsign +%endif +%else + mova m_maskres, m6 ; backup for later in filter + mova m_limres, m1 + pmullw m6, [pw_27] + pmullw m1, [pw_27] + paddw m6, m7 + paddw m1, m7 +%endif + psraw m6, 7 + psraw m1, 7 + packsswb m6, m1 ; a0 + pxor m1, m1 + psubb m1, m6 + pand m1, m0 ; -a0 + pandn m0, m6 ; +a0 +%if cpuflag(ssse3) + mova m6, [pb_18_63] ; pipelining +%endif + psubusb m3, m1 + paddusb m4, m1 + paddusb m3, m0 ; p0+a0 + psubusb m4, m0 ; q0-a0 + +%if cpuflag(ssse3) + SWAP 6, 7 +%ifdef m10 + SWAP 1, 10 +%else + mova m1, m_limres +%endif + mova m0, m7 + pmaddubsw m7, m6 + SWAP 6, 7 + pmaddubsw m0, m1 + SWAP 1, 0 +%ifdef m10 + SWAP 0, 10 +%endif + mova m0, m_limsign +%else + mova m6, m_maskres + mova m1, m_limres + pmullw m6, [pw_18] + pmullw m1, [pw_18] + paddw m6, m7 + paddw m1, m7 +%endif + mova m0, m_limsign + psraw m6, 7 + psraw m1, 7 + packsswb m6, m1 ; a1 + pxor m1, m1 + psubb m1, m6 + pand m1, m0 ; -a1 + pandn m0, m6 ; +a1 +%if cpuflag(ssse3) + mova m6, [pb_9_63] +%endif + psubusb m2, m1 + paddusb m5, m1 + paddusb m2, m0 ; p1+a1 + psubusb m5, m0 ; q1-a1 + +%if cpuflag(ssse3) + SWAP 6, 7 +%ifdef m10 + SWAP 1, 10 +%else + mova m1, m_limres +%endif + mova m0, m7 + pmaddubsw m7, m6 + SWAP 6, 7 + pmaddubsw m0, m1 + SWAP 1, 0 +%else +%ifdef m8 + SWAP 6, 12 + SWAP 1, 8 +%else + mova m6, m_maskres + mova m1, m_limres +%endif + pmullw m6, [pw_9] + pmullw m1, [pw_9] + paddw m6, m7 + paddw m1, m7 +%endif +%ifdef m9 + SWAP 7, 9 +%else + mova m7, m_limsign +%endif + psraw m6, 7 + psraw m1, 7 + packsswb m6, m1 ; a1 + pxor m0, m0 + psubb m0, m6 + pand m0, m7 ; -a1 + pandn m7, m6 ; +a1 +%ifdef m8 + SWAP 1, 13 + SWAP 6, 14 +%else + mova m1, m_p2backup + mova m6, m_q2backup +%endif + psubusb m1, m0 + paddusb m6, m0 + paddusb m1, m7 ; p1+a1 + psubusb m6, m7 ; q1-a1 + + ; store +%ifidn %1, v + movrow [dst2q+mstrideq*4], m1 + movrow [dst1q+mstrideq*2], m2 + movrow [dst1q+mstrideq ], m3 + movrow [dst1q], m4 + movrow [dst2q], m5 + movrow [dst2q+ strideq ], m6 +%if mmsize == 16 && %2 == 8 + add dst8q, mstrideq + movhps [dst8q+mstrideq*2], m1 + movhps [dst8q+mstrideq ], m2 + movhps [dst8q], m3 + add dst8q, strideq + movhps [dst8q], m4 + movhps [dst8q+ strideq ], m5 + movhps [dst8q+ strideq*2], m6 +%endif +%else ; h + inc dst1q + inc dst2q + + ; 4x8/16 transpose + TRANSPOSE4x4B 1, 2, 3, 4, 0 + SBUTTERFLY bw, 5, 6, 0 + + lea dst8q, [dst8q+mstrideq+1] + WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2 + lea dst1q, [dst2q+mstrideq+4] + lea dst8q, [dst8q+mstrideq+4] +%if cpuflag(sse4) + add dst2q, 4 +%endif + WRITE_8W m5, dst2q, dst1q, mstrideq, strideq +%if cpuflag(sse4) + lea dst2q, [dst8q+ strideq ] +%endif + WRITE_8W m6, dst2q, dst8q, mstrideq, strideq +%endif + + RET +%endmacro + +INIT_XMM sse2 +MBEDGE_LOOPFILTER v, 16 +MBEDGE_LOOPFILTER h, 16 +MBEDGE_LOOPFILTER v, 8 +MBEDGE_LOOPFILTER h, 8 + +INIT_XMM ssse3 +MBEDGE_LOOPFILTER v, 16 +MBEDGE_LOOPFILTER h, 16 +MBEDGE_LOOPFILTER v, 8 +MBEDGE_LOOPFILTER h, 8 + +INIT_XMM sse4 +MBEDGE_LOOPFILTER h, 16 +MBEDGE_LOOPFILTER h, 8 diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init.c b/media/ffvpx/libavcodec/x86/vp9dsp_init.c new file mode 100644 index 0000000000..8d11dbc348 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init.c @@ -0,0 +1,415 @@ +/* + * VP9 SIMD optimizations + * + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vp9dsp.h" +#include "libavcodec/x86/vp9dsp_init.h" + +#if HAVE_X86ASM + +decl_fpel_func(put, 4, , mmx); +decl_fpel_func(put, 8, , mmx); +decl_fpel_func(put, 16, , sse); +decl_fpel_func(put, 32, , sse); +decl_fpel_func(put, 64, , sse); +decl_fpel_func(avg, 4, _8, mmxext); +decl_fpel_func(avg, 8, _8, mmxext); +decl_fpel_func(avg, 16, _8, sse2); +decl_fpel_func(avg, 32, _8, sse2); +decl_fpel_func(avg, 64, _8, sse2); +decl_fpel_func(put, 32, , avx); +decl_fpel_func(put, 64, , avx); +decl_fpel_func(avg, 32, _8, avx2); +decl_fpel_func(avg, 64, _8, avx2); + +decl_mc_funcs(4, mmxext, int16_t, 8, 8); +decl_mc_funcs(8, sse2, int16_t, 8, 8); +decl_mc_funcs(4, ssse3, int8_t, 32, 8); +decl_mc_funcs(8, ssse3, int8_t, 32, 8); +#if ARCH_X86_64 +decl_mc_funcs(16, ssse3, int8_t, 32, 8); +decl_mc_funcs(32, avx2, int8_t, 32, 8); +#endif + +mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8) +#if ARCH_X86_32 +mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8) +#endif +mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8) +mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8) +mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8) +mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8) +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8) +#endif + +extern const int8_t ff_filters_ssse3[3][15][4][32]; +extern const int16_t ff_filters_sse2[3][15][8][8]; + +filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2) +filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2) +filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3) +filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3) +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3) +filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3) +filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3) +filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3) +#endif + +filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2) +filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2) +filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3) +filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3) +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3) +filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3) +filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3) +filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3) +#endif + +#define itxfm_func(typea, typeb, size, opt) \ +void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \ + int16_t *block, int eob) +#define itxfm_funcs(size, opt) \ +itxfm_func(idct, idct, size, opt); \ +itxfm_func(iadst, idct, size, opt); \ +itxfm_func(idct, iadst, size, opt); \ +itxfm_func(iadst, iadst, size, opt) + +itxfm_func(idct, idct, 4, mmxext); +itxfm_func(idct, iadst, 4, sse2); +itxfm_func(iadst, idct, 4, sse2); +itxfm_func(iadst, iadst, 4, sse2); +itxfm_funcs(4, ssse3); +itxfm_funcs(8, sse2); +itxfm_funcs(8, ssse3); +itxfm_funcs(8, avx); +itxfm_funcs(16, sse2); +itxfm_funcs(16, ssse3); +itxfm_funcs(16, avx); +itxfm_func(idct, idct, 32, sse2); +itxfm_func(idct, idct, 32, ssse3); +itxfm_func(idct, idct, 32, avx); +itxfm_func(iwht, iwht, 4, mmx); +itxfm_funcs(16, avx2); +itxfm_func(idct, idct, 32, avx2); + +#undef itxfm_func +#undef itxfm_funcs + +#define lpf_funcs(size1, size2, opt) \ +void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H); \ +void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) + +lpf_funcs(4, 8, mmxext); +lpf_funcs(8, 8, mmxext); +lpf_funcs(16, 16, sse2); +lpf_funcs(16, 16, ssse3); +lpf_funcs(16, 16, avx); +lpf_funcs(44, 16, sse2); +lpf_funcs(44, 16, ssse3); +lpf_funcs(44, 16, avx); +lpf_funcs(84, 16, sse2); +lpf_funcs(84, 16, ssse3); +lpf_funcs(84, 16, avx); +lpf_funcs(48, 16, sse2); +lpf_funcs(48, 16, ssse3); +lpf_funcs(48, 16, avx); +lpf_funcs(88, 16, sse2); +lpf_funcs(88, 16, ssse3); +lpf_funcs(88, 16, avx); + +#undef lpf_funcs + +#define ipred_func(size, type, opt) \ +void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \ + const uint8_t *l, const uint8_t *a) + +ipred_func(8, v, mmx); + +#define ipred_dc_funcs(size, opt) \ +ipred_func(size, dc, opt); \ +ipred_func(size, dc_left, opt); \ +ipred_func(size, dc_top, opt) + +ipred_dc_funcs(4, mmxext); +ipred_dc_funcs(8, mmxext); + +#define ipred_dir_tm_funcs(size, opt) \ +ipred_func(size, tm, opt); \ +ipred_func(size, dl, opt); \ +ipred_func(size, dr, opt); \ +ipred_func(size, hd, opt); \ +ipred_func(size, hu, opt); \ +ipred_func(size, vl, opt); \ +ipred_func(size, vr, opt) + +ipred_dir_tm_funcs(4, mmxext); + +ipred_func(16, v, sse); +ipred_func(32, v, sse); + +ipred_dc_funcs(16, sse2); +ipred_dc_funcs(32, sse2); + +#define ipred_dir_tm_h_funcs(size, opt) \ +ipred_dir_tm_funcs(size, opt); \ +ipred_func(size, h, opt) + +ipred_dir_tm_h_funcs(8, sse2); +ipred_dir_tm_h_funcs(16, sse2); +ipred_dir_tm_h_funcs(32, sse2); + +ipred_func(4, h, sse2); + +#define ipred_all_funcs(size, opt) \ +ipred_dc_funcs(size, opt); \ +ipred_dir_tm_h_funcs(size, opt) + +// FIXME hd/vl_4x4_ssse3 does not exist +ipred_all_funcs(4, ssse3); +ipred_all_funcs(8, ssse3); +ipred_all_funcs(16, ssse3); +ipred_all_funcs(32, ssse3); + +ipred_dir_tm_h_funcs(8, avx); +ipred_dir_tm_h_funcs(16, avx); +ipred_dir_tm_h_funcs(32, avx); + +ipred_func(32, v, avx); + +ipred_dc_funcs(32, avx2); +ipred_func(32, h, avx2); +ipred_func(32, tm, avx2); + +#undef ipred_func +#undef ipred_dir_tm_h_funcs +#undef ipred_dir_tm_funcs +#undef ipred_dc_funcs + +#endif /* HAVE_X86ASM */ + +av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) +{ +#if HAVE_X86ASM + int cpu_flags; + + if (bpp == 10) { + ff_vp9dsp_init_10bpp_x86(dsp, bitexact); + return; + } else if (bpp == 12) { + ff_vp9dsp_init_12bpp_x86(dsp, bitexact); + return; + } + + cpu_flags = av_get_cpu_flags(); + +#define init_lpf(opt) do { \ + dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \ + dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \ + dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \ + dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \ + dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \ + dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \ + dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \ + dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \ + dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \ + dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \ +} while (0) + +#define init_ipred(sz, opt, t, e) \ + dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt + +#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext +#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext +#define init_dir_tm_ipred(sz, opt) do { \ + init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \ + init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \ + init_ipred(sz, opt, hd, HOR_DOWN); \ + init_ipred(sz, opt, vl, VERT_LEFT); \ + init_ipred(sz, opt, hu, HOR_UP); \ + init_ipred(sz, opt, tm, TM_VP8); \ + init_ipred(sz, opt, vr, VERT_RIGHT); \ +} while (0) +#define init_dir_tm_h_ipred(sz, opt) do { \ + init_dir_tm_ipred(sz, opt); \ + init_ipred(sz, opt, h, HOR); \ +} while (0) +#define init_dc_ipred(sz, opt) do { \ + init_ipred(sz, opt, dc, DC); \ + init_ipred(sz, opt, dc_left, LEFT_DC); \ + init_ipred(sz, opt, dc_top, TOP_DC); \ +} while (0) +#define init_all_ipred(sz, opt) do { \ + init_dc_ipred(sz, opt); \ + init_dir_tm_h_ipred(sz, opt); \ +} while (0) + + if (EXTERNAL_MMX(cpu_flags)) { + init_fpel_func(4, 0, 4, put, , mmx); + init_fpel_func(3, 0, 8, put, , mmx); + if (!bitexact) { + dsp->itxfm_add[4 /* lossless */][DCT_DCT] = + dsp->itxfm_add[4 /* lossless */][ADST_DCT] = + dsp->itxfm_add[4 /* lossless */][DCT_ADST] = + dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx; + } + init_ipred(8, mmx, v, VERT); + } + + if (EXTERNAL_MMXEXT(cpu_flags)) { + dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext; + dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext; + dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext; + dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext; + init_subpel2(4, 0, 4, put, 8, mmxext); + init_subpel2(4, 1, 4, avg, 8, mmxext); + init_fpel_func(4, 1, 4, avg, _8, mmxext); + init_fpel_func(3, 1, 8, avg, _8, mmxext); + dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; + init_dc_ipred(4, mmxext); + init_dc_ipred(8, mmxext); + init_dir_tm_ipred(4, mmxext); + } + + if (EXTERNAL_SSE(cpu_flags)) { + init_fpel_func(2, 0, 16, put, , sse); + init_fpel_func(1, 0, 32, put, , sse); + init_fpel_func(0, 0, 64, put, , sse); + init_ipred(16, sse, v, VERT); + init_ipred(32, sse, v, VERT); + } + + if (EXTERNAL_SSE2(cpu_flags)) { + init_subpel3_8to64(0, put, 8, sse2); + init_subpel3_8to64(1, avg, 8, sse2); + init_fpel_func(2, 1, 16, avg, _8, sse2); + init_fpel_func(1, 1, 32, avg, _8, sse2); + init_fpel_func(0, 1, 64, avg, _8, sse2); + init_lpf(sse2); + dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2; + dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2; + dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2; + dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2; + dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2; + dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2; + dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2; + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2; + init_dc_ipred(16, sse2); + init_dc_ipred(32, sse2); + init_dir_tm_h_ipred(8, sse2); + init_dir_tm_h_ipred(16, sse2); + init_dir_tm_h_ipred(32, sse2); + init_ipred(4, sse2, h, HOR); + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + init_subpel3(0, put, 8, ssse3); + init_subpel3(1, avg, 8, ssse3); + dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3; + dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3; + dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3; + dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3; + dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3; + dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3; + dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3; + dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3; + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3; + init_lpf(ssse3); + init_all_ipred(4, ssse3); + init_all_ipred(8, ssse3); + init_all_ipred(16, ssse3); + init_all_ipred(32, ssse3); + } + + if (EXTERNAL_AVX(cpu_flags)) { + dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx; + dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx; + dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx; + dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx; + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx; + init_lpf(avx); + init_dir_tm_h_ipred(8, avx); + init_dir_tm_h_ipred(16, avx); + init_dir_tm_h_ipred(32, avx); + } + if (EXTERNAL_AVX_FAST(cpu_flags)) { + init_fpel_func(1, 0, 32, put, , avx); + init_fpel_func(0, 0, 64, put, , avx); + init_ipred(32, avx, v, VERT); + } + + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + init_fpel_func(1, 1, 32, avg, _8, avx2); + init_fpel_func(0, 1, 64, avg, _8, avx2); + if (ARCH_X86_64) { +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2; + init_subpel3_32_64(0, put, 8, avx2); + init_subpel3_32_64(1, avg, 8, avx2); +#endif + } + init_dc_ipred(32, avx2); + init_ipred(32, avx2, h, HOR); + init_ipred(32, avx2, tm, TM_VP8); + } + +#undef init_fpel +#undef init_subpel1 +#undef init_subpel2 +#undef init_subpel3 + +#endif /* HAVE_X86ASM */ +} diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init.h b/media/ffvpx/libavcodec/x86/vp9dsp_init.h new file mode 100644 index 0000000000..fc1e0557fa --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init.h @@ -0,0 +1,192 @@ +/* + * VP9 SIMD optimizations + * + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_VP9DSP_INIT_H +#define AVCODEC_X86_VP9DSP_INIT_H + +#include "libavutil/attributes.h" +#include "libavutil/mem_internal.h" + +#include "libavcodec/vp9dsp.h" + +// hack to force-expand BPC +#define cat(a, bpp, b) a##bpp##b + +#define decl_fpel_func(avg, sz, bpp, opt) \ +void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) + +#define decl_mc_func(avg, sz, dir, opt, type, f_sz, bpp) \ +void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, const type (*filter)[f_sz]) + +#define decl_mc_funcs(sz, opt, type, fsz, bpp) \ +decl_mc_func(put, sz, h, opt, type, fsz, bpp); \ +decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \ +decl_mc_func(put, sz, v, opt, type, fsz, bpp); \ +decl_mc_func(avg, sz, v, opt, type, fsz, bpp) + +#define decl_ipred_fn(type, sz, bpp, opt) \ +void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \ + ptrdiff_t stride, \ + const uint8_t *l, \ + const uint8_t *a) + +#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \ +decl_ipred_fn(type, 4, bpp, opt4); \ +decl_ipred_fn(type, 8, bpp, opt8_16_32); \ +decl_ipred_fn(type, 16, bpp, opt8_16_32); \ +decl_ipred_fn(type, 32, bpp, opt8_16_32) + +#define decl_itxfm_func(typea, typeb, size, bpp, opt) \ +void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t *dst, \ + ptrdiff_t stride, \ + int16_t *block, \ + int eob) + +#define decl_itxfm_funcs(size, bpp, opt) \ +decl_itxfm_func(idct, idct, size, bpp, opt); \ +decl_itxfm_func(iadst, idct, size, bpp, opt); \ +decl_itxfm_func(idct, iadst, size, bpp, opt); \ +decl_itxfm_func(iadst, iadst, size, bpp, opt) + +#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \ +static av_always_inline void \ +ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, const type (*filter)[f_sz]) \ +{ \ + ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst, dst_stride, src, \ + src_stride, h, filter); \ + ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst + hszb, dst_stride, src + hszb, \ + src_stride, h, filter); \ +} + +#define mc_rep_funcs(sz, hsz, hszb, opt, type, fsz, bpp) \ +mc_rep_func(put, sz, hsz, hszb, h, opt, type, fsz, bpp) \ +mc_rep_func(avg, sz, hsz, hszb, h, opt, type, fsz, bpp) \ +mc_rep_func(put, sz, hsz, hszb, v, opt, type, fsz, bpp) \ +mc_rep_func(avg, sz, hsz, hszb, v, opt, type, fsz, bpp) + +#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, bpp, opt) \ +static void op##_8tap_##fname##_##sz##dir##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + ff_vp9_##op##_8tap_1d_##dir##_##sz##_##bpp##_##opt(dst, dst_stride, src, src_stride, \ + h, ff_filters_##f_opt[f][dvar - 1]); \ +} + +#define filters_8tap_1d_fn(op, sz, dir, dvar, bpp, opt, f_opt) \ +filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, bpp, opt) \ +filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, bpp, opt) \ +filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, bpp, opt) + +#define filters_8tap_1d_fn2(op, sz, bpp, opt, f_opt) \ +filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \ +filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt) + +#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 8, bpp, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 4, bpp, opt4, f_opt) + +#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \ +static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64 * bytes]); \ + ff_vp9_put_8tap_1d_h_##sz##_##bpp##_##opt(temp, 64 * bytes, src - 3 * src_stride, \ + src_stride, h + 7, \ + ff_filters_##f_opt[f][mx - 1]); \ + ff_vp9_##op##_8tap_1d_v_##sz##_##bpp##_##opt(dst, dst_stride, temp + 3 * bytes * 64, \ + 64 * bytes, h, \ + ff_filters_##f_opt[f][my - 1]); \ +} + +#define filters_8tap_2d_fn(op, sz, align, bpp, bytes, opt, f_opt) \ +filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes, opt) \ +filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, bpp, bytes, opt) \ +filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, bpp, bytes, opt) + +#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \ +filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \ +filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \ +filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \ +filters_8tap_2d_fn(op, 8, align, bpp, bytes, opt8, f_opt) \ +filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt) + +#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt + +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, bpp, opt) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \ + type##_8tap_smooth_##sz##dir##_##bpp##_##opt; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \ + type##_8tap_regular_##sz##dir##_##bpp##_##opt; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \ + type##_8tap_sharp_##sz##dir##_##bpp##_##opt + +#define init_subpel2(idx1, idx2, sz, type, bpp, opt) \ + init_subpel1(idx1, idx2, 1, 1, sz, hv, type, bpp, opt); \ + init_subpel1(idx1, idx2, 0, 1, sz, v, type, bpp, opt); \ + init_subpel1(idx1, idx2, 1, 0, sz, h, type, bpp, opt) + +#define init_subpel3_32_64(idx, type, bpp, opt) \ + init_subpel2(0, idx, 64, type, bpp, opt); \ + init_subpel2(1, idx, 32, type, bpp, opt) + +#define init_subpel3_8to64(idx, type, bpp, opt) \ + init_subpel3_32_64(idx, type, bpp, opt); \ + init_subpel2(2, idx, 16, type, bpp, opt); \ + init_subpel2(3, idx, 8, type, bpp, opt) + +#define init_subpel3(idx, type, bpp, opt) \ + init_subpel3_8to64(idx, type, bpp, opt); \ + init_subpel2(4, idx, 4, type, bpp, opt) + +#define init_ipred_func(type, enum, sz, bpp, opt) \ + dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \ + cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt) + +#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \ + init_ipred_func(type, enum, 8, bpp, opt); \ + init_ipred_func(type, enum, 16, bpp, opt); \ + init_ipred_func(type, enum, 32, bpp, opt) + +#define init_ipred_funcs(type, enum, bpp, opt) \ + init_ipred_func(type, enum, 4, bpp, opt); \ + init_8_16_32_ipred_funcs(type, enum, bpp, opt) + +void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp, int bitexact); +void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp, int bitexact); +void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp); + +#endif /* AVCODEC_X86_VP9DSP_INIT_H */ diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init_10bpp.c b/media/ffvpx/libavcodec/x86/vp9dsp_init_10bpp.c new file mode 100644 index 0000000000..2694c06cb2 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init_10bpp.c @@ -0,0 +1,25 @@ +/* + * VP9 SIMD optimizations + * + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define BPC 10 +#define INIT_FUNC ff_vp9dsp_init_10bpp_x86 +#include "vp9dsp_init_16bpp_template.c" diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init_12bpp.c b/media/ffvpx/libavcodec/x86/vp9dsp_init_12bpp.c new file mode 100644 index 0000000000..5da3bc1840 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init_12bpp.c @@ -0,0 +1,25 @@ +/* + * VP9 SIMD optimizations + * + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define BPC 12 +#define INIT_FUNC ff_vp9dsp_init_12bpp_x86 +#include "vp9dsp_init_16bpp_template.c" diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c new file mode 100644 index 0000000000..e5afea1512 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c @@ -0,0 +1,152 @@ +/* + * VP9 SIMD optimizations + * + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vp9dsp.h" +#include "libavcodec/x86/vp9dsp_init.h" + +#if HAVE_X86ASM + +decl_fpel_func(put, 8, , mmx); +decl_fpel_func(avg, 8, _16, mmxext); +decl_fpel_func(put, 16, , sse); +decl_fpel_func(put, 32, , sse); +decl_fpel_func(put, 64, , sse); +decl_fpel_func(put, 128, , sse); +decl_fpel_func(avg, 16, _16, sse2); +decl_fpel_func(avg, 32, _16, sse2); +decl_fpel_func(avg, 64, _16, sse2); +decl_fpel_func(avg, 128, _16, sse2); +decl_fpel_func(put, 32, , avx); +decl_fpel_func(put, 64, , avx); +decl_fpel_func(put, 128, , avx); +decl_fpel_func(avg, 32, _16, avx2); +decl_fpel_func(avg, 64, _16, avx2); +decl_fpel_func(avg, 128, _16, avx2); + +decl_ipred_fns(v, 16, mmx, sse); +decl_ipred_fns(h, 16, mmxext, sse2); +decl_ipred_fns(dc, 16, mmxext, sse2); +decl_ipred_fns(dc_top, 16, mmxext, sse2); +decl_ipred_fns(dc_left, 16, mmxext, sse2); +decl_ipred_fn(dl, 16, 16, avx2); +decl_ipred_fn(dl, 32, 16, avx2); +decl_ipred_fn(dr, 16, 16, avx2); +decl_ipred_fn(dr, 32, 16, avx2); +decl_ipred_fn(vl, 16, 16, avx2); +decl_ipred_fn(hd, 16, 16, avx2); + +#define decl_ipred_dir_funcs(type) \ +decl_ipred_fns(type, 16, sse2, sse2); \ +decl_ipred_fns(type, 16, ssse3, ssse3); \ +decl_ipred_fns(type, 16, avx, avx) + +decl_ipred_dir_funcs(dl); +decl_ipred_dir_funcs(dr); +decl_ipred_dir_funcs(vl); +decl_ipred_dir_funcs(vr); +decl_ipred_dir_funcs(hu); +decl_ipred_dir_funcs(hd); +#endif /* HAVE_X86ASM */ + +av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + init_fpel_func(4, 0, 8, put, , mmx); + init_ipred_func(v, VERT, 4, 16, mmx); + } + + if (EXTERNAL_MMXEXT(cpu_flags)) { + init_fpel_func(4, 1, 8, avg, _16, mmxext); + init_ipred_func(h, HOR, 4, 16, mmxext); + init_ipred_func(dc, DC, 4, 16, mmxext); + init_ipred_func(dc_top, TOP_DC, 4, 16, mmxext); + init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext); + } + + if (EXTERNAL_SSE(cpu_flags)) { + init_fpel_func(3, 0, 16, put, , sse); + init_fpel_func(2, 0, 32, put, , sse); + init_fpel_func(1, 0, 64, put, , sse); + init_fpel_func(0, 0, 128, put, , sse); + init_8_16_32_ipred_funcs(v, VERT, 16, sse); + } + + if (EXTERNAL_SSE2(cpu_flags)) { + init_fpel_func(3, 1, 16, avg, _16, sse2); + init_fpel_func(2, 1, 32, avg, _16, sse2); + init_fpel_func(1, 1, 64, avg, _16, sse2); + init_fpel_func(0, 1, 128, avg, _16, sse2); + init_8_16_32_ipred_funcs(h, HOR, 16, sse2); + init_8_16_32_ipred_funcs(dc, DC, 16, sse2); + init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2); + init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2); + init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2); + init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2); + init_ipred_funcs(vl, VERT_LEFT, 16, sse2); + init_ipred_funcs(vr, VERT_RIGHT, 16, sse2); + init_ipred_funcs(hu, HOR_UP, 16, sse2); + init_ipred_funcs(hd, HOR_DOWN, 16, sse2); + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3); + init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3); + init_ipred_funcs(vl, VERT_LEFT, 16, ssse3); + init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3); + init_ipred_funcs(hu, HOR_UP, 16, ssse3); + init_ipred_funcs(hd, HOR_DOWN, 16, ssse3); + } + + if (EXTERNAL_AVX_FAST(cpu_flags)) { + init_fpel_func(2, 0, 32, put, , avx); + init_fpel_func(1, 0, 64, put, , avx); + init_fpel_func(0, 0, 128, put, , avx); + init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx); + init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx); + init_ipred_funcs(vl, VERT_LEFT, 16, avx); + init_ipred_funcs(vr, VERT_RIGHT, 16, avx); + init_ipred_funcs(hu, HOR_UP, 16, avx); + init_ipred_funcs(hd, HOR_DOWN, 16, avx); + } + + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + init_fpel_func(2, 1, 32, avg, _16, avx2); + init_fpel_func(1, 1, 64, avg, _16, avx2); + init_fpel_func(0, 1, 128, avg, _16, avx2); + init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); + init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); + init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2); + init_ipred_func(vl, VERT_LEFT, 16, 16, avx2); + init_ipred_func(hd, HOR_DOWN, 16, 16, avx2); +#if ARCH_X86_64 + init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2); +#endif + } + +#endif /* HAVE_X86ASM */ +} diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c new file mode 100644 index 0000000000..f93ea2468e --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c @@ -0,0 +1,239 @@ +/* + * VP9 SIMD optimizations + * + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vp9dsp.h" +#include "libavcodec/x86/vp9dsp_init.h" + +#if HAVE_X86ASM + +extern const int16_t ff_filters_16bpp[3][15][4][16]; + +decl_mc_funcs(4, sse2, int16_t, 16, BPC); +decl_mc_funcs(8, sse2, int16_t, 16, BPC); +decl_mc_funcs(16, avx2, int16_t, 16, BPC); + +mc_rep_funcs(16, 8, 16, sse2, int16_t, 16, BPC) +mc_rep_funcs(32, 16, 32, sse2, int16_t, 16, BPC) +mc_rep_funcs(64, 32, 64, sse2, int16_t, 16, BPC) +#if HAVE_AVX2_EXTERNAL +mc_rep_funcs(32, 16, 32, avx2, int16_t, 16, BPC) +mc_rep_funcs(64, 32, 64, avx2, int16_t, 16, BPC) +#endif + +filters_8tap_2d_fn2(put, 16, BPC, 2, sse2, sse2, 16bpp) +filters_8tap_2d_fn2(avg, 16, BPC, 2, sse2, sse2, 16bpp) +#if HAVE_AVX2_EXTERNAL +filters_8tap_2d_fn(put, 64, 32, BPC, 2, avx2, 16bpp) +filters_8tap_2d_fn(avg, 64, 32, BPC, 2, avx2, 16bpp) +filters_8tap_2d_fn(put, 32, 32, BPC, 2, avx2, 16bpp) +filters_8tap_2d_fn(avg, 32, 32, BPC, 2, avx2, 16bpp) +filters_8tap_2d_fn(put, 16, 32, BPC, 2, avx2, 16bpp) +filters_8tap_2d_fn(avg, 16, 32, BPC, 2, avx2, 16bpp) +#endif + +filters_8tap_1d_fn3(put, BPC, sse2, sse2, 16bpp) +filters_8tap_1d_fn3(avg, BPC, sse2, sse2, 16bpp) +#if HAVE_AVX2_EXTERNAL +filters_8tap_1d_fn2(put, 64, BPC, avx2, 16bpp) +filters_8tap_1d_fn2(avg, 64, BPC, avx2, 16bpp) +filters_8tap_1d_fn2(put, 32, BPC, avx2, 16bpp) +filters_8tap_1d_fn2(avg, 32, BPC, avx2, 16bpp) +filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp) +filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp) +#endif + +#define decl_lpf_func(dir, wd, bpp, opt) \ +void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) + +#define decl_lpf_funcs(dir, wd, bpp) \ +decl_lpf_func(dir, wd, bpp, sse2); \ +decl_lpf_func(dir, wd, bpp, ssse3); \ +decl_lpf_func(dir, wd, bpp, avx) + +#define decl_lpf_funcs_wd(dir) \ +decl_lpf_funcs(dir, 4, BPC); \ +decl_lpf_funcs(dir, 8, BPC); \ +decl_lpf_funcs(dir, 16, BPC) + +decl_lpf_funcs_wd(h); +decl_lpf_funcs_wd(v); + +#define lpf_16_wrapper(dir, off, bpp, opt) \ +static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) \ +{ \ + ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst, stride, E, I, H); \ + ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \ +} + +#define lpf_16_wrappers(bpp, opt) \ +lpf_16_wrapper(h, 8 * stride, bpp, opt) \ +lpf_16_wrapper(v, 16, bpp, opt) + +lpf_16_wrappers(BPC, sse2) +lpf_16_wrappers(BPC, ssse3) +lpf_16_wrappers(BPC, avx) + +#define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) \ +static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) \ +{ \ + ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst, stride, \ + E & 0xff, I & 0xff, H & 0xff); \ + ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \ + E >> 8, I >> 8, H >> 8); \ +} + +#define lpf_mix2_wrappers(wd1, wd2, bpp, opt) \ +lpf_mix2_wrapper(h, 8 * stride, wd1, wd2, bpp, opt) \ +lpf_mix2_wrapper(v, 16, wd1, wd2, bpp, opt) + +#define lpf_mix2_wrappers_set(bpp, opt) \ +lpf_mix2_wrappers(4, 4, bpp, opt) \ +lpf_mix2_wrappers(4, 8, bpp, opt) \ +lpf_mix2_wrappers(8, 4, bpp, opt) \ +lpf_mix2_wrappers(8, 8, bpp, opt) \ + +lpf_mix2_wrappers_set(BPC, sse2) +lpf_mix2_wrappers_set(BPC, ssse3) +lpf_mix2_wrappers_set(BPC, avx) + +decl_ipred_fns(tm, BPC, mmxext, sse2); + +decl_itxfm_func(iwht, iwht, 4, BPC, mmxext); +#if BPC == 10 +decl_itxfm_func(idct, idct, 4, BPC, mmxext); +decl_itxfm_funcs(4, BPC, ssse3); +#else +decl_itxfm_func(idct, idct, 4, BPC, sse2); +#endif +decl_itxfm_func(idct, iadst, 4, BPC, sse2); +decl_itxfm_func(iadst, idct, 4, BPC, sse2); +decl_itxfm_func(iadst, iadst, 4, BPC, sse2); +decl_itxfm_funcs(8, BPC, sse2); +decl_itxfm_funcs(16, BPC, sse2); +decl_itxfm_func(idct, idct, 32, BPC, sse2); +#endif /* HAVE_X86ASM */ + +av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + +#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \ + dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt +#define init_lpf_16_func(idx, dir, bpp, opt) \ + dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt +#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \ + dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt + +#define init_lpf_funcs(bpp, opt) \ + init_lpf_8_func(0, 0, h, 4, bpp, opt); \ + init_lpf_8_func(0, 1, v, 4, bpp, opt); \ + init_lpf_8_func(1, 0, h, 8, bpp, opt); \ + init_lpf_8_func(1, 1, v, 8, bpp, opt); \ + init_lpf_8_func(2, 0, h, 16, bpp, opt); \ + init_lpf_8_func(2, 1, v, 16, bpp, opt); \ + init_lpf_16_func(0, h, bpp, opt); \ + init_lpf_16_func(1, v, bpp, opt); \ + init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \ + init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \ + init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \ + init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \ + init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \ + init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \ + init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \ + init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt) + +#define init_itx_func(idxa, idxb, typea, typeb, size, bpp, opt) \ + dsp->itxfm_add[idxa][idxb] = \ + cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt); +#define init_itx_func_one(idx, typea, typeb, size, bpp, opt) \ + init_itx_func(idx, DCT_DCT, typea, typeb, size, bpp, opt); \ + init_itx_func(idx, ADST_DCT, typea, typeb, size, bpp, opt); \ + init_itx_func(idx, DCT_ADST, typea, typeb, size, bpp, opt); \ + init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt) +#define init_itx_funcs(idx, size, bpp, opt) \ + init_itx_func(idx, DCT_DCT, idct, idct, size, bpp, opt); \ + init_itx_func(idx, ADST_DCT, idct, iadst, size, bpp, opt); \ + init_itx_func(idx, DCT_ADST, iadst, idct, size, bpp, opt); \ + init_itx_func(idx, ADST_ADST, iadst, iadst, size, bpp, opt); \ + + if (EXTERNAL_MMXEXT(cpu_flags)) { + init_ipred_func(tm, TM_VP8, 4, BPC, mmxext); + if (!bitexact) { + init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext); +#if BPC == 10 + init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext); +#endif + } + } + + if (EXTERNAL_SSE2(cpu_flags)) { + init_subpel3(0, put, BPC, sse2); + init_subpel3(1, avg, BPC, sse2); + init_lpf_funcs(BPC, sse2); + init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2); +#if BPC == 10 + if (!bitexact) { + init_itx_func(TX_4X4, ADST_DCT, idct, iadst, 4, 10, sse2); + init_itx_func(TX_4X4, DCT_ADST, iadst, idct, 4, 10, sse2); + init_itx_func(TX_4X4, ADST_ADST, iadst, iadst, 4, 10, sse2); + } +#else + init_itx_funcs(TX_4X4, 4, 12, sse2); +#endif + init_itx_funcs(TX_8X8, 8, BPC, sse2); + init_itx_funcs(TX_16X16, 16, BPC, sse2); + init_itx_func_one(TX_32X32, idct, idct, 32, BPC, sse2); + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + init_lpf_funcs(BPC, ssse3); +#if BPC == 10 + if (!bitexact) { + init_itx_funcs(TX_4X4, 4, BPC, ssse3); + } +#endif + } + + if (EXTERNAL_AVX(cpu_flags)) { + init_lpf_funcs(BPC, avx); + } + + if (EXTERNAL_AVX2_FAST(cpu_flags)) { +#if HAVE_AVX2_EXTERNAL + init_subpel3_32_64(0, put, BPC, avx2); + init_subpel3_32_64(1, avg, BPC, avx2); + init_subpel2(2, 0, 16, put, BPC, avx2); + init_subpel2(2, 1, 16, avg, BPC, avx2); +#endif + } + +#endif /* HAVE_X86ASM */ + + ff_vp9dsp_init_16bpp_x86(dsp); +} diff --git a/media/ffvpx/libavcodec/x86/vp9intrapred.asm b/media/ffvpx/libavcodec/x86/vp9intrapred.asm new file mode 100644 index 0000000000..31f7d449fd --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9intrapred.asm @@ -0,0 +1,2044 @@ +;****************************************************************************** +;* VP9 Intra prediction SIMD optimizations +;* +;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> +;* +;* Parts based on: +;* H.264 intra prediction asm optimizations +;* Copyright (c) 2010 Fiona Glaser +;* Copyright (c) 2010 Holger Lubitz +;* Copyright (c) 2010 Loren Merritt +;* Copyright (c) 2010 Ronald S. Bultje +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pw_m256: times 16 dw -256 +pw_m255: times 16 dw -255 +pw_4096: times 8 dw 4096 + +pb_4x3_4x2_4x1_4x0: times 4 db 3 + times 4 db 2 + times 4 db 1 + times 4 db 0 +pb_8x1_8x0: times 8 db 1 + times 8 db 0 +pb_8x3_8x2: times 8 db 3 + times 8 db 2 +pb_0to5_2x7: db 0, 1, 2, 3, 4, 5, 7, 7 + times 8 db -1 +pb_0to6_9x7: db 0, 1, 2, 3, 4, 5, 6 + times 9 db 7 +pb_1to6_10x7: db 1, 2, 3, 4, 5, 6 + times 10 db 7 +pb_2to6_3x7: +pb_2to6_11x7: db 2, 3, 4, 5, 6 + times 11 db 7 +pb_1toE_2xF: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 +pb_2toE_3xF: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 +pb_13456_3xm1: db 1, 3, 4, 5, 6 + times 3 db -1 +pb_6012_4xm1: db 6, 0, 1, 2 + times 4 db -1 +pb_6xm1_246_8toE: times 6 db -1 + db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14 +pb_6xm1_BDF_0to6: times 6 db -1 + db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6 +pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + +pb_15x0_1xm1: times 15 db 0 + db -1 +pb_0to2_5x3: db 0, 1, 2 + times 5 db 3 +pb_6xm1_2x0: times 6 db -1 + times 2 db 0 +pb_6x0_2xm1: times 6 db 0 + times 2 db -1 + +cextern pb_1 +cextern pb_2 +cextern pb_3 +cextern pb_15 +cextern pw_2 +cextern pw_4 +cextern pw_8 +cextern pw_16 +cextern pw_32 +cextern pw_255 +cextern pw_512 +cextern pw_1024 +cextern pw_2048 +cextern pw_8192 + +SECTION .text + +; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) + +%macro DC_4to8_FUNCS 0 +cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a + movd m0, [lq] + punpckldq m0, [aq] + pxor m1, m1 + psadbw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_4096] + pshufb m0, m1 +%else + paddw m0, [pw_4] + psraw m0, 3 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + RET + +cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a + movq m0, [lq] + movq m1, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_2048] + pshufb m0, m2 +%else + paddw m0, [pw_8] + psraw m0, 4 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET +%endmacro + +INIT_MMX mmxext +DC_4to8_FUNCS +INIT_MMX ssse3 +DC_4to8_FUNCS + +%macro DC_16to32_FUNCS 0 +cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a + mova m0, [lq] + mova m1, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_1024] + pshufb m0, m2 +%else + paddw m0, [pw_16] + psraw m0, 5 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a + mova m0, [lq] + mova m1, [lq+16] + mova m2, [aq] + mova m3, [aq+16] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m4, m4 + psadbw m0, m4 + psadbw m1, m4 + psadbw m2, m4 + psadbw m3, m4 + paddw m0, m1 + paddw m2, m3 + paddw m0, m2 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_512] + pshufb m0, m4 +%else + paddw m0, [pw_32] + psraw m0, 6 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif + mov cntd, 8 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DC_16to32_FUNCS +INIT_XMM ssse3 +DC_16to32_FUNCS + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a + mova m0, [lq] + mova m1, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + movhlps xm1, xm0 + paddw xm0, xm1 + pmulhrsw xm0, [pw_512] + vpbroadcastb m0, xm0 + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif + +; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) + +%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l) +cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a + movd m0, [%2q] + pxor m1, m1 + psadbw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_8192] + pshufb m0, m1 +%else + paddw m0, [pw_2] + psraw m0, 2 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + RET + +cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a + movq m0, [%2q] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pxor m1, m1 + psadbw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_4096] + pshufb m0, m1 +%else + paddw m0, [pw_4] + psraw m0, 3 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET +%endmacro + +INIT_MMX mmxext +DC_1D_4to8_FUNCS top, a +DC_1D_4to8_FUNCS left, l +INIT_MMX ssse3 +DC_1D_4to8_FUNCS top, a +DC_1D_4to8_FUNCS left, l + +%macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l) +cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a + mova m0, [%2q] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_2048] + pshufb m0, m2 +%else + paddw m0, [pw_8] + psraw m0, 4 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a + mova m0, [%2q] + mova m1, [%2q+16] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_1024] + pshufb m0, m2 +%else + paddw m0, [pw_16] + psraw m0, 5 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif + mov cntd, 8 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DC_1D_16to32_FUNCS top, a +DC_1D_16to32_FUNCS left, l +INIT_XMM ssse3 +DC_1D_16to32_FUNCS top, a +DC_1D_16to32_FUNCS left, l + +%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l) +%if HAVE_AVX2_EXTERNAL +cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a + mova m0, [%2q] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + movhlps xm1, xm0 + paddw xm0, xm1 + pmulhrsw xm0, [pw_1024] + vpbroadcastb m0, xm0 + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif +%endmacro + +INIT_YMM avx2 +DC_1D_AVX2_FUNCS top, a +DC_1D_AVX2_FUNCS left, l + +; v + +INIT_MMX mmx +cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a + movq m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse +cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse +cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a + mova m0, [aq] + mova m1, [aq+16] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 8 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_YMM avx +cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +; h + +%macro H_XMM_FUNCS 2 +%if notcpuflag(avx) +cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3 + movd m0, [lq] +%if cpuflag(ssse3) + pshufb m0, [pb_4x3_4x2_4x1_4x0] +%else + punpcklbw m0, m0 + pshuflw m0, m0, q0123 + punpcklwd m0, m0 +%endif + lea stride3q, [strideq*3] + movd [dstq+strideq*0], m0 + psrldq m0, 4 + movd [dstq+strideq*1], m0 + psrldq m0, 4 + movd [dstq+strideq*2], m0 + psrldq m0, 4 + movd [dstq+stride3q ], m0 + RET +%endif + +cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt +%if cpuflag(ssse3) + mova m2, [pb_8x1_8x0] + mova m3, [pb_8x3_8x2] +%endif + lea stride3q, [strideq*3] + mov cntq, 1 +.loop: + movd m0, [lq+cntq*4] +%if cpuflag(ssse3) + pshufb m1, m0, m3 + pshufb m0, m2 +%else + punpcklbw m0, m0 + punpcklwd m0, m0 + pshufd m1, m0, q2233 + pshufd m0, m0, q0011 +%endif + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + movq [dstq+strideq*2], m0 + movhps [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET + +cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt +%if cpuflag(ssse3) + mova m5, [pb_1] + mova m6, [pb_2] + mova m7, [pb_3] + pxor m4, m4 +%endif + lea stride3q, [strideq*3] + mov cntq, 3 +.loop: + movd m3, [lq+cntq*4] +%if cpuflag(ssse3) + pshufb m0, m3, m7 + pshufb m1, m3, m6 +%else + punpcklbw m3, m3 + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 +%endif + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 +%if cpuflag(ssse3) + pshufb m2, m3, m5 + pshufb m3, m4 +%else + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 +%endif + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET + +cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt +%if cpuflag(ssse3) + mova m5, [pb_1] + mova m6, [pb_2] + mova m7, [pb_3] + pxor m4, m4 +%endif + lea stride3q, [strideq*3] + mov cntq, 7 +.loop: + movd m3, [lq+cntq*4] +%if cpuflag(ssse3) + pshufb m0, m3, m7 + pshufb m1, m3, m6 +%else + punpcklbw m3, m3 + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 +%endif + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m1 +%if cpuflag(ssse3) + pshufb m2, m3, m5 + pshufb m3, m4 +%else + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 +%endif + mova [dstq+strideq*2+ 0], m2 + mova [dstq+strideq*2+16], m2 + mova [dstq+stride3q + 0], m3 + mova [dstq+stride3q +16], m3 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET +%endmacro + +INIT_XMM sse2 +H_XMM_FUNCS 2, 4 +INIT_XMM ssse3 +H_XMM_FUNCS 4, 8 +INIT_XMM avx +H_XMM_FUNCS 4, 8 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt + mova m5, [pb_1] + mova m6, [pb_2] + mova m7, [pb_3] + pxor m4, m4 + lea stride3q, [strideq*3] + mov cntq, 7 +.loop: + movd xm3, [lq+cntq*4] + vinserti128 m3, m3, xm3, 1 + pshufb m0, m3, m7 + pshufb m1, m3, m6 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufb m2, m3, m5 + pshufb m3, m4 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET +%endif + +; tm + +%macro TM_MMX_FUNCS 0 +cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a + pxor m1, m1 + movd m0, [aq] + pinsrw m2, [aq-1], 0 + punpcklbw m0, m1 + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) + mova m3, [pw_m256] + mova m1, [pw_m255] + pshufb m2, m3 +%else + punpcklbw m2, m1 + pshufw m2, m2, q0000 +%endif + psubw m0, m2 + mov cntq, 1 +.loop: + pinsrw m2, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m4, m2, m1 + pshufb m2, m3 +%else + punpcklbw m2, m1 + pshufw m4, m2, q1111 + pshufw m2, m2, q0000 +%endif + paddw m4, m0 + paddw m2, m0 + packuswb m4, m4 + packuswb m2, m2 + movd [dstq+strideq*0], m4 + movd [dstq+strideq*1], m2 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET +%endmacro + +INIT_MMX mmxext +TM_MMX_FUNCS +INIT_MMX ssse3 +TM_MMX_FUNCS + +%macro TM_XMM_FUNCS 0 +cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a + pxor m1, m1 + movh m0, [aq] + pinsrw m2, [aq-1], 0 + punpcklbw m0, m1 + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) + mova m3, [pw_m256] + mova m1, [pw_m255] + pshufb m2, m3 +%else + punpcklbw m2, m1 + punpcklwd m2, m2 + pshufd m2, m2, q0000 +%endif + psubw m0, m2 + mov cntq, 3 +.loop: + pinsrw m2, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m4, m2, m1 + pshufb m2, m3 +%else + punpcklbw m2, m1 + punpcklwd m2, m2 + pshufd m4, m2, q1111 + pshufd m2, m2, q0000 +%endif + paddw m4, m0 + paddw m2, m0 + packuswb m4, m2 + movh [dstq+strideq*0], m4 + movhps [dstq+strideq*1], m4 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET + +cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a + pxor m3, m3 + mova m0, [aq] + pinsrw m2, [aq-1], 0 + punpckhbw m1, m0, m3 + punpcklbw m0, m3 + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) + mova m4, [pw_m256] + mova m3, [pw_m255] + pshufb m2, m4 +%else + punpcklbw m2, m3 + punpcklwd m2, m2 + pshufd m2, m2, q0000 +%endif + psubw m1, m2 + psubw m0, m2 + mov cntq, 7 +.loop: + pinsrw m7, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m5, m7, m3 + pshufb m7, m4 +%else + punpcklbw m7, m3 + punpcklwd m7, m7 + pshufd m5, m7, q1111 + pshufd m7, m7, q0000 +%endif + paddw m2, m5, m0 + paddw m5, m1 + paddw m6, m7, m0 + paddw m7, m1 + packuswb m2, m5 + packuswb m6, m7 + mova [dstq+strideq*0], m2 + mova [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET + +%if ARCH_X86_64 +%define mem 0 +%else +%define mem 64 +%endif +cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a + pxor m5, m5 + pinsrw m4, [aq-1], 0 + mova m0, [aq] + mova m2, [aq+16] + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) +%if ARCH_X86_64 + mova m12, [pw_m256] + mova m13, [pw_m255] +%define pw_m256_reg m12 +%define pw_m255_reg m13 +%else +%define pw_m256_reg [pw_m256] +%define pw_m255_reg [pw_m255] +%endif + pshufb m4, pw_m256_reg +%else + punpcklbw m4, m5 + punpcklwd m4, m4 + pshufd m4, m4, q0000 +%endif + punpckhbw m1, m0, m5 + punpckhbw m3, m2, m5 + punpcklbw m0, m5 + punpcklbw m2, m5 + psubw m1, m4 + psubw m0, m4 + psubw m3, m4 + psubw m2, m4 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 +%else + mova [rsp+0*16], m0 + mova [rsp+1*16], m1 + mova [rsp+2*16], m2 + mova [rsp+3*16], m3 +%endif + mov cntq, 15 +.loop: + pinsrw m3, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m7, m3, pw_m255_reg + pshufb m3, pw_m256_reg +%else + pxor m7, m7 + punpcklbw m3, m7 + punpcklwd m3, m3 + pshufd m7, m3, q1111 + pshufd m3, m3, q0000 +%endif +%if ARCH_X86_64 + paddw m4, m7, m8 + paddw m5, m7, m9 + paddw m6, m7, m10 + paddw m7, m11 + paddw m0, m3, m8 + paddw m1, m3, m9 + paddw m2, m3, m10 + paddw m3, m11 +%else + paddw m4, m7, [rsp+0*16] + paddw m5, m7, [rsp+1*16] + paddw m6, m7, [rsp+2*16] + paddw m7, [rsp+3*16] + paddw m0, m3, [rsp+0*16] + paddw m1, m3, [rsp+1*16] + paddw m2, m3, [rsp+2*16] + paddw m3, [rsp+3*16] +%endif + packuswb m4, m5 + packuswb m6, m7 + packuswb m0, m1 + packuswb m2, m3 + mova [dstq+strideq*0+ 0], m4 + mova [dstq+strideq*0+16], m6 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m2 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET +%undef pw_m256_reg +%undef pw_m255_reg +%undef mem +%endmacro + +INIT_XMM sse2 +TM_XMM_FUNCS +INIT_XMM ssse3 +TM_XMM_FUNCS +INIT_XMM avx +TM_XMM_FUNCS + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a + pxor m3, m3 + pinsrw xm2, [aq-1], 0 + vinserti128 m2, m2, xm2, 1 + mova m0, [aq] + DEFINE_ARGS dst, stride, l, cnt + mova m4, [pw_m256] + mova m5, [pw_m255] + pshufb m2, m4 + punpckhbw m1, m0, m3 + punpcklbw m0, m3 + psubw m1, m2 + psubw m0, m2 + mov cntq, 15 +.loop: + pinsrw xm7, [lq+cntq*2], 0 + vinserti128 m7, m7, xm7, 1 + pshufb m3, m7, m5 + pshufb m7, m4 + paddw m2, m3, m0 + paddw m3, m1 + paddw m6, m7, m0 + paddw m7, m1 + packuswb m2, m3 + packuswb m6, m7 + mova [dstq+strideq*0], m2 + mova [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET +%endif + +; dl + +%macro LOWPASS 4 ; left [dst], center, right, tmp + pxor m%4, m%1, m%3 + pand m%4, [pb_1] + pavgb m%1, m%3 + psubusb m%1, m%4 + pavgb m%1, m%2 +%endmacro + +%macro DL_MMX_FUNCS 0 +cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a + movq m1, [aq] +%if cpuflag(ssse3) + pshufb m0, m1, [pb_0to5_2x7] + pshufb m2, m1, [pb_2to6_3x7] +%else + punpckhbw m3, m1, m1 ; 44556677 + pand m0, m1, [pb_6xm1_2x0] ; 012345__ + pand m3, [pb_6x0_2xm1] ; ______77 + psrlq m2, m1, 16 ; 234567__ + por m0, m3 ; 01234577 + por m2, m3 ; 23456777 +%endif + psrlq m1, 8 + LOWPASS 0, 1, 2, 3 + + pshufw m1, m0, q3321 + movd [dstq+strideq*0], m0 + movd [dstq+strideq*2], m1 + psrlq m0, 8 + psrlq m1, 8 + add dstq, strideq + movd [dstq+strideq*0], m0 + movd [dstq+strideq*2], m1 + RET +%endmacro + +INIT_MMX mmxext +DL_MMX_FUNCS +INIT_MMX ssse3 +DL_MMX_FUNCS + +%macro DL_XMM_FUNCS 0 +cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a + movq m0, [aq] + lea stride5q, [strideq*5] +%if cpuflag(ssse3) + pshufb m1, m0, [pb_1to6_10x7] +%else + punpcklbw m1, m0, m0 ; 0011223344556677 + punpckhwd m1, m1 ; 4x4,4x5,4x6,4x7 +%endif + shufps m0, m1, q3310 +%if notcpuflag(ssse3) + psrldq m1, m0, 1 + shufps m1, m0, q3210 +%endif + psrldq m2, m1, 1 + LOWPASS 0, 1, 2, 3 + + pshufd m1, m0, q3321 + movq [dstq+strideq*0], m0 + movq [dstq+strideq*4], m1 + psrldq m0, 1 + psrldq m1, 1 + movq [dstq+strideq*1], m0 + movq [dstq+stride5q ], m1 + lea dstq, [dstq+strideq*2] + psrldq m0, 1 + psrldq m1, 1 + movq [dstq+strideq*0], m0 + movq [dstq+strideq*4], m1 + psrldq m0, 1 + psrldq m1, 1 + movq [dstq+strideq*1], m0 + movq [dstq+stride5q ], m1 + RET + +cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a + mova m0, [aq] +%if cpuflag(ssse3) + mova m5, [pb_1toE_2xF] + pshufb m1, m0, m5 + pshufb m2, m1, m5 + pshufb m4, m0, [pb_15] +%else + pand m5, m0, [pb_15x0_1xm1] ; _______________F + psrldq m1, m0, 1 ; 123456789ABCDEF_ + por m1, m5 ; 123456789ABCDEFF + psrldq m2, m1, 1 ; 23456789ABCDEFF_ + por m2, m5 ; 23456789ABCDEFFF + pshufhw m4, m1, q3333 ; xxxxxxxxFFFFFFFF +%endif + LOWPASS 0, 1, 2, 3 + DEFINE_ARGS dst, stride, cnt, stride9 + lea stride9q, [strideq+strideq*8] + mov cntd, 4 + +.loop: + movhlps m4, m0 + mova [dstq+strideq*0], m0 +%if cpuflag(ssse3) + pshufb m0, m5 +%else + psrldq m0, 1 + por m0, m5 +%endif + mova [dstq+strideq*8], m4 + movhlps m4, m0 + mova [dstq+strideq*1], m0 +%if cpuflag(ssse3) + pshufb m0, m5 +%else + psrldq m0, 1 + por m0, m5 +%endif + mova [dstq+stride9q ], m4 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16 + mova m0, [aq] + mova m1, [aq+16] + PALIGNR m2, m1, m0, 1, m4 + PALIGNR m3, m1, m0, 2, m4 + LOWPASS 0, 2, 3, 4 +%if cpuflag(ssse3) + mova m5, [pb_1toE_2xF] + pshufb m2, m1, m5 + pshufb m3, m2, m5 + pshufb m6, m1, [pb_15] + mova m7, m6 +%else + pand m5, m1, [pb_15x0_1xm1] ; _______________F + psrldq m2, m1, 1 ; 123456789ABCDEF_ + por m2, m5 ; 123456789ABCDEFF + psrldq m3, m2, 1 ; 23456789ABCDEFF_ + por m3, m5 ; 23456789ABCDEFFF + pshufhw m7, m2, q3333 ; xxxxxxxxFFFFFFFF + pshufd m6, m7, q3333 +%endif + LOWPASS 1, 2, 3, 4 + lea dst16q, [dstq +strideq*8] + mov cntd, 8 + lea dst16q, [dst16q+strideq*8] +.loop: + movhlps m7, m1 + mova [dstq +strideq*0+ 0], m0 + mova [dstq +strideq*0+16], m1 + movhps [dstq+strideq*8+ 0], m0 + movq [dstq +strideq*8+ 8], m1 + mova [dstq +strideq*8+16], m7 + mova [dst16q+strideq*0+ 0], m1 + mova [dst16q+strideq*0+16], m6 + mova [dst16q+strideq*8+ 0], m7 + mova [dst16q+strideq*8+16], m6 +%if cpuflag(avx) + vpalignr m0, m1, m0, 1 + pshufb m1, m5 +%elif cpuflag(ssse3) + palignr m2, m1, m0, 1 + pshufb m1, m5 + mova m0, m2 +%else + mova m4, m1 + psrldq m0, 1 + pslldq m4, 15 + psrldq m1, 1 + por m0, m4 + por m1, m5 +%endif + add dstq, strideq + add dst16q, strideq + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DL_XMM_FUNCS +INIT_XMM ssse3 +DL_XMM_FUNCS +INIT_XMM avx +DL_XMM_FUNCS + +; dr + +%macro DR_MMX_FUNCS 0 +cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a + movd m0, [lq] + punpckldq m0, [aq-1] + movd m1, [aq+3] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + PALIGNR m1, m0, 1, m3 + psrlq m2, m1, 8 + LOWPASS 0, 1, 2, 3 + + movd [dstq+stride3q ], m0 + psrlq m0, 8 + movd [dstq+strideq*2], m0 + psrlq m0, 8 + movd [dstq+strideq*1], m0 + psrlq m0, 8 + movd [dstq+strideq*0], m0 + RET +%endmacro + +INIT_MMX mmxext +DR_MMX_FUNCS +INIT_MMX ssse3 +DR_MMX_FUNCS + +%macro DR_XMM_FUNCS 0 +cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a + movq m1, [lq] + movhps m1, [aq-1] + movd m2, [aq+7] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pslldq m0, m1, 1 + PALIGNR m2, m1, 1, m3 + LOWPASS 0, 1, 2, 3 + + movhps [dstq+strideq*0], m0 + pslldq m0, 1 + movhps [dstq+strideq*1], m0 + pslldq m0, 1 + movhps [dstq+strideq*2], m0 + pslldq m0, 1 + movhps [dstq+stride3q ], m0 + pslldq m0, 1 + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], m0 + pslldq m0, 1 + movhps [dstq+strideq*1], m0 + pslldq m0, 1 + movhps [dstq+strideq*2], m0 + pslldq m0, 1 + movhps [dstq+stride3q ], m0 + RET + +cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a + mova m1, [lq] + movu m2, [aq-1] + movd m4, [aq+15] + DEFINE_ARGS dst, stride, stride9, cnt + lea stride9q, [strideq *3] + mov cntd, 4 + lea stride9q, [stride9q*3] + PALIGNR m4, m2, 1, m5 + PALIGNR m3, m2, m1, 15, m5 + LOWPASS 3, 2, 4, 5 + pslldq m0, m1, 1 + PALIGNR m2, m1, 1, m4 + LOWPASS 0, 1, 2, 4 + +.loop: + mova [dstq+strideq*0 ], m3 + movhps [dstq+strideq*8+0], m0 + movq [dstq+strideq*8+8], m3 + PALIGNR m3, m0, 15, m1 + pslldq m0, 1 + mova [dstq+strideq*1 ], m3 + movhps [dstq+stride9q +0], m0 + movq [dstq+stride9q +8], m3 + PALIGNR m3, m0, 15, m1 + pslldq m0, 1 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a + mova m1, [lq] + mova m2, [lq+16] + movu m3, [aq-1] + movu m4, [aq+15] + movd m5, [aq+31] + DEFINE_ARGS dst, stride, stride8, cnt + lea stride8q, [strideq*8] + PALIGNR m5, m4, 1, m7 + PALIGNR m6, m4, m3, 15, m7 + LOWPASS 5, 4, 6, 7 + PALIGNR m4, m3, 1, m7 + PALIGNR m6, m3, m2, 15, m7 + LOWPASS 4, 3, 6, 7 + PALIGNR m3, m2, 1, m7 + PALIGNR m6, m2, m1, 15, m7 + LOWPASS 3, 2, 6, 7 + PALIGNR m2, m1, 1, m6 + pslldq m0, m1, 1 + LOWPASS 2, 1, 0, 6 + mov cntd, 16 + + ; out=m2/m3/m4/m5 +.loop: + mova [dstq+stride8q*0+ 0], m4 + mova [dstq+stride8q*0+16], m5 + mova [dstq+stride8q*2+ 0], m3 + mova [dstq+stride8q*2+16], m4 + PALIGNR m5, m4, 15, m6 + PALIGNR m4, m3, 15, m6 + PALIGNR m3, m2, 15, m6 + pslldq m2, 1 + add dstq, strideq + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DR_XMM_FUNCS +INIT_XMM ssse3 +DR_XMM_FUNCS +INIT_XMM avx +DR_XMM_FUNCS + +; vl + +INIT_MMX mmxext +cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a + movq m0, [aq] + psrlq m1, m0, 8 + psrlq m2, m1, 8 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + movd [dstq+strideq*0], m1 + movd [dstq+strideq*1], m2 + lea dstq, [dstq+strideq*2] + psrlq m1, 8 + psrlq m2, 8 + movd [dstq+strideq*0], m1 + movd [dstq+strideq*1], m2 + RET + +%macro VL_XMM_FUNCS 0 +cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a + movq m0, [aq] +%if cpuflag(ssse3) + pshufb m0, [pb_0to6_9x7] +%else + punpcklbw m1, m0, m0 + punpckhwd m1, m1 + shufps m0, m1, q3310 +%endif + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psrldq m1, m0, 1 + psrldq m2, m0, 2 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + + movq [dstq+strideq*0], m1 + movq [dstq+strideq*1], m2 + psrldq m1, 1 + psrldq m2, 1 + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + psrldq m1, 1 + psrldq m2, 1 + movq [dstq+strideq*0], m1 + movq [dstq+strideq*1], m2 + psrldq m1, 1 + psrldq m2, 1 + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + RET + +cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] +%if cpuflag(ssse3) + mova m4, [pb_1toE_2xF] + pshufb m1, m0, m4 + pshufb m2, m1, m4 +%else + pand m4, m0, [pb_15x0_1xm1] ; _______________F + psrldq m1, m0, 1 ; 123456789ABCDEF_ + por m1, m4 ; 123456789ABCDEFF + psrldq m2, m1, 1 ; 23456789ABCDEFF_ + por m2, m4 ; 23456789ABCDEFFF +%endif + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 +%if cpuflag(ssse3) + pshufb m1, m4 + pshufb m2, m4 +%else + psrldq m1, 1 + psrldq m2, 1 + por m1, m4 + por m2, m4 +%endif + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 +%if cpuflag(ssse3) + pshufb m1, m4 + pshufb m2, m4 +%else + psrldq m1, 1 + psrldq m2, 1 + por m1, m4 + por m2, m4 +%endif + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a + mova m0, [aq] + mova m5, [aq+16] + DEFINE_ARGS dst, stride, dst16, cnt + PALIGNR m2, m5, m0, 1, m4 + PALIGNR m3, m5, m0, 2, m4 + lea dst16q, [dstq +strideq*8] + LOWPASS 3, 2, 0, 6 + pavgb m2, m0 +%if cpuflag(ssse3) + mova m4, [pb_1toE_2xF] + pshufb m0, m5, m4 + pshufb m1, m0, m4 +%else + pand m4, m5, [pb_15x0_1xm1] ; _______________F + psrldq m0, m5, 1 ; 123456789ABCDEF_ + por m0, m4 ; 123456789ABCDEFF + psrldq m1, m0, 1 ; 23456789ABCDEFF_ + por m1, m4 ; 23456789ABCDEFFF +%endif + lea dst16q, [dst16q+strideq*8] + LOWPASS 1, 0, 5, 6 + pavgb m0, m5 +%if cpuflag(ssse3) + pshufb m5, [pb_15] +%else + punpckhbw m5, m4, m4 + pshufhw m5, m5, q3333 + punpckhqdq m5, m5 +%endif + mov cntd, 8 + +.loop: +%macro %%write 3 + mova [dstq+stride%1+ 0], %2 + mova [dstq+stride%1+16], %3 + movhps [dst16q+stride%1 ], %2 + movu [dst16q+stride%1+ 8], %3 + movq [dst16q+stride%1+24], m5 +%if cpuflag(avx) + palignr %2, %3, %2, 1 + pshufb %3, m4 +%elif cpuflag(ssse3) + palignr m6, %3, %2, 1 + pshufb %3, m4 + mova %2, m6 +%else + pslldq m6, %3, 15 + psrldq %3, 1 + psrldq %2, 1 + por %3, m4 + por %2, m6 +%endif +%endmacro + + %%write q*0, m2, m0 + %%write q*1, m3, m1 + lea dstq, [dstq +strideq*2] + lea dst16q, [dst16q+strideq*2] + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +VL_XMM_FUNCS +INIT_XMM ssse3 +VL_XMM_FUNCS +INIT_XMM avx +VL_XMM_FUNCS + +; vr + +%macro VR_MMX_FUNCS 0 +cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a + movq m1, [aq-1] + punpckldq m2, [lq] + movd m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pavgb m0, m1 + PALIGNR m1, m2, 5, m3 + psrlq m2, m1, 8 + psllq m3, m1, 8 + LOWPASS 2, 1, 3, 4 + + ; ABCD <- for the following predictor: + ; EFGH + ; IABC | m0 contains ABCDxxxx + ; JEFG | m2 contains xJIEFGHx + +%if cpuflag(ssse3) + punpckldq m0, m2 + pshufb m2, [pb_13456_3xm1] + movd [dstq+strideq*0], m0 + pshufb m0, [pb_6012_4xm1] + movd [dstq+stride3q ], m2 + psrlq m2, 8 + movd [dstq+strideq*2], m0 + movd [dstq+strideq*1], m2 +%else + psllq m1, m2, 40 + psrlq m2, 24 + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m2 + PALIGNR m0, m1, 7, m3 + psllq m1, 8 + PALIGNR m2, m1, 7, m3 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m2 +%endif + RET +%endmacro + +INIT_MMX mmxext +VR_MMX_FUNCS +INIT_MMX ssse3 +VR_MMX_FUNCS + +%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16 +cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a + movu m1, [aq-1] + movhps m2, [lq] + movq m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pavgb m0, m1 + PALIGNR m1, m2, 9, m3 + pslldq m2, m1, 1 + pslldq m3, m1, 2 + LOWPASS 1, 2, 3, 4 + + ; ABCDEFGH <- for the following predictor: + ; IJKLMNOP + ; QABCDEFG | m0 contains ABCDEFGHxxxxxxxx + ; RIJKLMNO | m1 contains xxVUTSRQIJKLMNOP + ; SQABCDEF + ; TRIJKLMN + ; USQABCDE + ; VTRIJKLM + +%if cpuflag(ssse3) + punpcklqdq m0, m1 ; ABCDEFGHxxVUTSRQ +%endif + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m1 +%if cpuflag(ssse3) + pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG + pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO +%else + psrlw m2, m1, 8 ; x_U_S_Q_xxxxxxxx + pand m3, m1, [pw_255] ; x_V_T_R_xxxxxxxx + packuswb m3, m2 ; xVTRxxxxxUSQxxxx + pslldq m3, 4 ; xxxxxVTRxxxxxUSQ + PALIGNR m0, m3, 7, m4 ; xxxxxxUSQABCDEFG + psrldq m1, 8 + pslldq m3, 8 + PALIGNR m1, m3, 7, m4 ; xxxxxxVTRIJKLMNO +%endif + movhps [dstq+strideq*2], m0 + movhps [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + pslldq m0, 1 + pslldq m1, 1 + movhps [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m1 + pslldq m0, 1 + pslldq m1, 1 + movhps [dstq+strideq*2], m0 + movhps [dstq+stride3q ], m1 + RET + +cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a + mova m0, [aq] + movu m1, [aq-1] + mova m2, [lq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + PALIGNR m3, m1, m2, 15, m6 + LOWPASS 3, 1, 0, 4 + pavgb m0, m1 + PALIGNR m1, m2, 1, m6 + pslldq m4, m2, 1 + LOWPASS 1, 2, 4, 5 +%if cpuflag(ssse3) + pshufb m1, [pb_02468ACE_13579BDF] +%else + psrlw m5, m1, 8 + pand m1, [pw_255] + packuswb m1, m5 +%endif + mov cntd, 4 + +.loop: + movlhps m2, m1 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m3 + PALIGNR m4, m0, m1, 15, m6 + PALIGNR m5, m3, m2, 15, m6 + mova [dstq+strideq*2], m4 + mova [dstq+stride3q ], m5 + lea dstq, [dstq+strideq*4] + PALIGNR m0, m1, 14, m6 + PALIGNR m3, m2, 14, m6 + pslldq m1, 2 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a + mova m0, [aq] + mova m2, [aq+16] + movu m1, [aq-1] + PALIGNR m3, m2, m0, 15, m6 + PALIGNR m4, m2, m0, 14, m6 + LOWPASS 4, 3, 2, 5 + pavgb m3, m2 + mova m2, [lq+16] + PALIGNR m5, m1, m2, 15, m6 + LOWPASS 5, 1, 0, 6 + pavgb m0, m1 + mova m6, [lq] +%if ARCH_X86_64 + SWAP 0, 8 +%else + mova [dstq], m0 +%endif + PALIGNR m1, m2, 1, m0 + PALIGNR m7, m2, m6, 15, m0 + LOWPASS 1, 2, 7, 0 + PALIGNR m2, m6, 1, m0 + pslldq m7, m6, 1 + LOWPASS 2, 6, 7, 0 +%if cpuflag(ssse3) + pshufb m1, [pb_02468ACE_13579BDF] + pshufb m2, [pb_02468ACE_13579BDF] +%else + psrlw m0, m1, 8 + psrlw m6, m2, 8 + pand m1, [pw_255] + pand m2, [pw_255] + packuswb m1, m0 + packuswb m2, m6 +%endif + DEFINE_ARGS dst, stride, dst16, cnt + lea dst16q, [dstq +strideq*8] + lea dst16q, [dst16q+strideq*8] + SBUTTERFLY qdq, 2, 1, 6 +%if ARCH_X86_64 + SWAP 0, 8 +%else + mova m0, [dstq] +%endif + mov cntd, 8 + +.loop: + ; even lines (0, 2, 4, ...): m1 | m0, m3 + ; odd lines (1, 3, 5, ...): m2 | m5, m4 +%macro %%write 4 + mova [dstq+stride%1+ 0], %3 + mova [dstq+stride%1+16], %4 + movhps [dst16q+stride%1 ], %2 + movu [dst16q+stride%1+ 8], %3 + movq [dst16q+stride%1+24], %4 + PALIGNR %4, %3, 15, m6 + PALIGNR %3, %2, 15, m6 + pslldq %2, 1 +%endmacro + + %%write q*0, m1, m0, m3 + %%write q*1, m2, m5, m4 + lea dstq, [dstq +strideq*2] + lea dst16q, [dst16q+strideq*2] + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +VR_XMM_FUNCS 7 +INIT_XMM ssse3 +VR_XMM_FUNCS 6 +INIT_XMM avx +VR_XMM_FUNCS 6 + +; hd + +INIT_MMX mmxext +cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a + movd m0, [lq] + punpckldq m0, [aq-1] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psrlq m1, m0, 8 + psrlq m2, m1, 8 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + + ; DHIJ <- for the following predictor: + ; CGDH + ; BFCG | m1 contains ABCDxxxx + ; AEBF | m2 contains EFGHIJxx + + punpcklbw m1, m2 + punpckhdq m0, m1, m2 + + ; m1 contains AEBFCGDH + ; m0 contains CGDHIJxx + + movd [dstq+stride3q ], m1 + movd [dstq+strideq*1], m0 + psrlq m1, 16 + psrlq m0, 16 + movd [dstq+strideq*2], m1 + movd [dstq+strideq*0], m0 + RET + +%macro HD_XMM_FUNCS 0 +cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a + movq m0, [lq] + movhps m0, [aq-1] + DEFINE_ARGS dst, stride, stride3, dst4 + lea stride3q, [strideq*3] + lea dst4q, [dstq+strideq*4] + psrldq m1, m0, 1 + psrldq m2, m1, 1 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + + ; HPQRSTUV <- for the following predictor + ; GOHPQRST + ; FNGOHPQR | m1 contains ABCDEFGHxxxxxxxx + ; EMFNGOHP | m2 contains IJKLMNOPQRSTUVxx + ; DLEMFNGO + ; CKDLEMFN + ; BJCKDLEM + ; AIBJCKDL + + punpcklbw m1, m2 + movhlps m2, m2 + + ; m1 contains AIBJCKDLEMFNGOHP + ; m2 contains QRSTUVxxxxxxxxxx + + movhps [dstq +stride3q ], m1 + movq [dst4q+stride3q ], m1 + PALIGNR m3, m2, m1, 2, m4 + movhps [dstq +strideq*2], m3 + movq [dst4q+strideq*2], m3 + PALIGNR m3, m2, m1, 4, m4 + movhps [dstq +strideq*1], m3 + movq [dst4q+strideq*1], m3 + PALIGNR m2, m1, 6, m4 + movhps [dstq +strideq*0], m2 + movq [dst4q+strideq*0], m2 + RET + +cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a + mova m0, [lq] + movu m3, [aq-1] + DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12 + lea stride4q, [strideq*4] + lea dst4q, [dstq +stride4q] + lea dst8q, [dst4q+stride4q] + lea dst12q, [dst8q+stride4q] + psrldq m4, m3, 1 + psrldq m5, m3, 2 + LOWPASS 5, 4, 3, 6 + PALIGNR m1, m3, m0, 1, m6 + PALIGNR m2, m3, m0, 2, m6 + LOWPASS 2, 1, 0, 6 + pavgb m1, m0 + SBUTTERFLY bw, 1, 2, 6 + + ; I PROBABLY INVERTED L0 ad L16 here + ; m1, m2, m5 +.loop: + sub stride4q, strideq + movhps [dstq +stride4q +0], m2 + movq [dstq +stride4q +8], m5 + mova [dst4q+stride4q ], m2 + movhps [dst8q+stride4q +0], m1 + movq [dst8q+stride4q +8], m2 + mova [dst12q+stride4q ], m1 +%if cpuflag(avx) + palignr m1, m2, m1, 2 + palignr m2, m5, m2, 2 +%elif cpuflag(ssse3) + palignr m3, m2, m1, 2 + palignr m0, m5, m2, 2 + mova m1, m3 + mova m2, m0 +%else + ; slightly modified version of PALIGNR + mova m6, m2 + mova m4, m5 + pslldq m6, 14 + pslldq m4, 14 + psrldq m1, 2 + psrldq m2, 2 + por m1, m6 + por m2, m4 +%endif + psrldq m5, 2 + jg .loop + RET + +cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a + mova m0, [lq] + mova m1, [lq+16] + movu m2, [aq-1] + movu m3, [aq+15] + DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24 + lea stride8q, [strideq*8] + lea dst8q, [dstq +stride8q] + lea dst16q, [dst8q +stride8q] + lea dst24q, [dst16q+stride8q] + psrldq m4, m3, 1 + psrldq m5, m3, 2 + LOWPASS 5, 4, 3, 6 + PALIGNR m4, m3, m2, 2, m6 + PALIGNR m3, m2, 1, m6 + LOWPASS 4, 3, 2, 6 + PALIGNR m3, m2, m1, 2, m6 + PALIGNR m2, m1, 1, m6 + LOWPASS 3, 2, 1, 6 + pavgb m2, m1 + PALIGNR m6, m1, m0, 1, m7 + PALIGNR m1, m0, 2, m7 + LOWPASS 1, 6, 0, 7 + pavgb m0, m6 + SBUTTERFLY bw, 2, 3, 6 + SBUTTERFLY bw, 0, 1, 6 + + ; m0, m1, m2, m3, m4, m5 +.loop: + sub stride8q, strideq + mova [dstq +stride8q+ 0], m3 + mova [dstq +stride8q+16], m4 + mova [dst8q +stride8q+ 0], m2 + mova [dst8q +stride8q+16], m3 + mova [dst16q+stride8q+ 0], m1 + mova [dst16q+stride8q+16], m2 + mova [dst24q+stride8q+ 0], m0 + mova [dst24q+stride8q+16], m1 +%if cpuflag(avx) + palignr m0, m1, m0, 2 + palignr m1, m2, m1, 2 + palignr m2, m3, m2, 2 + palignr m3, m4, m3, 2 + palignr m4, m5, m4, 2 + psrldq m5, 2 +%elif cpuflag(ssse3) + psrldq m6, m5, 2 + palignr m5, m4, 2 + palignr m4, m3, 2 + palignr m3, m2, 2 + palignr m2, m1, 2 + palignr m1, m0, 2 + mova m0, m1 + mova m1, m2 + mova m2, m3 + mova m3, m4 + mova m4, m5 + mova m5, m6 +%else + ; sort of a half-integrated version of PALIGNR + pslldq m7, m4, 14 + pslldq m6, m5, 14 + psrldq m4, 2 + psrldq m5, 2 + por m4, m6 + pslldq m6, m3, 14 + psrldq m3, 2 + por m3, m7 + pslldq m7, m2, 14 + psrldq m2, 2 + por m2, m6 + pslldq m6, m1, 14 + psrldq m1, 2 + por m1, m7 + psrldq m0, 2 + por m0, m6 +%endif + jg .loop + RET +%endmacro + +INIT_XMM sse2 +HD_XMM_FUNCS +INIT_XMM ssse3 +HD_XMM_FUNCS +INIT_XMM avx +HD_XMM_FUNCS + +%macro HU_MMX_FUNCS 0 +cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l + movd m0, [lq] +%if cpuflag(ssse3) + pshufb m0, [pb_0to2_5x3] +%else + punpcklbw m1, m0, m0 ; 00112233 + pshufw m1, m1, q3333 ; 33333333 + punpckldq m0, m1 ; 01233333 +%endif + psrlq m1, m0, 8 + psrlq m2, m1, 8 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + SBUTTERFLY bw, 1, 2, 0 + PALIGNR m2, m1, 2, m0 + movd [dstq+strideq*0], m1 + movd [dstq+strideq*1], m2 + punpckhdq m1, m1 + punpckhdq m2, m2 + movd [dstq+strideq*2], m1 + movd [dstq+stride3q ], m2 + RET +%endmacro + +INIT_MMX mmxext +HU_MMX_FUNCS +INIT_MMX ssse3 +HU_MMX_FUNCS + +%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32 +cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l + movq m0, [lq] +%if cpuflag(ssse3) + pshufb m0, [pb_0to6_9x7] +%else + punpcklbw m1, m0, m0 ; 0011223344556677 + punpckhwd m1, m1 ; 4444555566667777 + shufps m0, m1, q3310 ; 0123456777777777 +%endif + psrldq m1, m0, 1 + psrldq m2, m1, 1 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + DEFINE_ARGS dst, stride, stride3, dst4 + lea stride3q, [strideq*3] + lea dst4q, [dstq+strideq*4] + SBUTTERFLY bw, 1, 2, 0 + movq [dstq +strideq*0], m1 + movhps [dst4q+strideq*0], m1 + PALIGNR m0, m2, m1, 2, m3 + movq [dstq +strideq*1], m0 + movhps [dst4q+strideq*1], m0 + PALIGNR m0, m2, m1, 4, m3 + movq [dstq +strideq*2], m0 + movhps [dst4q+strideq*2], m0 + PALIGNR m2, m1, 6, m3 + movq [dstq +stride3q ], m2 + movhps [dst4q+stride3q ], m2 + RET + +cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l + mova m0, [lq] +%if cpuflag(ssse3) + mova m3, [pb_2toE_3xF] + pshufb m1, m0, [pb_1toE_2xF] + pshufb m2, m0, m3 +%else + pand m3, m0, [pb_15x0_1xm1] + psrldq m1, m0, 1 + por m1, m3 + punpckhbw m3, m3 + psrldq m2, m0, 2 + por m2, m3 +%endif + LOWPASS 2, 1, 0, 4 + pavgb m1, m0 + DEFINE_ARGS dst, stride, stride9, cnt + lea stride9q, [strideq*8+strideq] + mov cntd, 4 + SBUTTERFLY bw, 1, 2, 0 + +.loop: + mova [dstq+strideq*0], m1 + mova [dstq+strideq*8], m2 + PALIGNR m0, m2, m1, 2, m4 +%if cpuflag(ssse3) + pshufb m2, m3 +%else + psrldq m2, 2 + por m2, m3 +%endif + mova [dstq+strideq*1], m0 + mova [dstq+stride9q ], m2 + PALIGNR m1, m2, m0, 2, m4 +%if cpuflag(ssse3) + pshufb m2, m3 +%else + psrldq m2, 2 + por m2, m3 +%endif + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l + mova m1, [lq] + mova m0, [lq+16] + PALIGNR m2, m0, m1, 1, m5 + PALIGNR m3, m0, m1, 2, m5 + LOWPASS 3, 2, 1, 5 + pavgb m2, m1 +%if cpuflag(ssse3) + mova m4, [pb_2toE_3xF] + pshufb m5, m0, [pb_1toE_2xF] + pshufb m1, m0, m4 +%else + pand m4, m0, [pb_15x0_1xm1] + psrldq m5, m0, 1 + por m5, m4 + punpckhbw m4, m4 + psrldq m1, m0, 2 + por m1, m4 +%endif + LOWPASS 1, 5, 0, 6 + pavgb m0, m5 + DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24 + mov cntd, 8 + xor stride0q, stride0q + lea dst8q, [dstq +strideq*8] + lea dst16q, [dst8q +strideq*8] + lea dst24q, [dst16q+strideq*8] + SBUTTERFLY bw, 0, 1, 5 + SBUTTERFLY bw, 2, 3, 5 +%if cpuflag(ssse3) + pshufb m6, m1, [pb_15] +%else + pshufhw m6, m4, q3333 + punpckhqdq m6, m6 +%endif + +.loop: + mova [dstq +stride0q+ 0], m2 + mova [dstq +stride0q+16], m3 + mova [dst8q +stride0q+ 0], m3 + mova [dst8q +stride0q+16], m0 + mova [dst16q+stride0q+ 0], m0 + mova [dst16q+stride0q+16], m1 + mova [dst24q+stride0q+ 0], m1 + mova [dst24q+stride0q+16], m6 +%if cpuflag(avx) + palignr m2, m3, m2, 2 + palignr m3, m0, m3, 2 + palignr m0, m1, m0, 2 + pshufb m1, m4 +%elif cpuflag(ssse3) + pshufb m5, m1, m4 + palignr m1, m0, 2 + palignr m0, m3, 2 + palignr m3, m2, 2 + mova m2, m3 + mova m3, m0 + mova m0, m1 + mova m1, m5 +%else + ; half-integrated version of PALIGNR + pslldq m5, m1, 14 + pslldq m7, m0, 14 + psrldq m1, 2 + psrldq m0, 2 + por m1, m4 + por m0, m5 + pslldq m5, m3, 14 + psrldq m3, 2 + por m3, m7 + psrldq m2, 2 + por m2, m5 +%endif + add stride0q, strideq + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +HU_XMM_FUNCS 8 +INIT_XMM ssse3 +HU_XMM_FUNCS 7 +INIT_XMM avx +HU_XMM_FUNCS 7 + +; FIXME 127, 128, 129 ? diff --git a/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm b/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm new file mode 100644 index 0000000000..808056a809 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm @@ -0,0 +1,2497 @@ +;****************************************************************************** +;* VP9 Intra prediction SIMD optimizations +;* +;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com> +;* Copyright (c) 2015 Henrik Gramner <henrik gramner com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pd_2: times 8 dd 2 +pd_4: times 8 dd 4 +pd_8: times 8 dd 8 + +pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15 +pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0 +pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7 + +cextern pw_1 +cextern pw_1023 +cextern pw_4095 +cextern pd_16 +cextern pd_32 +cextern pd_65535; + +; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take +; only 3 registers on x86-32, which would make it one cycle faster, but that +; would make the code quite a bit uglier... + +SECTION .text + +%macro SCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%if %0 == 4 +%define reg_%4 m%2 +%endif +%else + mova [%3], m%1 +%if %0 == 4 +%define reg_%4 [%3] +%endif +%endif +%endmacro + +%macro UNSCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%if %0 == 4 +%undef reg_%4 +%endif +%endmacro + +%macro PRELOAD 2-3 +%if ARCH_X86_64 + mova m%1, [%2] +%if %0 == 3 +%define reg_%3 m%1 +%endif +%elif %0 == 3 +%define reg_%3 [%2] +%endif +%endmacro + +INIT_MMX mmx +cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse +cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse +cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + mova m1, [aq+mmsize] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse +cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0] + mova m1, [aq+mmsize*1] + mova m2, [aq+mmsize*2] + mova m3, [aq+mmsize*3] + DEFINE_ARGS dst, stride, cnt + mov cntd, 16 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m3 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*1+32], m2 + mova [dstq+strideq*1+48], m3 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +INIT_MMX mmxext +cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a + mova m3, [lq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufw m0, m3, q3333 + pshufw m1, m3, q2222 + pshufw m2, m3, q1111 + pshufw m3, m3, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a + mova m2, [lq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + punpckhwd m3, m2, m2 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufd m0, m3, q1111 + pshufd m1, m3, q0000 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + punpcklwd m2, m2 + pshufd m0, m2, q3333 + pshufd m1, m2, q2222 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufd m0, m2, q1111 + pshufd m1, m2, q0000 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt + mov cntd, 3 + lea stride3q, [strideq*3] +.loop: + movh m3, [lq+cntq*8] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*2+ 0], m2 + mova [dstq+strideq*2+16], m2 + mova [dstq+stride3q + 0], m3 + mova [dstq+stride3q +16], m3 + lea dstq, [dstq+strideq*4] + dec cntd + jge .loop + RET + +INIT_XMM sse2 +cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt + mov cntd, 7 + lea stride3q, [strideq*3] +.loop: + movh m3, [lq+cntq*8] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*0+32], m0 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*1+32], m1 + mova [dstq+strideq*1+48], m1 + mova [dstq+strideq*2+ 0], m2 + mova [dstq+strideq*2+16], m2 + mova [dstq+strideq*2+32], m2 + mova [dstq+strideq*2+48], m2 + mova [dstq+stride3q + 0], m3 + mova [dstq+stride3q +16], m3 + mova [dstq+stride3q +32], m3 + mova [dstq+stride3q +48], m3 + lea dstq, [dstq+strideq*4] + dec cntd + jge .loop + RET + +INIT_MMX mmxext +cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq] + paddw m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufw m1, m0, q3232 + paddd m0, [pd_4] + paddd m0, m1 + psrad m0, 3 + pshufw m0, m0, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq] + paddw m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_8] + paddd m0, m1 + psrad m0, 4 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq] + paddw m0, [lq+mmsize] + paddw m0, [aq] + paddw m0, [aq+mmsize] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_16] + paddd m0, m1 + psrad m0, 5 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq+mmsize*0] + paddw m0, [lq+mmsize*1] + paddw m0, [lq+mmsize*2] + paddw m0, [lq+mmsize*3] + paddw m0, [aq+mmsize*0] + paddw m0, [aq+mmsize*1] + paddw m0, [aq+mmsize*2] + paddw m0, [aq+mmsize*3] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 16 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_32] + paddd m0, m1 + psrad m0, 6 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*0+32], m0 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*1+32], m0 + mova [dstq+strideq*1+48], m0 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +%macro DC_1D_FNS 2 +INIT_MMX mmxext +cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufw m1, m0, q3232 + paddd m0, [pd_2] + paddd m0, m1 + psrad m0, 2 + pshufw m0, m0, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_4] + paddd m0, m1 + psrad m0, 3 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2] + paddw m0, [%2+mmsize] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_8] + paddd m0, m1 + psrad m0, 4 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2+mmsize*0] + paddw m0, [%2+mmsize*1] + paddw m0, [%2+mmsize*2] + paddw m0, [%2+mmsize*3] + DEFINE_ARGS dst, stride, cnt + mov cntd, 16 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_16] + paddd m0, m1 + psrad m0, 5 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*0+32], m0 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*1+32], m0 + mova [dstq+strideq*1+48], m0 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET +%endmacro + +DC_1D_FNS top, aq +DC_1D_FNS left, lq + +INIT_MMX mmxext +cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a + mova m5, [pw_1023] +.body: + mova m4, [aq] + mova m3, [lq] + movd m0, [aq-4] + pshufw m0, m0, q1111 + psubw m4, m0 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufw m0, m3, q3333 + pshufw m1, m3, q2222 + pshufw m2, m3, q1111 + pshufw m3, m3, q0000 + paddw m0, m4 + paddw m1, m4 + paddw m2, m4 + paddw m3, m4 + pxor m4, m4 + pmaxsw m0, m4 + pmaxsw m1, m4 + pmaxsw m2, m4 + pmaxsw m3, m4 + pminsw m0, m5 + pminsw m1, m5 + pminsw m2, m5 + pminsw m3, m5 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + RET + +cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a + mova m5, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body + +INIT_XMM sse2 +cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a + mova m4, [pw_1023] +.body: + pxor m6, m6 + mova m5, [aq] + movd m0, [aq-4] + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + psubw m5, m0 + DEFINE_ARGS dst, stride, l, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 1 +.loop: + movh m3, [lq+cntq*8] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + paddw m0, m5 + paddw m1, m5 + paddw m2, m5 + paddw m3, m5 + pmaxsw m0, m6 + pmaxsw m1, m6 + pmaxsw m2, m6 + pmaxsw m3, m6 + pminsw m0, m4 + pminsw m1, m4 + pminsw m2, m4 + pminsw m3, m4 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + dec cntd + jge .loop + RET + +cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a + mova m4, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body + +INIT_XMM sse2 +cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a + mova m7, [pw_1023] +.body: + pxor m6, m6 + mova m4, [aq] + mova m5, [aq+mmsize] + movd m0, [aq-4] + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + psubw m4, m0 + psubw m5, m0 + DEFINE_ARGS dst, stride, l, cnt + mov cntd, 7 +.loop: + movd m3, [lq+cntq*4] + punpcklwd m3, m3 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + paddw m0, m2, m4 + paddw m2, m5 + paddw m1, m3, m4 + paddw m3, m5 + pmaxsw m0, m6 + pmaxsw m2, m6 + pmaxsw m1, m6 + pmaxsw m3, m6 + pminsw m0, m7 + pminsw m2, m7 + pminsw m1, m7 + pminsw m3, m7 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m2 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m3 + lea dstq, [dstq+strideq*2] + dec cntd + jge .loop + RET + +cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a + mova m7, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body + +INIT_XMM sse2 +cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a + mova m0, [pw_1023] +.body: + pxor m1, m1 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 +%define reg_min m9 +%define reg_max m8 +%else + mova [rsp+ 0], m0 + mova [rsp+16], m1 +%define reg_min [rsp+16] +%define reg_max [rsp+ 0] +%endif + + mova m4, [aq+mmsize*0] + mova m5, [aq+mmsize*1] + mova m6, [aq+mmsize*2] + mova m7, [aq+mmsize*3] + movd m0, [aq-4] + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + psubw m4, m0 + psubw m5, m0 + psubw m6, m0 + psubw m7, m0 + DEFINE_ARGS dst, stride, l, cnt + mov cntd, 31 +.loop: + pinsrw m3, [lq+cntq*2], 0 + punpcklwd m3, m3 + pshufd m3, m3, q0000 + paddw m0, m3, m4 + paddw m1, m3, m5 + paddw m2, m3, m6 + paddw m3, m7 + pmaxsw m0, reg_min + pmaxsw m1, reg_min + pmaxsw m2, reg_min + pmaxsw m3, reg_min + pminsw m0, reg_max + pminsw m1, reg_max + pminsw m2, reg_max + pminsw m3, reg_max + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m3 + add dstq, strideq + dec cntd + jge .loop + RET + +cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a + mova m0, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body + +; Directional intra predicion functions +; +; in the functions below, 'abcdefgh' refers to above data (sometimes simply +; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply +; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered +; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered +; top-left data. + +; left=(left+2*center+right+2)>>2 +%macro LOWPASS 3 ; left [dst], center, right + paddw m%1, m%3 + psraw m%1, 1 + pavgw m%1, m%2 +%endmacro + +; abcdefgh (src) -> bcdefghh (dst) +; dst/src can be the same register +%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg] +%if cpuflag(ssse3) + pshufb %1, %2, %3 ; abcdefgh -> bcdefghh +%else + psrldq %1, %2, 2 ; abcdefgh -> bcdefgh. + pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh +%endif +%endmacro + +; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2) +%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg] +%if cpuflag(ssse3) + pshufb %1, %3, %4 ; abcdefgh -> bcdefghh + pshufb %2, %1, %4 ; bcdefghh -> cdefghhh +%else + psrldq %1, %3, 2 ; abcdefgh -> bcdefgh. + psrldq %2, %3, 4 ; abcdefgh -> cdefgh.. + pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh + pshufhw %2, %2, q1110 ; cdefgh.. -> cdefghhh +%endif +%endmacro + +%macro DL_FUNCS 0 +cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a + movifnidn aq, amp + movu m1, [aq] ; abcdefgh + pshufhw m0, m1, q3310 ; abcdefhh + SHIFT_RIGHT m1, m1 ; bcdefghh + psrldq m2, m1, 2 ; cdefghh. + LOWPASS 0, 1, 2 ; BCDEFGh. + pshufd m1, m0, q3321 ; DEFGh... + movh [dstq+strideq*0], m0 + movh [dstq+strideq*2], m1 + add dstq, strideq + psrldq m0, 2 ; CDEFGh.. + psrldq m1, 2 ; EFGh.... + movh [dstq+strideq*0], m0 + movh [dstq+strideq*2], m1 + RET + +cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefgh +%if cpuflag(ssse3) + mova m4, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m1, m2, m0, m4 ; bcdefghh/cdefghhh + LOWPASS 0, 1, 2 ; BCDEFGHh + shufps m1, m0, m2, q3332 ; FGHhhhhh + shufps m3, m0, m1, q2121 ; DEFGHhhh + DEFINE_ARGS dst, stride, stride5 + lea stride5q, [strideq*5] + + mova [dstq+strideq*0], m0 + mova [dstq+strideq*4], m1 + SHIFT_RIGHT m0, m0, m4 ; CDEFGHhh + pshuflw m1, m1, q3321 ; GHhhhhhh + pshufd m2, m0, q3321 ; EFGHhhhh + mova [dstq+strideq*1], m0 + mova [dstq+stride5q ], m1 + lea dstq, [dstq+strideq*2] + pshuflw m1, m1, q3321 ; Hhhhhhhh + mova [dstq+strideq*0], m3 + mova [dstq+strideq*4], m1 + pshuflw m1, m1, q3321 ; hhhhhhhh + mova [dstq+strideq*1], m2 + mova [dstq+stride5q ], m1 + RET + +cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefgh + mova m3, [aq+mmsize] ; ijklmnop + PALIGNR m1, m3, m0, 2, m4 ; bcdefghi + PALIGNR m2, m3, m0, 4, m4 ; cdefghij + LOWPASS 0, 1, 2 ; BCDEFGHI +%if cpuflag(ssse3) + mova m4, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m2, m1, m3, m4 ; jklmnopp/klmnoppp + LOWPASS 1, 2, 3 ; JKLMNOPp + pshufd m2, m2, q3333 ; pppppppp + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 + +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*8+ 0], m1 + mova [dstq+strideq*8+16], m2 + add dstq, strideq +%if cpuflag(avx) + vpalignr m0, m1, m0, 2 +%else + PALIGNR m3, m1, m0, 2, m4 + mova m0, m3 +%endif + SHIFT_RIGHT m1, m1, m4 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0] ; abcdefgh + mova m1, [aq+mmsize*1] ; ijklmnop + mova m2, [aq+mmsize*2] ; qrstuvwx + mova m3, [aq+mmsize*3] ; yz012345 + PALIGNR m4, m1, m0, 2, m6 + PALIGNR m5, m1, m0, 4, m6 + LOWPASS 0, 4, 5 ; BCDEFGHI + PALIGNR m4, m2, m1, 2, m6 + PALIGNR m5, m2, m1, 4, m6 + LOWPASS 1, 4, 5 ; JKLMNOPQ + PALIGNR m4, m3, m2, 2, m6 + PALIGNR m5, m3, m2, 4, m6 + LOWPASS 2, 4, 5 ; RSTUVWXY +%if cpuflag(ssse3) + mova m6, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m4, m5, m3, m6 + LOWPASS 3, 4, 5 ; Z0123455 + pshufd m4, m4, q3333 ; 55555555 + DEFINE_ARGS dst, stride, stride8, stride24, cnt + mov cntd, 8 + lea stride8q, [strideq*8] + lea stride24q, [stride8q*3] + +.loop: + mova [dstq+stride8q*0+ 0], m0 + mova [dstq+stride8q*0+16], m1 + mova [dstq+stride8q*0+32], m2 + mova [dstq+stride8q*0+48], m3 + mova [dstq+stride8q*1+ 0], m1 + mova [dstq+stride8q*1+16], m2 + mova [dstq+stride8q*1+32], m3 + mova [dstq+stride8q*1+48], m4 + mova [dstq+stride8q*2+ 0], m2 + mova [dstq+stride8q*2+16], m3 + mova [dstq+stride8q*2+32], m4 + mova [dstq+stride8q*2+48], m4 + mova [dstq+stride24q + 0], m3 + mova [dstq+stride24q +16], m4 + mova [dstq+stride24q +32], m4 + mova [dstq+stride24q +48], m4 + add dstq, strideq +%if cpuflag(avx) + vpalignr m0, m1, m0, 2 + vpalignr m1, m2, m1, 2 + vpalignr m2, m3, m2, 2 +%else + PALIGNR m5, m1, m0, 2, m6 + mova m0, m5 + PALIGNR m5, m2, m1, 2, m6 + mova m1, m5 + PALIGNR m5, m3, m2, 2, m6 + mova m2, m5 +%endif + SHIFT_RIGHT m3, m3, m6 + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DL_FUNCS +INIT_XMM ssse3 +DL_FUNCS +INIT_XMM avx +DL_FUNCS + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefghijklmnop + vpbroadcastw xm1, [aq+30] ; pppppppp + vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp + vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp + vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp + LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp + vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp + DEFINE_ARGS dst, stride, stride3, cnt + mov cntd, 2 + lea stride3q, [strideq*3] + +.loop: + mova [dstq+strideq*0], m0 + vpalignr m3, m2, m0, 2 + vpalignr m4, m2, m0, 4 + mova [dstq+strideq*1], m3 + mova [dstq+strideq*2], m4 + vpalignr m3, m2, m0, 6 + vpalignr m4, m2, m0, 8 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m4 + vpalignr m3, m2, m0, 10 + vpalignr m4, m2, m0, 12 + mova [dstq+strideq*1], m3 + mova [dstq+strideq*2], m4 + vpalignr m3, m2, m0, 14 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + mova m0, m2 + vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop + mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345 + vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555 + vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx + vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq + vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr + LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ + vperm2i128 m5, m1, m4, q0201 ; yz01234555555555 + vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455 + vpalignr m3, m5, m1, 4 ; stuvwxyz01234555 + LOWPASS 1, 2, 3 ; RSTUVWXYZ......5 + vperm2i128 m2, m1, m4, q0201 ; Z......555555555 + vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + +.loop: + mova [dstq+strideq*0 + 0], m0 + mova [dstq+strideq*0 +32], m1 + vpalignr m3, m5, m0, 2 + vpalignr m4, m2, m1, 2 + mova [dstq+strideq*1 + 0], m3 + mova [dstq+strideq*1 +32], m4 + vpalignr m3, m5, m0, 4 + vpalignr m4, m2, m1, 4 + mova [dstq+strideq*2 + 0], m3 + mova [dstq+strideq*2 +32], m4 + vpalignr m3, m5, m0, 6 + vpalignr m4, m2, m1, 6 + mova [dstq+stride3q*1+ 0], m3 + mova [dstq+stride3q*1+32], m4 + lea dstq, [dstq+strideq*4] + vpalignr m3, m5, m0, 8 + vpalignr m4, m2, m1, 8 + mova [dstq+strideq*0 + 0], m3 + mova [dstq+strideq*0 +32], m4 + vpalignr m3, m5, m0, 10 + vpalignr m4, m2, m1, 10 + mova [dstq+strideq*1 + 0], m3 + mova [dstq+strideq*1 +32], m4 + vpalignr m3, m5, m0, 12 + vpalignr m4, m2, m1, 12 + mova [dstq+strideq*2+ 0], m3 + mova [dstq+strideq*2+32], m4 + vpalignr m3, m5, m0, 14 + vpalignr m4, m2, m1, 14 + mova [dstq+stride3q+ 0], m3 + mova [dstq+stride3q+ 32], m4 + vpalignr m3, m5, m0, 16 + vpalignr m4, m2, m1, 16 + vperm2i128 m5, m3, m4, q0201 + vperm2i128 m2, m4, m4, q0101 + mova m0, m3 + mova m1, m4 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif + +%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function +cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a + movh m0, [lq] ; wxyz.... + movhps m0, [aq-2] ; wxyz*abc + movd m1, [aq+6] ; d....... + PALIGNR m1, m0, 2, m2 ; xyz*abcd + psrldq m2, m1, 2 ; yz*abcd. + LOWPASS 0, 1, 2 ; XYZ#ABC. + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movh [dstq+stride3q ], m0 + psrldq m0, 2 ; YZ#ABC.. + movh [dstq+strideq*2], m0 + psrldq m0, 2 ; Z#ABC... + movh [dstq+strideq*1], m0 + psrldq m0, 2 ; #ABC.... + movh [dstq+strideq*0], m0 + RET + +cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a + mova m0, [lq] ; stuvwxyz + movu m1, [aq-2] ; *abcdefg + mova m2, [aq] ; abcdefgh + psrldq m3, m2, 2 ; bcdefgh. + LOWPASS 3, 2, 1 ; ABCDEFG. + PALIGNR m1, m0, 2, m4 ; tuvwxyz* + PALIGNR m2, m1, 2, m4 ; uvwxyz*a + LOWPASS 2, 1, 0 ; TUVWXYZ# + DEFINE_ARGS dst, stride, dst4, stride3 + lea stride3q, [strideq*3] + lea dst4q, [dstq+strideq*4] + + movhps [dstq +stride3q +0], m2 + movh [dstq+ stride3q +8], m3 + mova [dst4q+stride3q +0], m2 + PALIGNR m1, m3, m2, 2, m0 + psrldq m3, 2 + movhps [dstq +strideq*2+0], m1 + movh [dstq+ strideq*2+8], m3 + mova [dst4q+strideq*2+0], m1 + PALIGNR m2, m3, m1, 2, m0 + psrldq m3, 2 + movhps [dstq +strideq*1+0], m2 + movh [dstq+ strideq*1+8], m3 + mova [dst4q+strideq*1+0], m2 + PALIGNR m1, m3, m2, 2, m0 + psrldq m3, 2 + movhps [dstq +strideq*0+0], m1 + movh [dstq+ strideq*0+8], m3 + mova [dst4q+strideq*0+0], m1 + RET + +cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a + mova m0, [lq] ; klmnopqr + mova m1, [lq+mmsize] ; stuvwxyz + movu m2, [aq-2] ; *abcdefg + movu m3, [aq+mmsize-2] ; hijklmno + mova m4, [aq] ; abcdefgh + mova m5, [aq+mmsize] ; ijklmnop + psrldq m6, m5, 2 ; jklmnop. + LOWPASS 6, 5, 3 ; IJKLMNO. + PALIGNR m5, m4, 2, m3 ; bcdefghi + LOWPASS 5, 4, 2 ; ABCDEFGH + PALIGNR m2, m1, 2, m3 ; tuvwxyz* + PALIGNR m4, m2, 2, m3 ; uvwxyz*a + LOWPASS 4, 2, 1 ; TUVWXYZ# + PALIGNR m1, m0, 2, m3 ; lmnopqrs + PALIGNR m2, m1, 2, m3 ; mnopqrst + LOWPASS 2, 1, 0 ; LMNOPQRS + DEFINE_ARGS dst, stride, dst8, cnt + lea dst8q, [dstq+strideq*8] + mov cntd, 8 + +.loop: + sub dst8q, strideq + mova [dst8q+strideq*0+ 0], m4 + mova [dst8q+strideq*0+16], m5 + mova [dst8q+strideq*8+ 0], m2 + mova [dst8q+strideq*8+16], m4 +%if cpuflag(avx) + vpalignr m2, m4, m2, 2 + vpalignr m4, m5, m4, 2 + vpalignr m5, m6, m5, 2 +%else + PALIGNR m0, m4, m2, 2, m1 + mova m2, m0 + PALIGNR m0, m5, m4, 2, m1 + mova m4, m0 + PALIGNR m0, m6, m5, 2, m1 + mova m5, m0 +%endif + psrldq m6, 2 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \ + %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a + mova m0, [aq+mmsize*3] ; a[24-31] + movu m1, [aq+mmsize*3-2] ; a[23-30] + psrldq m2, m0, 2 ; a[25-31]. + LOWPASS 2, 0, 1 ; A[24-30]. + mova m1, [aq+mmsize*2] ; a[16-23] + movu m3, [aq+mmsize*2-2] ; a[15-22] + PALIGNR m0, m1, 2, m4 ; a[17-24] + LOWPASS 0, 1, 3 ; A[16-23] + mova m3, [aq+mmsize*1] ; a[8-15] + movu m4, [aq+mmsize*1-2] ; a[7-14] + PALIGNR m1, m3, 2, m5 ; a[9-16] + LOWPASS 1, 3, 4 ; A[8-15] + mova m4, [aq+mmsize*0] ; a[0-7] + movu m5, [aq+mmsize*0-2] ; *a[0-6] + PALIGNR m3, m4, 2, m6 ; a[1-8] + LOWPASS 3, 4, 5 ; A[0-7] + SCRATCH 1, 8, rsp+0*mmsize + SCRATCH 3, 9, rsp+1*mmsize +%if notcpuflag(ssse3) + SCRATCH 0, 10, rsp+2*mmsize +%endif + mova m6, [lq+mmsize*3] ; l[24-31] + PALIGNR m5, m6, 2, m0 ; l[25-31]* + PALIGNR m4, m5, 2, m0 ; l[26-31]*a + LOWPASS 4, 5, 6 ; L[25-31]# + mova m7, [lq+mmsize*2] ; l[16-23] + PALIGNR m6, m7, 2, m0 ; l[17-24] + PALIGNR m5, m6, 2, m0 ; l[18-25] + LOWPASS 5, 6, 7 ; L[17-24] + mova m1, [lq+mmsize*1] ; l[8-15] + PALIGNR m7, m1, 2, m0 ; l[9-16] + PALIGNR m6, m7, 2, m0 ; l[10-17] + LOWPASS 6, 7, 1 ; L[9-16] + mova m3, [lq+mmsize*0] ; l[0-7] + PALIGNR m1, m3, 2, m0 ; l[1-8] + PALIGNR m7, m1, 2, m0 ; l[2-9] + LOWPASS 7, 1, 3 ; L[1-8] +%if cpuflag(ssse3) +%if cpuflag(avx) + UNSCRATCH 1, 8, rsp+0*mmsize +%endif + UNSCRATCH 3, 9, rsp+1*mmsize +%else + UNSCRATCH 0, 10, rsp+2*mmsize +%endif + DEFINE_ARGS dst8, stride, stride8, stride24, cnt + lea stride8q, [strideq*8] + lea stride24q, [stride8q*3] + lea dst8q, [dst8q+strideq*8] + mov cntd, 8 + +.loop: + sub dst8q, strideq +%if notcpuflag(avx) + UNSCRATCH 1, 8, rsp+0*mmsize +%if notcpuflag(ssse3) + UNSCRATCH 3, 9, rsp+1*mmsize +%endif +%endif + mova [dst8q+stride8q*0+ 0], m4 + mova [dst8q+stride8q*0+16], m3 + mova [dst8q+stride8q*0+32], m1 + mova [dst8q+stride8q*0+48], m0 + mova [dst8q+stride8q*1+ 0], m5 + mova [dst8q+stride8q*1+16], m4 + mova [dst8q+stride8q*1+32], m3 + mova [dst8q+stride8q*1+48], m1 + mova [dst8q+stride8q*2+ 0], m6 + mova [dst8q+stride8q*2+16], m5 + mova [dst8q+stride8q*2+32], m4 + mova [dst8q+stride8q*2+48], m3 + mova [dst8q+stride24q + 0], m7 + mova [dst8q+stride24q +16], m6 + mova [dst8q+stride24q +32], m5 + mova [dst8q+stride24q +48], m4 +%if cpuflag(avx) + vpalignr m7, m6, m7, 2 + vpalignr m6, m5, m6, 2 + vpalignr m5, m4, m5, 2 + vpalignr m4, m3, m4, 2 + vpalignr m3, m1, m3, 2 + vpalignr m1, m0, m1, 2 + vpalignr m0, m2, m0, 2 +%else + SCRATCH 2, 8, rsp+0*mmsize +%if notcpuflag(ssse3) + SCRATCH 0, 9, rsp+1*mmsize +%endif + PALIGNR m2, m6, m7, 2, m0 + mova m7, m2 + PALIGNR m2, m5, m6, 2, m0 + mova m6, m2 + PALIGNR m2, m4, m5, 2, m0 + mova m5, m2 + PALIGNR m2, m3, m4, 2, m0 + mova m4, m2 + PALIGNR m2, m1, m3, 2, m0 + mova m3, m2 +%if notcpuflag(ssse3) + UNSCRATCH 0, 9, rsp+1*mmsize + SCRATCH 3, 9, rsp+1*mmsize +%endif + PALIGNR m2, m0, m1, 2, m3 + mova m1, m2 + UNSCRATCH 2, 8, rsp+0*mmsize + SCRATCH 1, 8, rsp+0*mmsize + PALIGNR m1, m2, m0, 2, m3 + mova m0, m1 +%endif + psrldq m2, 2 + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DR_FUNCS 3 +INIT_XMM ssse3 +DR_FUNCS 2 +INIT_XMM avx +DR_FUNCS 2 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a + mova m0, [lq] ; klmnopqrstuvwxyz + movu m1, [aq-2] ; *abcdefghijklmno + mova m2, [aq] ; abcdefghijklmnop + vperm2i128 m4, m2, m2, q2001 ; ijklmnop........ + vpalignr m5, m4, m2, 2 ; bcdefghijklmnop. + vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg + LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO. + vpalignr m4, m3, m0, 2 ; lmnopqrstuvwxyz* + vpalignr m5, m3, m0, 4 ; mnopqrstuvwxyz*a + LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ# + vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH + DEFINE_ARGS dst, stride, stride3, stride5, dst3 + lea dst3q, [dstq+strideq*4] + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + + vpalignr m3, m5, m0, 2 + vpalignr m4, m1, m5, 2 + mova [dst3q+stride5q*2], m3 ; 14 + mova [ dstq+stride3q*2], m4 ; 6 + vpalignr m3, m5, m0, 4 + vpalignr m4, m1, m5, 4 + sub dst3q, strideq + mova [dst3q+stride5q*2], m3 ; 13 + mova [dst3q+strideq*2 ], m4 ; 5 + mova [dst3q+stride3q*4], m0 ; 15 + vpalignr m3, m5, m0, 6 + vpalignr m4, m1, m5, 6 + mova [dstq+stride3q*4], m3 ; 12 + mova [dst3q+strideq*1], m4 ; 4 + vpalignr m3, m5, m0, 8 + vpalignr m4, m1, m5, 8 + mova [dst3q+strideq*8], m3 ; 11 + mova [dst3q+strideq*0], m4 ; 3 + vpalignr m3, m5, m0, 10 + vpalignr m4, m1, m5, 10 + mova [dstq+stride5q*2], m3 ; 10 + mova [dstq+strideq*2 ], m4 ; 2 + vpalignr m3, m5, m0, 12 + vpalignr m4, m1, m5, 12 + mova [dst3q+stride3q*2], m3 ; 9 + mova [dstq+strideq*1 ], m4 ; 1 + vpalignr m3, m5, m0, 14 + vpalignr m4, m1, m5, 14 + mova [dstq+strideq*8], m3 ; 8 + mova [dstq+strideq*0], m4 ; 0 + mova [dst3q+strideq*4], m5 ; 7 + RET + +cglobal vp9_ipred_vl_16x16_16, 4, 5, 7, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefghijklmnop + vpbroadcastw xm1, [aq+30] ; pppppppp + vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp + vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp + vperm2i128 m4, m3, m1, q0201 ; jklmnopppppppppp + vpalignr m5, m2, m0, 4 ; cdefghijklmnoppp + vperm2i128 m6, m5, m1, q0201 ; klmnoppppppppppp + LOWPASS 5, 3, 0 ; BCDEFGHIJKLMNOPP + LOWPASS 6, 4, 2 ; JKLMNOPPPPPPPPPP + pavgw m3, m0 ; abcdefghijklmnop + pavgw m4, m2 ; ijklmnoppppppppp + DEFINE_ARGS dst, stride, stride3, stride5, dst4 + lea dst4q, [dstq+strideq*4] + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + + mova [dstq+strideq*0], m3 ; 0 abcdefghijklmnop + mova [dstq+strideq*1], m5 ; 1 BCDEFGHIJKLMNOPP + vpalignr m0, m4, m3, 2 + vpalignr m1, m6, m5, 2 + mova [dstq+strideq*2 ], m0 ; 2 bcdefghijklmnopp + mova [dstq+stride3q*1], m1 ; 3 CDEFGHIJKLMNOPPP + vpalignr m0, m4, m3, 4 + vpalignr m1, m6, m5, 4 + mova [dst4q+strideq*0], m0 ; 4 cdefghijklmnoppp + mova [dstq+stride5q*1], m1 ; 5 DEFGHIJKLMNOPPPP + vpalignr m0, m4, m3, 6 + vpalignr m1, m6, m5, 6 + mova [ dstq+stride3q*2], m0 ; 6 defghijklmnopppp + mova [dst4q+stride3q*1], m1 ; 7 EFGHIJKLMNOPPPPP + vpalignr m0, m4, m3, 8 + vpalignr m1, m6, m5, 8 + mova [ dstq+strideq*8], m0 ; 8 efghijklmnoppppp + mova [dst4q+stride5q*1], m1 ; 9 FGHIJKLMNOPPPPPP + vpalignr m0, m4, m3, 10 + mova [dstq+stride5q*2], m0 ; 10 fghijklmnopppppp + vpalignr m0, m4, m3, 12 + mova [dst4q+strideq*8], m0 ; 12 ghijklmnoppppppp + vpalignr m0, m4, m3, 14 + mova [dst4q+stride5q*2], m0 ; 14 hijklmnopppppppp + sub dst4q, strideq + vpalignr m1, m6, m5, 10 + mova [dst4q+strideq*8], m1 ; 11 GHIJKLMNOPPPPPPP + vpalignr m1, m6, m5, 12 + mova [dst4q+stride5q*2], m1 ; 13 HIJKLMNOPPPPPPPP + vpalignr m1, m6, m5, 14 + mova [dst4q+stride3q*4], m1 ; 15 IJKLMNOPPPPPPPPP + RET + +cglobal vp9_ipred_hd_16x16_16, 4, 5, 7, dst, stride, l, a + movu m0, [aq-2] ; *abcdefghijklmno + mova m1, [lq] ; klmnopqrstuvwxyz + vperm2i128 m2, m1, m0, q0201 ; stuvwxyz*abcdefg + vpalignr m3, m2, m1, 2 ; lmnopqrstuvwxyz* + vpalignr m4, m2, m1, 4 ; mnopqrstuvwxyz*a + LOWPASS 4, 3, 1 ; LMNOPQRSTUVWXYZ# + pavgw m3, m1 ; klmnopqrstuvwxyz + mova m1, [aq] ; abcdefghijklmnop + movu m2, [aq+2] ; bcdefghijklmnop. + LOWPASS 2, 1, 0 ; ABCDEFGHIJKLMNO. + vpunpcklwd m0, m3, m4 ; kLlMmNnOsTtUuVvW + vpunpckhwd m1, m3, m4 ; oPpQqRrSwXxYyZz# + vperm2i128 m3, m1, m0, q0002 ; kLlMmNnOoPpQqRrS + vperm2i128 m4, m0, m1, q0301 ; sTtUuVvWwXxYyZz# + vperm2i128 m0, m4, m2, q0201 ; wXxYyZz#ABCDEFGH + vperm2i128 m1, m3, m4, q0201 ; oPpQqRrSsTtUuVvW + DEFINE_ARGS dst, stride, stride3, stride5, dst5 + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + lea dst5q, [dstq+stride5q] + + mova [dst5q+stride5q*2], m3 ; 15 kLlMmNnOoPpQqRrS + mova [dst5q+stride3q*2], m1 ; 11 oPpQqRrSsTtUuVvW + mova [dst5q+strideq*2], m4 ; 7 sTtUuVvWwXxYyZz# + mova [dstq+stride3q*1], m0 ; 3 wXxYyZz#ABCDEFGH + vpalignr m5, m4, m1, 4 + mova [dstq+stride5q*2], m5 ; 10 pQqRrSsTtUuVvWwX + vpalignr m5, m0, m4, 4 + vpalignr m6, m2, m0, 4 + mova [dstq+stride3q*2], m5 ; 6 tUuVvWwXxYyZz#AB + mova [dstq+strideq*2], m6 ; 2 xYyZz#ABCDEFGHIJ + vpalignr m5, m4, m1, 8 + mova [dst5q+strideq*4], m5 ; 9 qRrSsTtUuVvWwXxY + vpalignr m5, m0, m4, 8 + vpalignr m6, m2, m0, 8 + mova [dstq+stride5q*1], m5 ; 5 uVvWwXxYyZz#ABCD + mova [dstq+strideq*1], m6 ; 1 yZz#ABCDEFGHIJKL + vpalignr m5, m1, m3, 12 + vpalignr m6, m4, m1, 12 + mova [dstq+stride3q*4], m5 ; 12 nOoPpQqRrSsTtUuV + mova [dst5q+stride3q], m6 ; 8 rSsTtUuVvWwXxYyZ + vpalignr m5, m0, m4, 12 + vpalignr m6, m2, m0, 12 + mova [dstq+strideq*4], m5 ; 4 nOoPpQqRrSsTtUuV + mova [dstq+strideq*0], m6 ; 0 z#ABCDEFGHIJKLMN + sub dst5q, strideq + vpalignr m5, m1, m3, 4 + mova [dst5q+stride5q*2], m5 ; 14 lMmNnOoPpQqRrSsT + sub dst5q, strideq + vpalignr m5, m1, m3, 8 + mova [dst5q+stride5q*2], m5 ; 13 mNnOoPpQqRrSsTtU + RET + +%if ARCH_X86_64 +cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a + mova m0, [lq+mmsize*0+0] ; l[0-15] + mova m1, [lq+mmsize*1+0] ; l[16-31] + movu m2, [aq+mmsize*0-2] ; *abcdefghijklmno + mova m3, [aq+mmsize*0+0] ; abcdefghijklmnop + mova m4, [aq+mmsize*1+0] ; qrstuvwxyz012345 + vperm2i128 m5, m0, m1, q0201 ; lmnopqrstuvwxyz0 + vpalignr m6, m5, m0, 2 ; mnopqrstuvwxyz01 + vpalignr m7, m5, m0, 4 ; nopqrstuvwxyz012 + LOWPASS 0, 6, 7 ; L[0-15] + vperm2i128 m7, m1, m2, q0201 ; stuvwxyz*abcdefg + vpalignr m5, m7, m1, 2 ; lmnopqrstuvwxyz* + vpalignr m6, m7, m1, 4 ; mnopqrstuvwxyz*a + LOWPASS 1, 5, 6 ; L[16-31]# + vperm2i128 m5, m3, m4, q0201 ; ijklmnopqrstuvwx + vpalignr m6, m5, m3, 2 ; bcdefghijklmnopq + LOWPASS 2, 3, 6 ; A[0-15] + movu m3, [aq+mmsize*1-2] ; pqrstuvwxyz01234 + vperm2i128 m6, m4, m4, q2001 ; yz012345........ + vpalignr m7, m6, m4, 2 ; rstuvwxyz012345. + LOWPASS 3, 4, 7 ; A[16-31]. + vperm2i128 m4, m1, m2, q0201 ; TUVWXYZ#ABCDEFGH + vperm2i128 m5, m0, m1, q0201 ; L[7-15]L[16-23] + vperm2i128 m8, m2, m3, q0201 ; IJKLMNOPQRSTUVWX + DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + lea stride7q, [strideq*4+stride3q] + lea dst24q, [dst8q+stride3q*8] + lea dst8q, [dst8q+strideq*8] + mov cntd, 2 + +.loop: + mova [dst24q+stride7q+0 ], m0 ; 31 23 15 7 + mova [dst24q+stride7q+32], m1 + mova [dst8q+stride7q+0], m1 + mova [dst8q+stride7q+32], m2 + vpalignr m6, m4, m1, 2 + vpalignr m7, m5, m0, 2 + vpalignr m9, m8, m2, 2 + mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6 + mova [dst24q+stride3q*2+32], m6 + mova [dst8q+stride3q*2+0], m6 + mova [dst8q+stride3q*2+32], m9 + vpalignr m6, m4, m1, 4 + vpalignr m7, m5, m0, 4 + vpalignr m9, m8, m2, 4 + mova [dst24q+stride5q+0], m7 ; 29 21 13 5 + mova [dst24q+stride5q+32], m6 + mova [dst8q+stride5q+0], m6 + mova [dst8q+stride5q+32], m9 + vpalignr m6, m4, m1, 6 + vpalignr m7, m5, m0, 6 + vpalignr m9, m8, m2, 6 + mova [dst24q+strideq*4+0 ], m7 ; 28 20 12 4 + mova [dst24q+strideq*4+32], m6 + mova [dst8q+strideq*4+0], m6 + mova [dst8q+strideq*4+32], m9 + vpalignr m6, m4, m1, 8 + vpalignr m7, m5, m0, 8 + vpalignr m9, m8, m2, 8 + mova [dst24q+stride3q+0 ], m7 ; 27 19 11 3 + mova [dst24q+stride3q+32], m6 + mova [dst8q+stride3q+0], m6 + mova [dst8q+stride3q+32], m9 + vpalignr m6, m4, m1, 10 + vpalignr m7, m5, m0, 10 + vpalignr m9, m8, m2, 10 + mova [dst24q+strideq*2+0 ], m7 ; 26 18 10 2 + mova [dst24q+strideq*2+32], m6 + mova [dst8q+strideq*2+0], m6 + mova [dst8q+strideq*2+32], m9 + vpalignr m6, m4, m1, 12 + vpalignr m7, m5, m0, 12 + vpalignr m9, m8, m2, 12 + mova [dst24q+strideq+0 ], m7 ; 25 17 9 1 + mova [dst24q+strideq+32], m6 + mova [dst8q+strideq+0], m6 + mova [dst8q+strideq+32], m9 + vpalignr m6, m4, m1, 14 + vpalignr m7, m5, m0, 14 + vpalignr m9, m8, m2, 14 + mova [dst24q+strideq*0+0 ], m7 ; 24 16 8 0 + mova [dst24q+strideq*0+32], m6 + mova [dst8q+strideq*0+0], m6 + mova [dst8q+strideq*0+32], m9 + mova m0, m5 + mova m5, m1 + mova m1, m4 + mova m4, m2 + mova m2, m8 + mova m8, m3 + sub dst24q, stride7q + sub dst24q, strideq + sub dst8q, stride7q + sub dst8q, strideq + dec cntd + jg .loop + RET +%endif +%endif + +%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function +cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a + movifnidn aq, amp + movu m0, [aq] ; abcdefgh + psrldq m1, m0, 2 ; bcdefgh. + psrldq m2, m0, 4 ; cdefgh.. + LOWPASS 2, 1, 0 ; BCDEFGH. + pavgw m1, m0 ; ABCDEFG. + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movh [dstq+strideq*0], m1 + movh [dstq+strideq*1], m2 + psrldq m1, 2 + psrldq m2, 2 + movh [dstq+strideq*2], m1 + movh [dstq+stride3q ], m2 + RET + +cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefgh +%if cpuflag(ssse3) + mova m3, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m1, m2, m0, m3 ; bcdefghh/cdefghhh + LOWPASS 2, 1, 0 ; BCDEFGHh + pavgw m1, m0 ; ABCDEFGh + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + SHIFT_RIGHT m1, m1, m3 + SHIFT_RIGHT m2, m2, m3 + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + SHIFT_RIGHT m1, m1, m3 + SHIFT_RIGHT m2, m2, m3 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + SHIFT_RIGHT m1, m1, m3 + SHIFT_RIGHT m2, m2, m3 + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + RET + +cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + mova m1, [aq+mmsize] + PALIGNR m2, m1, m0, 2, m3 + PALIGNR m3, m1, m0, 4, m4 + LOWPASS 3, 2, 0 + pavgw m2, m0 +%if cpuflag(ssse3) + mova m4, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m5, m0, m1, m4 + LOWPASS 0, 5, 1 + pavgw m1, m5 + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 + +.loop: + mova [dstq+strideq*0+ 0], m2 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*1+ 0], m3 + mova [dstq+strideq*1+16], m0 + lea dstq, [dstq+strideq*2] +%if cpuflag(avx) + vpalignr m2, m1, m2, 2 + vpalignr m3, m0, m3, 2 +%else + PALIGNR m5, m1, m2, 2, m4 + mova m2, m5 + PALIGNR m5, m0, m3, 2, m4 + mova m3, m5 +%endif + SHIFT_RIGHT m1, m1, m4 + SHIFT_RIGHT m0, m0, m4 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0] + mova m1, [aq+mmsize*1] + mova m2, [aq+mmsize*2] + PALIGNR m6, m1, m0, 2, m5 + PALIGNR m7, m1, m0, 4, m5 + LOWPASS 7, 6, 0 + pavgw m6, m0 + SCRATCH 6, 8, rsp+0*mmsize + PALIGNR m4, m2, m1, 2, m0 + PALIGNR m5, m2, m1, 4, m0 + LOWPASS 5, 4, 1 + pavgw m4, m1 + mova m0, [aq+mmsize*3] + PALIGNR m1, m0, m2, 2, m6 + PALIGNR m3, m0, m2, 4, m6 + LOWPASS 3, 1, 2 + pavgw m2, m1 +%if cpuflag(ssse3) + PRELOAD 10, pb_2to15_14_15, shuf +%endif + SHIFT_RIGHTx2 m6, m1, m0, reg_shuf + LOWPASS 1, 6, 0 + pavgw m0, m6 +%if ARCH_X86_64 + pshufd m9, m6, q3333 +%endif +%if cpuflag(avx) + UNSCRATCH 6, 8, rsp+0*mmsize +%endif + DEFINE_ARGS dst, stride, cnt, stride16, stride17 + mov stride16q, strideq + mov cntd, 8 + shl stride16q, 4 + lea stride17q, [stride16q+strideq] + + ; FIXME m8 is unused for avx, so we could save one register here for win64 +.loop: +%if notcpuflag(avx) + UNSCRATCH 6, 8, rsp+0*mmsize +%endif + mova [dstq+strideq*0+ 0], m6 + mova [dstq+strideq*0+16], m4 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m7 + mova [dstq+strideq*1+16], m5 + mova [dstq+strideq*1+32], m3 + mova [dstq+strideq*1+48], m1 + mova [dstq+stride16q+ 0], m4 + mova [dstq+stride16q+16], m2 + mova [dstq+stride16q+32], m0 +%if ARCH_X86_64 + mova [dstq+stride16q+48], m9 +%endif + mova [dstq+stride17q+ 0], m5 + mova [dstq+stride17q+16], m3 + mova [dstq+stride17q+32], m1 +%if ARCH_X86_64 + mova [dstq+stride17q+48], m9 +%endif + lea dstq, [dstq+strideq*2] +%if cpuflag(avx) + vpalignr m6, m4, m6, 2 + vpalignr m4, m2, m4, 2 + vpalignr m2, m0, m2, 2 + vpalignr m7, m5, m7, 2 + vpalignr m5, m3, m5, 2 + vpalignr m3, m1, m3, 2 +%else + SCRATCH 3, 8, rsp+0*mmsize +%if notcpuflag(ssse3) + SCRATCH 1, 10, rsp+1*mmsize +%endif + PALIGNR m3, m4, m6, 2, m1 + mova m6, m3 + PALIGNR m3, m2, m4, 2, m1 + mova m4, m3 + PALIGNR m3, m0, m2, 2, m1 + mova m2, m3 + PALIGNR m3, m5, m7, 2, m1 + mova m7, m3 + UNSCRATCH 3, 8, rsp+0*mmsize + SCRATCH 6, 8, rsp+0*mmsize +%if notcpuflag(ssse3) + UNSCRATCH 1, 10, rsp+1*mmsize + SCRATCH 7, 10, rsp+1*mmsize +%endif + PALIGNR m6, m3, m5, 2, m7 + mova m5, m6 + PALIGNR m6, m1, m3, 2, m7 + mova m3, m6 +%if notcpuflag(ssse3) + UNSCRATCH 7, 10, rsp+1*mmsize +%endif +%endif + SHIFT_RIGHT m1, m1, reg_shuf + SHIFT_RIGHT m0, m0, reg_shuf + dec cntd + jg .loop + +%if ARCH_X86_32 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] +%assign %%n 0 +%rep 4 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+48], m0 + mova [dstq+strideq*2+48], m0 + mova [dstq+stride3q +48], m0 +%if %%n < 3 + lea dstq, [dstq+strideq*4] +%endif +%assign %%n (%%n+1) +%endrep +%endif + RET +%endmacro + +INIT_XMM sse2 +VL_FUNCS 2 +INIT_XMM ssse3 +VL_FUNCS 1 +INIT_XMM avx +VL_FUNCS 1 + +%macro VR_FUNCS 0 +cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a + movu m0, [aq-2] + movhps m1, [lq] + PALIGNR m0, m1, 10, m2 ; xyz*abcd + pslldq m1, m0, 2 ; .xyz*abc + pslldq m2, m0, 4 ; ..xyz*ab + LOWPASS 2, 1, 0 ; ..YZ#ABC + pavgw m1, m0 ; ....#ABC + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movhps [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m2 + shufps m0, m2, m1, q3210 +%if cpuflag(ssse3) + pshufb m2, [pb_4_5_8to13_8x0] +%else + pshuflw m2, m2, q2222 + psrldq m2, 6 +%endif + psrldq m0, 6 + movh [dstq+strideq*2], m0 + movh [dstq+stride3q ], m2 + RET + +cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a + movu m1, [aq-2] ; *abcdefg + movu m2, [lq] ; stuvwxyz + mova m0, [aq] ; abcdefgh + PALIGNR m3, m1, m2, 14, m4 ; z*abcdef + LOWPASS 3, 1, 0 + pavgw m0, m1 + PALIGNR m1, m2, 2, m4 ; tuvwxyz* + pslldq m4, m2, 2 ; .stuvwxy + LOWPASS 4, 2, 1 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m3 + PALIGNR m0, m4, 14, m1 + pslldq m4, 2 + PALIGNR m3, m4, 14, m1 + pslldq m4, 2 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + PALIGNR m0, m4, 14, m1 + pslldq m4, 2 + PALIGNR m3, m4, 14, m1 + pslldq m4, 2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m3 + PALIGNR m0, m4, 14, m1 + pslldq m4, 2 + PALIGNR m3, m4, 14, m4 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m3 + RET + +cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a + movu m1, [aq-2] ; *abcdefg + movu m2, [aq+mmsize-2] ; hijklmno + mova m3, [aq] ; abcdefgh + mova m4, [aq+mmsize] ; ijklmnop + mova m5, [lq+mmsize] ; stuvwxyz + PALIGNR m0, m1, m5, 14, m6 ; z*abcdef + movu m6, [aq+mmsize-4] ; ghijklmn + LOWPASS 6, 2, 4 + pavgw m2, m4 + LOWPASS 0, 1, 3 + pavgw m3, m1 + PALIGNR m1, m5, 2, m7 ; tuvwxyz* + movu m7, [lq+mmsize-2] ; rstuvwxy + LOWPASS 1, 5, 7 + movu m5, [lq+2] ; lmnopqrs + pslldq m4, m5, 2 ; .lmnopqr + pslldq m7, m5, 4 ; ..lmnopq + LOWPASS 5, 4, 7 + psrld m4, m1, 16 + psrld m7, m5, 16 + pand m1, [pd_65535] + pand m5, [pd_65535] + packssdw m7, m4 + packssdw m5, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 + +.loop: + mova [dstq+strideq*0+ 0], m3 + mova [dstq+strideq*0+16], m2 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m6 + lea dstq, [dstq+strideq*2] + PALIGNR m2, m3, 14, m4 + PALIGNR m3, m7, 14, m4 + pslldq m7, 2 + PALIGNR m6, m0, 14, m4 + PALIGNR m0, m5, 14, m4 + pslldq m5, 2 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a + movu m0, [aq+mmsize*0-2] ; *a[0-6] + movu m1, [aq+mmsize*1-2] ; a[7-14] + movu m2, [aq+mmsize*2-2] ; a[15-22] + movu m3, [aq+mmsize*3-2] ; a[23-30] + mova m4, [aq+mmsize*3+0] ; a[24-31] + movu m5, [aq+mmsize*3-4] ; a[22-29] + LOWPASS 5, 3, 4 ; A[23-30] + SCRATCH 5, 8, rsp+0*mmsize + pavgw m3, m4 + mova m4, [aq+mmsize*2+0] ; a[16-23] + movu m6, [aq+mmsize*2-4] ; a[14-21] + LOWPASS 6, 2, 4 ; A[15-22] + SCRATCH 6, 9, rsp+1*mmsize + pavgw m2, m4 + mova m4, [aq+mmsize*1+0] ; a[8-15] + movu m7, [aq+mmsize*1-4] ; a[6-13] + LOWPASS 7, 1, 4 ; A[7-14] + SCRATCH 7, 10, rsp+2*mmsize + pavgw m1, m4 + mova m4, [aq+mmsize*0+0] ; a[0-7] + mova m5, [lq+mmsize*3+0] ; l[24-31] + PALIGNR m6, m0, m5, 14, m7 ; l[31]*a[0-5] + LOWPASS 6, 0, 4 ; #A[0-6] + SCRATCH 6, 11, rsp+3*mmsize + pavgw m4, m0 + PALIGNR m0, m5, 2, m7 ; l[25-31]* + movu m7, [lq+mmsize*3-2] ; l[23-30] + LOWPASS 0, 5, 7 ; L[24-31] + movu m5, [lq+mmsize*2-2] ; l[15-22] + mova m7, [lq+mmsize*2+0] ; l[16-23] + movu m6, [lq+mmsize*2+2] ; l[17-24] + LOWPASS 5, 7, 6 ; L[16-23] + psrld m7, m0, 16 + psrld m6, m5, 16 + pand m0, [pd_65535] + pand m5, [pd_65535] + packssdw m6, m7 + packssdw m5, m0 + SCRATCH 5, 12, rsp+4*mmsize + SCRATCH 6, 13, rsp+5*mmsize + movu m6, [lq+mmsize*1-2] ; l[7-14] + mova m0, [lq+mmsize*1+0] ; l[8-15] + movu m5, [lq+mmsize*1+2] ; l[9-16] + LOWPASS 6, 0, 5 ; L[8-15] + movu m0, [lq+mmsize*0+2] ; l[1-8] + pslldq m5, m0, 2 ; .l[1-7] + pslldq m7, m0, 4 ; ..l[1-6] + LOWPASS 0, 5, 7 + psrld m5, m6, 16 + psrld m7, m0, 16 + pand m6, [pd_65535] + pand m0, [pd_65535] + packssdw m7, m5 + packssdw m0, m6 + UNSCRATCH 6, 13, rsp+5*mmsize + DEFINE_ARGS dst, stride, stride16, cnt, stride17 + mov stride16q, strideq + mov cntd, 8 + shl stride16q, 4 +%if ARCH_X86_64 + lea stride17q, [stride16q+strideq] +%endif + +.loop: + mova [dstq+strideq*0+ 0], m4 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m3 +%if ARCH_X86_64 + mova [dstq+strideq*1+ 0], m11 + mova [dstq+strideq*1+16], m10 + mova [dstq+strideq*1+32], m9 + mova [dstq+strideq*1+48], m8 +%endif + mova [dstq+stride16q+ 0], m6 + mova [dstq+stride16q+16], m4 + mova [dstq+stride16q+32], m1 + mova [dstq+stride16q+48], m2 +%if ARCH_X86_64 + mova [dstq+stride17q+ 0], m12 + mova [dstq+stride17q+16], m11 + mova [dstq+stride17q+32], m10 + mova [dstq+stride17q+48], m9 +%endif + lea dstq, [dstq+strideq*2] + PALIGNR m3, m2, 14, m5 + PALIGNR m2, m1, 14, m5 + PALIGNR m1, m4, 14, m5 + PALIGNR m4, m6, 14, m5 + PALIGNR m6, m7, 14, m5 + pslldq m7, 2 +%if ARCH_X86_64 + PALIGNR m8, m9, 14, m5 + PALIGNR m9, m10, 14, m5 + PALIGNR m10, m11, 14, m5 + PALIGNR m11, m12, 14, m5 + PALIGNR m12, m0, 14, m5 + pslldq m0, 2 +%endif + dec cntd + jg .loop + +%if ARCH_X86_32 + UNSCRATCH 5, 12, rsp+4*mmsize + UNSCRATCH 4, 11, rsp+3*mmsize + UNSCRATCH 3, 10, rsp+2*mmsize + UNSCRATCH 2, 9, rsp+1*mmsize + UNSCRATCH 1, 8, rsp+0*mmsize + mov dstq, dstm + mov cntd, 8 + add dstq, strideq +.loop2: + mova [dstq+strideq*0+ 0], m4 + mova [dstq+strideq*0+16], m3 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m1 + mova [dstq+stride16q+ 0], m5 + mova [dstq+stride16q+16], m4 + mova [dstq+stride16q+32], m3 + mova [dstq+stride16q+48], m2 + lea dstq, [dstq+strideq*2] + PALIGNR m1, m2, 14, m6 + PALIGNR m2, m3, 14, m6 + PALIGNR m3, m4, 14, m6 + PALIGNR m4, m5, 14, m6 + PALIGNR m5, m0, 14, m6 + pslldq m0, 2 + dec cntd + jg .loop2 +%endif + RET +%endmacro + +INIT_XMM sse2 +VR_FUNCS +INIT_XMM ssse3 +VR_FUNCS +INIT_XMM avx +VR_FUNCS + +%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function +cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a + movh m0, [lq] ; abcd +%if cpuflag(ssse3) + pshufb m0, [pb_0to7_67x4] ; abcddddd +%else + punpcklqdq m0, m0 + pshufhw m0, m0, q3333 ; abcddddd +%endif + psrldq m1, m0, 2 ; bcddddd. + psrldq m2, m0, 4 ; cddddd.. + LOWPASS 2, 1, 0 ; BCDddd.. + pavgw m1, m0 ; abcddddd + SBUTTERFLY wd, 1, 2, 0 ; aBbCcDdd, dddddddd + PALIGNR m2, m1, 4, m0 ; bCcDdddd + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movh [dstq+strideq*0], m1 ; aBbC + movh [dstq+strideq*1], m2 ; bCcD + movhps [dstq+strideq*2], m1 ; cDdd + movhps [dstq+stride3q ], m2 ; dddd + RET + +cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a + mova m0, [lq] +%if cpuflag(ssse3) + mova m3, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m1, m2, m0, m3 + LOWPASS 2, 1, 0 + pavgw m1, m0 + SBUTTERFLY wd, 1, 2, 0 + shufps m0, m1, m2, q1032 + pshufd m3, m2, q3332 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + mova [dstq+strideq *0], m1 + mova [dstq+strideq *2], m0 + mova [dstq+strideq *4], m2 + mova [dstq+stride3q*2], m3 + add dstq, strideq +%if cpuflag(avx) + vpalignr m1, m2, m1, 4 +%else + PALIGNR m0, m2, m1, 4, m3 + mova m1, m0 +%endif + pshufd m2, m2, q3321 + shufps m0, m1, m2, q1032 + pshufd m3, m2, q3332 + mova [dstq+strideq *0], m1 + mova [dstq+strideq *2], m0 + mova [dstq+strideq *4], m2 + mova [dstq+stride3q*2], m3 + RET + +cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a + mova m0, [lq] + mova m3, [lq+mmsize] + movu m1, [lq+2] + movu m2, [lq+4] + LOWPASS 2, 1, 0 + pavgw m1, m0 + SBUTTERFLY wd, 1, 2, 0 +%if cpuflag(ssse3) + mova m5, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m0, m4, m3, m5 + LOWPASS 4, 0, 3 + pavgw m3, m0 + SBUTTERFLY wd, 3, 4, 5 + pshufd m0, m0, q3333 + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + +.loop: + mova [dstq+strideq *0+ 0], m1 + mova [dstq+strideq *0+16], m2 + mova [dstq+strideq *4+ 0], m2 + mova [dstq+strideq *4+16], m3 + mova [dstq+strideq *8+ 0], m3 + mova [dstq+strideq *8+16], m4 + mova [dstq+stride3q*4+ 0], m4 + mova [dstq+stride3q*4+16], m0 + add dstq, strideq +%if cpuflag(avx) + vpalignr m1, m2, m1, 4 + vpalignr m2, m3, m2, 4 + vpalignr m3, m4, m3, 4 + vpalignr m4, m0, m4, 4 +%else + PALIGNR m5, m2, m1, 4, m6 + mova m1, m5 + PALIGNR m5, m3, m2, 4, m6 + mova m2, m5 + PALIGNR m5, m4, m3, 4, m6 + mova m3, m5 + PALIGNR m5, m0, m4, 4, m6 + mova m4, m5 +%endif + dec cntd + jg .loop + RET + +cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \ + %1 * -mmsize * ARCH_X86_32, dst, stride, l, a + mova m2, [lq+mmsize*0+0] + movu m1, [lq+mmsize*0+2] + movu m0, [lq+mmsize*0+4] + LOWPASS 0, 1, 2 + pavgw m1, m2 + SBUTTERFLY wd, 1, 0, 2 + SCRATCH 1, 8, rsp+0*mmsize + mova m4, [lq+mmsize*1+0] + movu m3, [lq+mmsize*1+2] + movu m2, [lq+mmsize*1+4] + LOWPASS 2, 3, 4 + pavgw m3, m4 + SBUTTERFLY wd, 3, 2, 4 + mova m6, [lq+mmsize*2+0] + movu m5, [lq+mmsize*2+2] + movu m4, [lq+mmsize*2+4] + LOWPASS 4, 5, 6 + pavgw m5, m6 + SBUTTERFLY wd, 5, 4, 6 + mova m7, [lq+mmsize*3+0] + SCRATCH 0, 9, rsp+1*mmsize +%if cpuflag(ssse3) + mova m0, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m1, m6, m7, m0 + LOWPASS 6, 1, 7 + pavgw m7, m1 + SBUTTERFLY wd, 7, 6, 0 + pshufd m1, m1, q3333 + UNSCRATCH 0, 9, rsp+1*mmsize + DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28 + lea stride3q, [strideq*3] + lea stride4q, [strideq*4] + lea stride28q, [stride4q*8] + lea stride20q, [stride4q*5] + sub stride28q, stride4q + mov cntd, 4 + +.loop: +%if ARCH_X86_64 + SWAP 1, 8 +%else + mova [rsp+1*mmsize], m1 + mova m1, [rsp+0*mmsize] +%endif + mova [dstq+strideq *0+ 0], m1 + mova [dstq+strideq *0+16], m0 + mova [dstq+strideq *0+32], m3 + mova [dstq+strideq *0+48], m2 + mova [dstq+stride4q*1+ 0], m0 + mova [dstq+stride4q*1+16], m3 + mova [dstq+stride4q*1+32], m2 + mova [dstq+stride4q*1+48], m5 + mova [dstq+stride4q*2+ 0], m3 + mova [dstq+stride4q*2+16], m2 + mova [dstq+stride4q*2+32], m5 + mova [dstq+stride4q*2+48], m4 +%if cpuflag(avx) + vpalignr m1, m0, m1, 4 + vpalignr m0, m3, m0, 4 + vpalignr m3, m2, m3, 4 +%else + SCRATCH 6, 9, rsp+2*mmsize +%if notcpuflag(ssse3) + SCRATCH 7, 10, rsp+3*mmsize +%endif + PALIGNR m6, m0, m1, 4, m7 + mova m1, m6 + PALIGNR m6, m3, m0, 4, m7 + mova m0, m6 + PALIGNR m6, m2, m3, 4, m7 + mova m3, m6 + UNSCRATCH 6, 9, rsp+2*mmsize + SCRATCH 0, 9, rsp+2*mmsize +%if notcpuflag(ssse3) + UNSCRATCH 7, 10, rsp+3*mmsize + SCRATCH 3, 10, rsp+3*mmsize +%endif +%endif +%if ARCH_X86_64 + SWAP 1, 8 +%else + mova [rsp+0*mmsize], m1 + mova m1, [rsp+1*mmsize] +%endif + mova [dstq+stride3q*4+ 0], m2 + mova [dstq+stride3q*4+16], m5 + mova [dstq+stride3q*4+32], m4 + mova [dstq+stride3q*4+48], m7 + mova [dstq+stride4q*4+ 0], m5 + mova [dstq+stride4q*4+16], m4 + mova [dstq+stride4q*4+32], m7 + mova [dstq+stride4q*4+48], m6 + mova [dstq+stride20q + 0], m4 + mova [dstq+stride20q +16], m7 + mova [dstq+stride20q +32], m6 + mova [dstq+stride20q +48], m1 + mova [dstq+stride3q*8+ 0], m7 + mova [dstq+stride3q*8+16], m6 + mova [dstq+stride3q*8+32], m1 + mova [dstq+stride3q*8+48], m1 + mova [dstq+stride28q + 0], m6 + mova [dstq+stride28q +16], m1 + mova [dstq+stride28q +32], m1 + mova [dstq+stride28q +48], m1 +%if cpuflag(avx) + vpalignr m2, m5, m2, 4 + vpalignr m5, m4, m5, 4 + vpalignr m4, m7, m4, 4 + vpalignr m7, m6, m7, 4 + vpalignr m6, m1, m6, 4 +%else + PALIGNR m0, m5, m2, 4, m3 + mova m2, m0 + PALIGNR m0, m4, m5, 4, m3 + mova m5, m0 + PALIGNR m0, m7, m4, 4, m3 + mova m4, m0 + PALIGNR m0, m6, m7, 4, m3 + mova m7, m0 + PALIGNR m0, m1, m6, 4, m3 + mova m6, m0 + UNSCRATCH 0, 9, rsp+2*mmsize +%if notcpuflag(ssse3) + UNSCRATCH 3, 10, rsp+3*mmsize +%endif +%endif + add dstq, strideq + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +HU_FUNCS 4 +INIT_XMM ssse3 +HU_FUNCS 3 +INIT_XMM avx +HU_FUNCS 2 + +%macro HD_FUNCS 0 +cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a + movh m0, [lq] + movhps m0, [aq-2] + psrldq m1, m0, 2 + psrldq m2, m0, 4 + LOWPASS 2, 1, 0 + pavgw m1, m0 + punpcklwd m1, m2 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movh [dstq+stride3q ], m1 + movhps [dstq+strideq*1], m1 + movhlps m2, m2 + PALIGNR m2, m1, 4, m0 + movh [dstq+strideq*2], m2 + movhps [dstq+strideq*0], m2 + RET + +cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a + mova m0, [lq] + movu m1, [aq-2] + PALIGNR m2, m1, m0, 2, m3 + PALIGNR m3, m1, m0, 4, m4 + LOWPASS 3, 2, 0 + pavgw m2, m0 + SBUTTERFLY wd, 2, 3, 0 + psrldq m0, m1, 2 + psrldq m4, m1, 4 + LOWPASS 1, 0, 4 + DEFINE_ARGS dst8, mstride, cnt + lea dst8q, [dst8q+mstrideq*8] + neg mstrideq + mov cntd, 4 + +.loop: + add dst8q, mstrideq + mova [dst8q+mstrideq*0], m2 + mova [dst8q+mstrideq*4], m3 +%if cpuflag(avx) + vpalignr m2, m3, m2, 4 + vpalignr m3, m1, m3, 4 +%else + PALIGNR m0, m3, m2, 4, m4 + mova m2, m0 + PALIGNR m0, m1, m3, 4, m4 + mova m3, m0 +%endif + psrldq m1, 4 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a + mova m2, [lq] + movu m1, [lq+2] + movu m0, [lq+4] + LOWPASS 0, 1, 2 + pavgw m1, m2 + mova m4, [lq+mmsize] + movu m5, [aq-2] + PALIGNR m3, m5, m4, 2, m6 + PALIGNR m2, m5, m4, 4, m6 + LOWPASS 2, 3, 4 + pavgw m3, m4 + SBUTTERFLY wd, 1, 0, 4 + SBUTTERFLY wd, 3, 2, 4 + mova m6, [aq] + movu m4, [aq+2] + LOWPASS 4, 6, 5 + movu m5, [aq+mmsize-2] + psrldq m6, m5, 2 + psrldq m7, m5, 4 + LOWPASS 5, 6, 7 + DEFINE_ARGS dst, mstride, mstride3, cnt + lea dstq, [dstq+mstrideq*8] + lea dstq, [dstq+mstrideq*8] + neg mstrideq + lea mstride3q, [mstrideq*3] + mov cntd, 4 + +.loop: + add dstq, mstrideq + mova [dstq+mstride3q*4+ 0], m2 + mova [dstq+mstride3q*4+16], m4 + mova [dstq+mstrideq *8+ 0], m3 + mova [dstq+mstrideq *8+16], m2 + mova [dstq+mstrideq *4+ 0], m0 + mova [dstq+mstrideq *4+16], m3 + mova [dstq+mstrideq *0+ 0], m1 + mova [dstq+mstrideq *0+16], m0 +%if cpuflag(avx) + vpalignr m1, m0, m1, 4 + vpalignr m0, m3, m0, 4 + vpalignr m3, m2, m3, 4 + vpalignr m2, m4, m2, 4 + vpalignr m4, m5, m4, 4 +%else + PALIGNR m6, m0, m1, 4, m7 + mova m1, m6 + PALIGNR m6, m3, m0, 4, m7 + mova m0, m6 + PALIGNR m6, m2, m3, 4, m7 + mova m3, m6 + PALIGNR m6, m4, m2, 4, m7 + mova m2, m6 + PALIGNR m6, m5, m4, 4, m7 + mova m4, m6 +%endif + psrldq m5, 4 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \ + 10 * -mmsize * ARCH_X86_32, dst, stride, l, a + mova m2, [lq+mmsize*0+0] + movu m1, [lq+mmsize*0+2] + movu m0, [lq+mmsize*0+4] + LOWPASS 0, 1, 2 + pavgw m1, m2 + SBUTTERFLY wd, 1, 0, 2 + mova m4, [lq+mmsize*1+0] + movu m3, [lq+mmsize*1+2] + movu m2, [lq+mmsize*1+4] + LOWPASS 2, 3, 4 + pavgw m3, m4 + SBUTTERFLY wd, 3, 2, 4 + SCRATCH 0, 8, rsp+0*mmsize + SCRATCH 1, 9, rsp+1*mmsize + SCRATCH 2, 10, rsp+2*mmsize + SCRATCH 3, 11, rsp+3*mmsize + mova m6, [lq+mmsize*2+0] + movu m5, [lq+mmsize*2+2] + movu m4, [lq+mmsize*2+4] + LOWPASS 4, 5, 6 + pavgw m5, m6 + SBUTTERFLY wd, 5, 4, 6 + mova m0, [lq+mmsize*3+0] + movu m1, [aq+mmsize*0-2] + PALIGNR m7, m1, m0, 2, m2 + PALIGNR m6, m1, m0, 4, m2 + LOWPASS 6, 7, 0 + pavgw m7, m0 + SBUTTERFLY wd, 7, 6, 0 + mova m2, [aq+mmsize*0+0] + movu m0, [aq+mmsize*0+2] + LOWPASS 0, 2, 1 + movu m1, [aq+mmsize*1-2] + mova m2, [aq+mmsize*1+0] + movu m3, [aq+mmsize*1+2] + LOWPASS 1, 2, 3 + SCRATCH 6, 12, rsp+6*mmsize + SCRATCH 7, 13, rsp+7*mmsize + movu m2, [aq+mmsize*2-2] + mova m3, [aq+mmsize*2+0] + movu m6, [aq+mmsize*2+2] + LOWPASS 2, 3, 6 + movu m3, [aq+mmsize*3-2] + psrldq m6, m3, 2 + psrldq m7, m3, 4 + LOWPASS 3, 6, 7 + UNSCRATCH 6, 12, rsp+6*mmsize + UNSCRATCH 7, 13, rsp+7*mmsize +%if ARCH_X86_32 + mova [rsp+4*mmsize], m4 + mova [rsp+5*mmsize], m5 + ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need + ; to do it again here +%endif + DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28 + mov cntd, 4 + lea stride3q, [strideq*3] +%if ARCH_X86_64 + lea stride4q, [strideq*4] + lea stride28q, [stride4q*8] + lea stride20q, [stride4q*5] + sub stride28q, stride4q +%endif + add dstq, stride3q + + ; x86-32 doesn't have enough registers, so on that platform, we split + ; the loop in 2... Otherwise you spend most of the loop (un)scratching +.loop: +%if ARCH_X86_64 + mova [dstq+stride28q + 0], m9 + mova [dstq+stride28q +16], m8 + mova [dstq+stride28q +32], m11 + mova [dstq+stride28q +48], m10 + mova [dstq+stride3q*8+ 0], m8 + mova [dstq+stride3q*8+16], m11 + mova [dstq+stride3q*8+32], m10 + mova [dstq+stride3q*8+48], m5 + mova [dstq+stride20q + 0], m11 + mova [dstq+stride20q +16], m10 + mova [dstq+stride20q +32], m5 + mova [dstq+stride20q +48], m4 + mova [dstq+stride4q*4+ 0], m10 + mova [dstq+stride4q*4+16], m5 + mova [dstq+stride4q*4+32], m4 + mova [dstq+stride4q*4+48], m7 +%endif + mova [dstq+stride3q*4+ 0], m5 + mova [dstq+stride3q*4+16], m4 + mova [dstq+stride3q*4+32], m7 + mova [dstq+stride3q*4+48], m6 + mova [dstq+strideq* 8+ 0], m4 + mova [dstq+strideq* 8+16], m7 + mova [dstq+strideq* 8+32], m6 + mova [dstq+strideq* 8+48], m0 + mova [dstq+strideq* 4+ 0], m7 + mova [dstq+strideq* 4+16], m6 + mova [dstq+strideq* 4+32], m0 + mova [dstq+strideq* 4+48], m1 + mova [dstq+strideq* 0+ 0], m6 + mova [dstq+strideq* 0+16], m0 + mova [dstq+strideq* 0+32], m1 + mova [dstq+strideq* 0+48], m2 + sub dstq, strideq +%if cpuflag(avx) +%if ARCH_X86_64 + vpalignr m9, m8, m9, 4 + vpalignr m8, m11, m8, 4 + vpalignr m11, m10, m11, 4 + vpalignr m10, m5, m10, 4 +%endif + vpalignr m5, m4, m5, 4 + vpalignr m4, m7, m4, 4 + vpalignr m7, m6, m7, 4 + vpalignr m6, m0, m6, 4 + vpalignr m0, m1, m0, 4 + vpalignr m1, m2, m1, 4 + vpalignr m2, m3, m2, 4 +%else +%if ARCH_X86_64 + PALIGNR m12, m8, m9, 4, m13 + mova m9, m12 + PALIGNR m12, m11, m8, 4, m13 + mova m8, m12 + PALIGNR m12, m10, m11, 4, m13 + mova m11, m12 + PALIGNR m12, m5, m10, 4, m13 + mova m10, m12 +%endif + SCRATCH 3, 12, rsp+8*mmsize, sh +%if notcpuflag(ssse3) + SCRATCH 2, 13, rsp+9*mmsize +%endif + PALIGNR m3, m4, m5, 4, m2 + mova m5, m3 + PALIGNR m3, m7, m4, 4, m2 + mova m4, m3 + PALIGNR m3, m6, m7, 4, m2 + mova m7, m3 + PALIGNR m3, m0, m6, 4, m2 + mova m6, m3 + PALIGNR m3, m1, m0, 4, m2 + mova m0, m3 +%if notcpuflag(ssse3) + UNSCRATCH 2, 13, rsp+9*mmsize + SCRATCH 0, 13, rsp+9*mmsize +%endif + PALIGNR m3, m2, m1, 4, m0 + mova m1, m3 + PALIGNR m3, reg_sh, m2, 4, m0 + mova m2, m3 +%if notcpuflag(ssse3) + UNSCRATCH 0, 13, rsp+9*mmsize +%endif + UNSCRATCH 3, 12, rsp+8*mmsize, sh +%endif + psrldq m3, 4 + dec cntd + jg .loop + +%if ARCH_X86_32 + UNSCRATCH 0, 8, rsp+0*mmsize + UNSCRATCH 1, 9, rsp+1*mmsize + UNSCRATCH 2, 10, rsp+2*mmsize + UNSCRATCH 3, 11, rsp+3*mmsize + mova m4, [rsp+4*mmsize] + mova m5, [rsp+5*mmsize] + mova m6, [rsp+6*mmsize] + mova m7, [rsp+7*mmsize] + DEFINE_ARGS dst, stride, stride5, stride3 + lea stride5q, [strideq*5] + lea dstq, [dstq+stride5q*4] + DEFINE_ARGS dst, stride, cnt, stride3 + mov cntd, 4 +.loop_2: + mova [dstq+stride3q*4+ 0], m1 + mova [dstq+stride3q*4+16], m0 + mova [dstq+stride3q*4+32], m3 + mova [dstq+stride3q*4+48], m2 + mova [dstq+strideq* 8+ 0], m0 + mova [dstq+strideq* 8+16], m3 + mova [dstq+strideq* 8+32], m2 + mova [dstq+strideq* 8+48], m5 + mova [dstq+strideq* 4+ 0], m3 + mova [dstq+strideq* 4+16], m2 + mova [dstq+strideq* 4+32], m5 + mova [dstq+strideq* 4+48], m4 + mova [dstq+strideq* 0+ 0], m2 + mova [dstq+strideq* 0+16], m5 + mova [dstq+strideq* 0+32], m4 + mova [dstq+strideq* 0+48], m7 + sub dstq, strideq +%if cpuflag(avx) + vpalignr m1, m0, m1, 4 + vpalignr m0, m3, m0, 4 + vpalignr m3, m2, m3, 4 + vpalignr m2, m5, m2, 4 + vpalignr m5, m4, m5, 4 + vpalignr m4, m7, m4, 4 + vpalignr m7, m6, m7, 4 +%else + SCRATCH 6, 12, rsp+8*mmsize, sh +%if notcpuflag(ssse3) + SCRATCH 7, 13, rsp+9*mmsize +%endif + PALIGNR m6, m0, m1, 4, m7 + mova m1, m6 + PALIGNR m6, m3, m0, 4, m7 + mova m0, m6 + PALIGNR m6, m2, m3, 4, m7 + mova m3, m6 + PALIGNR m6, m5, m2, 4, m7 + mova m2, m6 + PALIGNR m6, m4, m5, 4, m7 + mova m5, m6 +%if notcpuflag(ssse3) + UNSCRATCH 7, 13, rsp+9*mmsize + SCRATCH 5, 13, rsp+9*mmsize +%endif + PALIGNR m6, m7, m4, 4, m5 + mova m4, m6 + PALIGNR m6, reg_sh, m7, 4, m5 + mova m7, m6 +%if notcpuflag(ssse3) + UNSCRATCH 5, 13, rsp+9*mmsize +%endif + UNSCRATCH 6, 12, rsp+8*mmsize, sh +%endif + psrldq m6, 4 + dec cntd + jg .loop_2 +%endif + RET +%endmacro + +INIT_XMM sse2 +HD_FUNCS +INIT_XMM ssse3 +HD_FUNCS +INIT_XMM avx +HD_FUNCS diff --git a/media/ffvpx/libavcodec/x86/vp9itxfm.asm b/media/ffvpx/libavcodec/x86/vp9itxfm.asm new file mode 100644 index 0000000000..2c63fe514a --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9itxfm.asm @@ -0,0 +1,3197 @@ +;****************************************************************************** +;* VP9 IDCT SIMD optimizations +;* +;* Copyright (C) 2013 Clément Bœsch <u pkh me> +;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" +%include "vp9itxfm_template.asm" + +SECTION_RODATA 32 + +%macro VP9_IDCT_COEFFS 2-3 0 +const pw_m%1_%2 +times 8 dw -%1, %2 +const pw_%2_%1 +times 8 dw %2, %1 + +%if %3 == 1 +const pw_m%2_m%1 +times 8 dw -%2, -%1 +%if %1 != %2 +const pw_m%2_%1 +times 8 dw -%2, %1 +const pw_%1_%2 +times 8 dw %1, %2 +%endif +%endif + +%if %1 < 11585 +pw_m%1x2: times 16 dw -%1*2 +%elif %1 > 11585 +pw_%1x2: times 16 dw %1*2 +%else +const pw_%1x2 +times 16 dw %1*2 +%endif + +%if %2 != %1 +pw_%2x2: times 16 dw %2*2 +%endif +%endmacro + +VP9_IDCT_COEFFS 16364, 804 +VP9_IDCT_COEFFS 16305, 1606 +VP9_IDCT_COEFFS 16069, 3196, 1 +VP9_IDCT_COEFFS 15893, 3981 +VP9_IDCT_COEFFS 15137, 6270, 1 +VP9_IDCT_COEFFS 14811, 7005 +VP9_IDCT_COEFFS 14449, 7723 +VP9_IDCT_COEFFS 13160, 9760 +VP9_IDCT_COEFFS 11585, 11585, 1 +VP9_IDCT_COEFFS 11003, 12140 +VP9_IDCT_COEFFS 10394, 12665 +VP9_IDCT_COEFFS 9102, 13623, 1 +VP9_IDCT_COEFFS 8423, 14053 +VP9_IDCT_COEFFS 5520, 15426 +VP9_IDCT_COEFFS 4756, 15679 +VP9_IDCT_COEFFS 2404, 16207 + +const pw_5283_13377 +times 4 dw 5283, 13377 +const pw_9929_13377 +times 4 dw 9929, 13377 +const pw_15212_m13377 +times 4 dw 15212, -13377 +const pw_15212_9929 +times 4 dw 15212, 9929 +const pw_m5283_m15212 +times 4 dw -5283, -15212 +const pw_13377x2 +times 8 dw 13377*2 +const pw_m13377_13377 +times 4 dw -13377, 13377 +const pw_13377_0 +times 4 dw 13377, 0 + +cextern pw_8 +cextern pw_16 +cextern pw_32 +cextern pw_512 +cextern pw_1024 +cextern pw_2048 +cextern pw_m1 +cextern pd_8192 + +SECTION .text + +%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2 + punpckhwd m%4, m%2, m%1 + punpcklwd m%2, m%1 + pmaddwd m%3, m%4, [pw_m%5_%6] + pmaddwd m%4, [pw_%6_%5] + pmaddwd m%1, m%2, [pw_m%5_%6] + pmaddwd m%2, [pw_%6_%5] +%endmacro + +%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round + SUMSUB_BA d, %1, %2, %5 + SUMSUB_BA d, %3, %4, %5 + paddd m%1, %6 + paddd m%2, %6 + paddd m%3, %6 + paddd m%4, %6 + psrad m%1, 14 + psrad m%2, 14 + psrad m%3, 14 + psrad m%4, 14 + packssdw m%1, m%3 + packssdw m%2, m%4 +%endmacro + +%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst +%if mmsize == 32 + pmovzxbw m%3, [%6] + pmovzxbw m%4, [%6+strideq] +%else + movh m%3, [%6] + movh m%4, [%6+strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 +%endif + paddw m%3, m%1 + paddw m%4, m%2 +%if mmsize == 32 + packuswb m%3, m%4 + ; Intel... + vpermq m%3, m%3, q3120 + mova [%6], xm%3 + vextracti128 [%6+strideq], m%3, 1 +%elif mmsize == 16 + packuswb m%3, m%4 + movh [%6], m%3 + movhps [%6+strideq], m%3 +%else + packuswb m%3, m%5 + packuswb m%4, m%5 + movh [%6], m%3 + movh [%6+strideq], m%4 +%endif +%endmacro + +%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg +%assign %%y 0 +%rep %3 +%assign %%x 0 +%rep %3*2/mmsize + mova [%1+%%y+%%x], %4 +%assign %%x (%%x+mmsize) +%endrep +%assign %%y (%%y+%2) +%endrep +%endmacro + +;------------------------------------------------------------------------------------------- +; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +INIT_MMX mmx +cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob + mova m0, [blockq+0*8] + mova m1, [blockq+1*8] + mova m2, [blockq+2*8] + mova m3, [blockq+3*8] + psraw m0, 2 + psraw m1, 2 + psraw m2, 2 + psraw m3, 2 + + VP9_IWHT4_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IWHT4_1D + + pxor m4, m4 + VP9_STORE_2X 0, 1, 5, 6, 4 + lea dstq, [dstq+strideq*2] + VP9_STORE_2X 2, 3, 5, 6, 4 + ZERO_BLOCK blockq, 8, 4, m4 + RET + +;------------------------------------------------------------------------------------------- +; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +; 2x2 top left corner +%macro VP9_IDCT4_2x2_1D 0 + pmulhrsw m0, m5 ; m0=t1 + mova m2, m0 ; m2=t0 + mova m3, m1 + pmulhrsw m1, m6 ; m1=t2 + pmulhrsw m3, m7 ; m3=t3 + VP9_IDCT4_1D_FINALIZE +%endmacro + +%macro VP9_IDCT4_WRITEOUT 0 +%if cpuflag(ssse3) + mova m5, [pw_2048] + pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 + pmulhrsw m1, m5 +%else + mova m5, [pw_8] + paddw m0, m5 + paddw m1, m5 + psraw m0, 4 + psraw m1, 4 +%endif + VP9_STORE_2X 0, 1, 6, 7, 4 + lea dstq, [dstq+2*strideq] +%if cpuflag(ssse3) + pmulhrsw m2, m5 + pmulhrsw m3, m5 +%else + paddw m2, m5 + paddw m3, m5 + psraw m2, 4 + psraw m3, 4 +%endif + VP9_STORE_2X 2, 3, 6, 7, 4 +%endmacro + +%macro IDCT_4x4_FN 1 +INIT_MMX %1 +cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob + +%if cpuflag(ssse3) + cmp eobd, 4 ; 2x2 or smaller + jg .idctfull + + cmp eobd, 1 ; faster path for when only DC is set + jne .idct2x2 +%else + cmp eobd, 1 + jg .idctfull +%endif + +%if cpuflag(ssse3) + movd m0, [blockq] + mova m5, [pw_11585x2] + pmulhrsw m0, m5 + pmulhrsw m0, m5 +%else + DEFINE_ARGS dst, stride, block, coef + movsx coefd, word [blockq] + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, (8 << 14) + 8192 + sar coefd, 14 + 4 + movd m0, coefd +%endif + pshufw m0, m0, 0 + pxor m4, m4 + movh [blockq], m4 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 +%endif + VP9_STORE_2X 0, 0, 6, 7, 4 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 0, 0, 6, 7, 4 + RET + +%if cpuflag(ssse3) +; faster path for when only top left 2x2 block is set +.idct2x2: + movd m0, [blockq+0] + movd m1, [blockq+8] + mova m5, [pw_11585x2] + mova m6, [pw_6270x2] + mova m7, [pw_15137x2] + VP9_IDCT4_2x2_1D + ; partial 2x4 transpose + punpcklwd m0, m1 + punpcklwd m2, m3 + SBUTTERFLY dq, 0, 2, 1 + SWAP 1, 2 + VP9_IDCT4_2x2_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + movh [blockq+ 0], m4 + movh [blockq+ 8], m4 + VP9_IDCT4_WRITEOUT + RET +%endif + +.idctfull: ; generic full 4x4 idct/idct + mova m0, [blockq+ 0] + mova m1, [blockq+ 8] + mova m2, [blockq+16] + mova m3, [blockq+24] +%if cpuflag(ssse3) + mova m6, [pw_11585x2] +%endif + mova m7, [pd_8192] ; rounding + VP9_IDCT4_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IDCT4_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + mova [blockq+ 0], m4 + mova [blockq+ 8], m4 + mova [blockq+16], m4 + mova [blockq+24], m4 + VP9_IDCT4_WRITEOUT + RET +%endmacro + +IDCT_4x4_FN mmxext +IDCT_4x4_FN ssse3 + +;------------------------------------------------------------------------------------------- +; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +%macro IADST4_FN 5 +INIT_MMX %5 +cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob +%if WIN64 && notcpuflag(ssse3) + WIN64_SPILL_XMM 8 +%endif + movdqa xmm5, [pd_8192] + mova m0, [blockq+ 0] + mova m1, [blockq+ 8] + mova m2, [blockq+16] + mova m3, [blockq+24] +%if cpuflag(ssse3) + mova m6, [pw_11585x2] +%endif +%ifnidn %1%3, iadstiadst + movdq2q m7, xmm5 +%endif + VP9_%2_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_%4_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + mova [blockq+ 0], m4 + mova [blockq+ 8], m4 + mova [blockq+16], m4 + mova [blockq+24], m4 + VP9_IDCT4_WRITEOUT + RET +%endmacro + +IADST4_FN idct, IDCT4, iadst, IADST4, sse2 +IADST4_FN iadst, IADST4, idct, IDCT4, sse2 +IADST4_FN iadst, IADST4, iadst, IADST4, sse2 + +IADST4_FN idct, IDCT4, iadst, IADST4, ssse3 +IADST4_FN iadst, IADST4, idct, IDCT4, ssse3 +IADST4_FN iadst, IADST4, iadst, IADST4, ssse3 + +%macro SCRATCH 3 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova [%3], m%1 +%endif +%endmacro + +%macro UNSCRATCH 3 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%endmacro + +;------------------------------------------------------------------------------------------- +; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +%macro VP9_IDCT8_1D_FINALIZE 0 + SUMSUB_BA w, 3, 6, 5 ; m3=t0+t7, m6=t0-t7 + SUMSUB_BA w, 1, 2, 5 ; m1=t1+t6, m2=t1-t6 + SUMSUB_BA w, 7, 0, 5 ; m7=t2+t5, m0=t2-t5 + + UNSCRATCH 5, 8, blockq+ 0 + SCRATCH 2, 8, blockq+ 0 + + SUMSUB_BA w, 5, 4, 2 ; m5=t3+t4, m4=t3-t4 + SWAP 7, 6, 2 + SWAP 3, 5, 0 + +%if ARCH_X86_64 + SWAP 6, 8 +%endif +%endmacro + +; x86-32 +; - in: m0/m4 is in mem +; - out: m6 is in mem +; x86-64: +; - everything is in registers (m0-7) +%macro VP9_IDCT8_1D 0 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 4, 9 +%endif + + VP9_UNPACK_MULSUB_2W_4X 5, 3, 9102, 13623, D_8192_REG, 0, 4 ; m5=t5a, m3=t6a + VP9_UNPACK_MULSUB_2W_4X 1, 7, 16069, 3196, D_8192_REG, 0, 4 ; m1=t4a, m7=t7a + SUMSUB_BA w, 5, 1, 0 ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a) + SUMSUB_BA w, 3, 7, 0 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a) +%if cpuflag(ssse3) + SUMSUB_BA w, 1, 7, 0 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5) + pmulhrsw m1, W_11585x2_REG ; m1=t6 + pmulhrsw m7, W_11585x2_REG ; m7=t5 +%else + VP9_UNPACK_MULSUB_2W_4X 7, 1, 11585, 11585, D_8192_REG, 0, 4 +%endif + VP9_UNPACK_MULSUB_2W_4X 2, 6, 15137, 6270, D_8192_REG, 0, 4 ; m2=t2a, m6=t3a + + UNSCRATCH 0, 8, blockq+ 0 ; IN(0) + UNSCRATCH 4, 9, blockq+64 ; IN(4) + SCRATCH 5, 8, blockq+ 0 + +%if cpuflag(ssse3) + SUMSUB_BA w, 4, 0, 5 ; m4=IN(0)+IN(4) m0=IN(0)-IN(4) + pmulhrsw m4, W_11585x2_REG ; m4=t0a + pmulhrsw m0, W_11585x2_REG ; m0=t1a +%else + SCRATCH 7, 9, blockq+64 + VP9_UNPACK_MULSUB_2W_4X 0, 4, 11585, 11585, D_8192_REG, 5, 7 + UNSCRATCH 7, 9, blockq+64 +%endif + SUMSUB_BA w, 6, 4, 5 ; m6=t0a+t3a (t0), m4=t0a-t3a (t3) + SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2) + + VP9_IDCT8_1D_FINALIZE +%endmacro + +%macro VP9_IDCT8_4x4_1D 0 + pmulhrsw m0, W_11585x2_REG ; m0=t1a/t0a + pmulhrsw m6, m2, [pw_15137x2] ; m6=t3a + pmulhrsw m2, [pw_6270x2] ; m2=t2a + pmulhrsw m7, m1, [pw_16069x2] ; m7=t7a + pmulhrsw m1, [pw_3196x2] ; m1=t4a + pmulhrsw m5, m3, [pw_m9102x2] ; m5=t5a + pmulhrsw m3, [pw_13623x2] ; m3=t6a + SUMSUB_BA w, 5, 1, 4 ; m1=t4a+t5a (t4), m5=t4a-t5a (t5a) + SUMSUB_BA w, 3, 7, 4 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a) + SUMSUB_BA w, 1, 7, 4 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5) + pmulhrsw m1, W_11585x2_REG ; m1=t6 + pmulhrsw m7, W_11585x2_REG ; m7=t5 + psubw m4, m0, m6 ; m4=t0a-t3a (t3) + paddw m6, m0 ; m6=t0a+t3a (t0) + SCRATCH 5, 8, blockq+ 0 + SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2) + VP9_IDCT8_1D_FINALIZE +%endmacro + +%macro VP9_IDCT8_2x2_1D 1 + pmulhrsw m0, W_11585x2_REG ; m0=t0 + pmulhrsw m3, m1, W_16069x2_REG ; m3=t7 + pmulhrsw m1, W_3196x2_REG ; m1=t4 + psubw m7, m3, m1 ; t5 = t7a - t4a + paddw m5, m3, m1 ; t6 = t7a + t4a + pmulhrsw m7, W_11585x2_REG ; m7=t5 + pmulhrsw m5, W_11585x2_REG ; m5=t6 + SWAP 5, 1 + ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier + psubw m6, m0, m3 ; m6=t0-t7 + paddw m3, m0 ; m3=t0+t7 + psubw m2, m0, m1 ; m2=t1-t6 + paddw m1, m0 ; m1=t1+t6 +%if %1 == 1 + punpcklwd m3, m1 +%define SCRATCH_REG 1 +%elif ARCH_X86_32 + mova [blockq+ 0], m2 +%define SCRATCH_REG 2 +%else +%define SCRATCH_REG 8 +%endif + psubw m4, m0, m5 ; m4=t3-t4 + paddw m5, m0 ; m5=t3+t4 + SUMSUB_BA w, 7, 0, SCRATCH_REG ; m7=t2+t5, m0=t2-t5 + SWAP 7, 6, 2 + SWAP 3, 5, 0 +%undef SCRATCH_REG +%endmacro + +%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift +%if cpuflag(ssse3) + pmulhrsw m%1, %6 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5 + pmulhrsw m%2, %6 +%else + paddw m%1, %6 + paddw m%2, %6 + psraw m%1, %7 + psraw m%2, %7 +%endif +%if %0 <= 7 + VP9_STORE_2X %1, %2, %3, %4, %5 +%else + VP9_STORE_2X %1, %2, %3, %4, %5, %8 +%endif +%endmacro + +; x86-32: +; - m6 is in mem +; x86-64: +; - m8 holds m6 (SWAP) +; m6 holds zero +%macro VP9_IDCT8_WRITEOUT 0 +%if ARCH_X86_64 +%if cpuflag(ssse3) + mova m9, [pw_1024] +%else + mova m9, [pw_16] +%endif +%define ROUND_REG m9 +%else +%if cpuflag(ssse3) +%define ROUND_REG [pw_1024] +%else +%define ROUND_REG [pw_16] +%endif +%endif + SCRATCH 5, 10, blockq+16 + SCRATCH 7, 11, blockq+32 + VP9_IDCT8_WRITEx2 0, 1, 5, 7, 6, ROUND_REG + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 2, 3, 5, 7, 6, ROUND_REG + lea dstq, [dstq+2*strideq] + UNSCRATCH 5, 10, blockq+16 + UNSCRATCH 7, 11, blockq+32 + VP9_IDCT8_WRITEx2 4, 5, 0, 1, 6, ROUND_REG + lea dstq, [dstq+2*strideq] + UNSCRATCH 5, 8, blockq+ 0 + VP9_IDCT8_WRITEx2 5, 7, 0, 1, 6, ROUND_REG + +%undef ROUND_REG +%endmacro + +%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2 +INIT_XMM %1 +cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob + +%if cpuflag(ssse3) +%if ARCH_X86_64 + mova m12, [pw_11585x2] ; often used +%define W_11585x2_REG m12 +%else +%define W_11585x2_REG [pw_11585x2] +%endif + + cmp eobd, 12 ; top left half or less + jg .idctfull + + cmp eobd, 3 ; top left corner or less + jg .idcthalf + + cmp eobd, 1 ; faster path for when only DC is set + jne .idcttopleftcorner +%else + cmp eobd, 1 + jg .idctfull +%endif + +%if cpuflag(ssse3) + movd m0, [blockq] + pmulhrsw m0, W_11585x2_REG + pmulhrsw m0, W_11585x2_REG +%else + DEFINE_ARGS dst, stride, block, coef + movsx coefd, word [blockq] + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, (16 << 14) + 8192 + sar coefd, 14 + 5 + movd m0, coefd +%endif + SPLATW m0, m0, 0 + pxor m4, m4 + movd [blockq], m4 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_1024] ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5 +%endif +%rep 3 + VP9_STORE_2X 0, 0, 6, 7, 4 + lea dstq, [dstq+2*strideq] +%endrep + VP9_STORE_2X 0, 0, 6, 7, 4 + RET + +%if cpuflag(ssse3) +; faster path for when only left corner is set (3 input: DC, right to DC, below +; to DC). Note: also working with a 2x2 block +.idcttopleftcorner: + movd m0, [blockq+0] + movd m1, [blockq+16] +%if ARCH_X86_64 + mova m10, [pw_3196x2] + mova m11, [pw_16069x2] +%define W_3196x2_REG m10 +%define W_16069x2_REG m11 +%else +%define W_3196x2_REG [pw_3196x2] +%define W_16069x2_REG [pw_16069x2] +%endif + VP9_IDCT8_2x2_1D 1 + ; partial 2x8 transpose + ; punpcklwd m0, m1 already done inside idct + punpcklwd m2, m3 + punpcklwd m4, m5 + punpcklwd m6, m7 + punpckldq m0, m2 + punpckldq m4, m6 + SBUTTERFLY qdq, 0, 4, 1 + SWAP 1, 4 + VP9_IDCT8_2x2_1D 2 +%if ARCH_X86_64 + SWAP 6, 8 +%endif + pxor m6, m6 ; used for the block reset, and VP9_STORE_2X + VP9_IDCT8_WRITEOUT +%if ARCH_X86_64 + movd [blockq+ 0], m6 + movd [blockq+16], m6 +%else + mova [blockq+ 0], m6 + mova [blockq+16], m6 + mova [blockq+32], m6 +%endif + RET + +.idcthalf: + movh m0, [blockq + 0] + movh m1, [blockq +16] + movh m2, [blockq +32] + movh m3, [blockq +48] + VP9_IDCT8_4x4_1D + ; partial 4x8 transpose +%if ARCH_X86_32 + mova m6, [blockq+ 0] +%endif + punpcklwd m0, m1 + punpcklwd m2, m3 + punpcklwd m4, m5 + punpcklwd m6, m7 + SBUTTERFLY dq, 0, 2, 1 + SBUTTERFLY dq, 4, 6, 5 + SBUTTERFLY qdq, 0, 4, 1 + SBUTTERFLY qdq, 2, 6, 5 + SWAP 1, 4 + SWAP 3, 6 + VP9_IDCT8_4x4_1D +%if ARCH_X86_64 + SWAP 6, 8 +%endif + pxor m6, m6 + VP9_IDCT8_WRITEOUT +%if ARCH_X86_64 + movh [blockq+ 0], m6 + movh [blockq+16], m6 + movh [blockq+32], m6 +%else + mova [blockq+ 0], m6 + mova [blockq+16], m6 + mova [blockq+32], m6 +%endif + movh [blockq+48], m6 + RET +%endif + +.idctfull: ; generic full 8x8 idct/idct +%if ARCH_X86_64 + mova m0, [blockq+ 0] ; IN(0) +%endif + mova m1, [blockq+ 16] ; IN(1) + mova m2, [blockq+ 32] ; IN(2) + mova m3, [blockq+ 48] ; IN(3) +%if ARCH_X86_64 + mova m4, [blockq+ 64] ; IN(4) +%endif + mova m5, [blockq+ 80] ; IN(5) + mova m6, [blockq+ 96] ; IN(6) + mova m7, [blockq+112] ; IN(7) +%if ARCH_X86_64 + mova m11, [pd_8192] ; rounding +%define D_8192_REG m11 +%else +%define D_8192_REG [pd_8192] +%endif + VP9_IDCT8_1D +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1 + mova [blockq+0], m0 +%endif + VP9_IDCT8_1D + +%if ARCH_X86_64 + SWAP 6, 8 +%endif + pxor m6, m6 ; used for the block reset, and VP9_STORE_2X + VP9_IDCT8_WRITEOUT + ZERO_BLOCK blockq, 16, 8, m6 + RET +%undef W_11585x2_REG +%endmacro + +VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12 +VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13 +VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13 + +;--------------------------------------------------------------------------------------------- +; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +; x86-32: +; - in: m0/3/4/7 are in mem [blockq+N*16] +; - out: m6 is in mem [blockq+0] +; x86-64: +; - everything is in registers +%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 3, 9 + SWAP 4, 10 + SWAP 7, 11 +%endif + + VP9_UNPACK_MULSUB_2D_4X 5, 2, 0, 3, 14449, 7723 ; m5/2=t3[d], m2/4=t2[d] + VP9_UNPACK_MULSUB_2D_4X 1, 6, 4, 7, 4756, 15679 ; m1/4=t7[d], m6/7=t6[d] + SCRATCH 4, 12, blockq+1*16 + VP9_RND_SH_SUMSUB_BA 6, 2, 7, 3, 4, D_8192_REG ; m6=t2[w], m2=t6[w] + UNSCRATCH 4, 12, blockq+1*16 + VP9_RND_SH_SUMSUB_BA 1, 5, 4, 0, 3, D_8192_REG ; m1=t3[w], m5=t7[w] + + UNSCRATCH 0, 8, blockq+16*0 + UNSCRATCH 3, 9, blockq+16*3 + UNSCRATCH 4, 10, blockq+16*4 + UNSCRATCH 7, 11, blockq+16*7 + SCRATCH 1, 8, blockq+16*1 + SCRATCH 2, 9, blockq+16*2 + SCRATCH 5, 10, blockq+16*5 + SCRATCH 6, 11, blockq+16*6 + + VP9_UNPACK_MULSUB_2D_4X 7, 0, 1, 2, 16305, 1606 ; m7/1=t1[d], m0/2=t0[d] + VP9_UNPACK_MULSUB_2D_4X 3, 4, 5, 6, 10394, 12665 ; m3/5=t5[d], m4/6=t4[d] + SCRATCH 1, 12, blockq+ 0*16 + VP9_RND_SH_SUMSUB_BA 4, 0, 6, 2, 1, D_8192_REG ; m4=t0[w], m0=t4[w] + UNSCRATCH 1, 12, blockq+ 0*16 + VP9_RND_SH_SUMSUB_BA 3, 7, 5, 1, 2, D_8192_REG ; m3=t1[w], m7=t5[w] + + UNSCRATCH 2, 9, blockq+16*2 + UNSCRATCH 5, 10, blockq+16*5 + SCRATCH 3, 9, blockq+16*3 + SCRATCH 4, 10, blockq+16*4 + + ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7 + + VP9_UNPACK_MULSUB_2D_4X 0, 7, 1, 3, 15137, 6270 ; m0/1=t5[d], m7/3=t4[d] + VP9_UNPACK_MULSUB_2D_4X 5, 2, 4, 6, 6270, 15137 ; m5/4=t6[d], m2/6=t7[d] + SCRATCH 1, 12, blockq+ 0*16 + VP9_RND_SH_SUMSUB_BA 5, 7, 4, 3, 1, D_8192_REG + UNSCRATCH 1, 12, blockq+ 0*16 + PSIGNW m5, W_M1_REG ; m5=out1[w], m7=t6[w] + VP9_RND_SH_SUMSUB_BA 2, 0, 6, 1, 3, D_8192_REG ; m2=out6[w], m0=t7[w] + + UNSCRATCH 1, 8, blockq+16*1 + UNSCRATCH 3, 9, blockq+16*3 + UNSCRATCH 4, 10, blockq+16*4 + UNSCRATCH 6, 11, blockq+16*6 + SCRATCH 2, 8, blockq+16*0 + + SUMSUB_BA w, 6, 4, 2 ; m6=out0[w], m4=t2[w] + SUMSUB_BA w, 1, 3, 2 + PSIGNW m1, W_M1_REG ; m1=out7[w], m3=t3[w] + + ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7 + + ; unfortunately, the code below overflows in some cases +%if 0; cpuflag(ssse3) + SUMSUB_BA w, 3, 4, 2 + SUMSUB_BA w, 0, 7, 2 + pmulhrsw m3, W_11585x2_REG + pmulhrsw m7, W_11585x2_REG + pmulhrsw m4, W_11585x2_REG ; out4 + pmulhrsw m0, W_11585x2_REG ; out2 +%else + SCRATCH 5, 9, blockq+16*1 + VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, D_8192_REG, 2, 5 + VP9_UNPACK_MULSUB_2W_4X 7, 0, 11585, 11585, D_8192_REG, 2, 5 + UNSCRATCH 5, 9, blockq+16*1 +%endif + PSIGNW m3, W_M1_REG ; out3 + PSIGNW m7, W_M1_REG ; out5 + + ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7 + +%if ARCH_X86_64 + SWAP 2, 8 +%endif + SWAP 0, 6, 2 + SWAP 7, 1, 5 +%endmacro + +%macro IADST8_FN 6 +INIT_XMM %5 +cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob + +%ifidn %1, idct +%define first_is_idct 1 +%else +%define first_is_idct 0 +%endif + +%ifidn %3, idct +%define second_is_idct 1 +%else +%define second_is_idct 0 +%endif + +%if ARCH_X86_64 + mova m0, [blockq+ 0] ; IN(0) +%endif + mova m1, [blockq+ 16] ; IN(1) + mova m2, [blockq+ 32] ; IN(2) +%if ARCH_X86_64 || first_is_idct + mova m3, [blockq+ 48] ; IN(3) +%endif +%if ARCH_X86_64 + mova m4, [blockq+ 64] ; IN(4) +%endif + mova m5, [blockq+ 80] ; IN(5) + mova m6, [blockq+ 96] ; IN(6) +%if ARCH_X86_64 || first_is_idct + mova m7, [blockq+112] ; IN(7) +%endif +%if ARCH_X86_64 +%if cpuflag(ssse3) + mova m15, [pw_11585x2] ; often used +%endif + mova m13, [pd_8192] ; rounding + mova m14, [pw_m1] +%define W_11585x2_REG m15 +%define D_8192_REG m13 +%define W_M1_REG m14 +%else +%define W_11585x2_REG [pw_11585x2] +%define D_8192_REG [pd_8192] +%define W_M1_REG [pw_m1] +%endif + + ; note different calling conventions for idct8 vs. iadst8 on x86-32 + VP9_%2_1D +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1 + mova [blockq+ 0], m0 +%if second_is_idct == 0 + mova [blockq+ 48], m3 + mova [blockq+112], m7 +%endif +%endif + VP9_%4_1D + +%if ARCH_X86_64 + SWAP 6, 8 +%endif + pxor m6, m6 ; used for the block reset, and VP9_STORE_2X + VP9_IDCT8_WRITEOUT + ZERO_BLOCK blockq, 16, 8, m6 + RET + +%undef W_11585x2_REG +%undef first_is_idct +%undef second_is_idct + +%endmacro + +IADST8_FN idct, IDCT8, iadst, IADST8, sse2, 15 +IADST8_FN iadst, IADST8, idct, IDCT8, sse2, 15 +IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15 +IADST8_FN idct, IDCT8, iadst, IADST8, ssse3, 16 +IADST8_FN idct, IDCT8, iadst, IADST8, avx, 16 +IADST8_FN iadst, IADST8, idct, IDCT8, ssse3, 16 +IADST8_FN iadst, IADST8, idct, IDCT8, avx, 16 +IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16 +IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16 + +;--------------------------------------------------------------------------------------------- +; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +; x86-64: +; at the end of this macro, m7 is stored in [%4+15*%5] +; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15 +; the following sumsubs have not been done yet: +; SUMSUB_BA w, 6, 9, 15 ; t6, t9 +; SUMSUB_BA w, 7, 8, 15 ; t7, t8 +; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1, +; and the following simsubs have not been done yet: +; SUMSUB_BA w, x13, x14, 7 ; t6, t9 +; SUMSUB_BA w, x15, x12, 7 ; t7, t8 + +%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst +%if %2 <= 4 + mova m3, [%1+ 1*%3] ; IN(1) + mova m0, [%1+ 3*%3] ; IN(3) + + pmulhrsw m4, m3, [pw_16305x2] ; t14-15 + pmulhrsw m3, [pw_1606x2] ; t8-9 + pmulhrsw m7, m0, [pw_m4756x2] ; t10-11 + pmulhrsw m0, [pw_15679x2] ; t12-13 + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7 + ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15 + + VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 1, 6 ; t9, t14 + SCRATCH 4, 10, %4+ 1*%5 + SCRATCH 5, 11, %4+ 7*%5 + VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13 + UNSCRATCH 5, 11, %4+ 7*%5 + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15 +%else + mova m5, [%1+ 1*%3] ; IN(1) + mova m4, [%1+ 7*%3] ; IN(7) +%if %2 <= 8 + pmulhrsw m2, m5, [pw_16305x2] ; t15 + pmulhrsw m5, [pw_1606x2] ; t8 + pmulhrsw m3, m4, [pw_m10394x2] ; t9 + pmulhrsw m4, [pw_12665x2] ; t14 +%else + mova m3, [%1+ 9*%3] ; IN(9) + mova m2, [%1+15*%3] ; IN(15) + + ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7 + ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15 + + VP9_UNPACK_MULSUB_2W_4X 5, 2, 16305, 1606, [pd_8192], 0, 1 ; t8, t15 + VP9_UNPACK_MULSUB_2W_4X 3, 4, 10394, 12665, [pd_8192], 0, 1 ; t9, t14 +%endif + + SUMSUB_BA w, 3, 5, 0 ; t8, t9 + SUMSUB_BA w, 4, 2, 0 ; t15, t14 + + VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 0, 1 ; t9, t14 + + SCRATCH 4, 10, %4+ 1*%5 + SCRATCH 5, 11, %4+ 7*%5 + + mova m6, [%1+ 3*%3] ; IN(3) + mova m7, [%1+ 5*%3] ; IN(5) +%if %2 <= 8 + pmulhrsw m0, m7, [pw_14449x2] ; t13 + pmulhrsw m7, [pw_7723x2] ; t10 + pmulhrsw m1, m6, [pw_m4756x2] ; t11 + pmulhrsw m6, [pw_15679x2] ; t12 +%else + mova m0, [%1+11*%3] ; IN(11) + mova m1, [%1+13*%3] ; IN(13) + + VP9_UNPACK_MULSUB_2W_4X 7, 0, 14449, 7723, [pd_8192], 4, 5 ; t10, t13 + VP9_UNPACK_MULSUB_2W_4X 1, 6, 4756, 15679, [pd_8192], 4, 5 ; t11, t12 +%endif + + ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7 + ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15 + + SUMSUB_BA w, 7, 1, 4 ; t11, t10 + SUMSUB_BA w, 0, 6, 4 ; t12, t13 + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7 + ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15 + + VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13 + + UNSCRATCH 5, 11, %4+ 7*%5 +%endif + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7 + ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15 + + SUMSUB_BA w, 7, 3, 4 ; t8, t11 + + ; backup first register + mova [%4+15*%5], m7 + + SUMSUB_BA w, 6, 2, 7 ; t9, t10 + UNSCRATCH 4, 10, %4+ 1*%5 + SUMSUB_BA w, 0, 4, 7 ; t15, t12 + SUMSUB_BA w, 1, 5, 7 ; t14. t13 + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15 + +%if cpuflag(ssse3) && %6 == 0 + SUMSUB_BA w, 2, 5, 7 + SUMSUB_BA w, 3, 4, 7 + pmulhrsw m5, [pw_11585x2] ; t10 + pmulhrsw m4, [pw_11585x2] ; t11 + pmulhrsw m3, [pw_11585x2] ; t12 + pmulhrsw m2, [pw_11585x2] ; t13 +%else + SCRATCH 6, 10, %4+ 1*%5 + VP9_UNPACK_MULSUB_2W_4X 5, 2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13 + VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12 + UNSCRATCH 6, 10, %4+ 1*%5 +%endif + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15 + + SCRATCH 0, 8, %4+ 1*%5 + SCRATCH 1, 9, %4+ 3*%5 + SCRATCH 2, 10, %4+ 5*%5 + SCRATCH 3, 11, %4+ 7*%5 + SCRATCH 4, 12, %4+ 9*%5 + SCRATCH 5, 13, %4+11*%5 + SCRATCH 6, 14, %4+13*%5 + + ; even (tx8x8) +%if %2 <= 4 + mova m3, [%1+ 0*%3] ; IN(0) + mova m4, [%1+ 2*%3] ; IN(2) + + pmulhrsw m3, [pw_11585x2] ; t0-t3 + pmulhrsw m7, m4, [pw_16069x2] ; t6-7 + pmulhrsw m4, [pw_3196x2] ; t4-5 + +%if 0 ; overflows :( + paddw m6, m7, m4 + psubw m5, m7, m4 + pmulhrsw m5, [pw_11585x2] ; t5 + pmulhrsw m6, [pw_11585x2] ; t6 +%else + VP9_UNPACK_MULSUB_2W_4X 5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5, t6 +%endif + + psubw m0, m3, m7 + paddw m7, m3 + psubw m1, m3, m6 + paddw m6, m3 + psubw m2, m3, m5 + paddw m5, m3 + +%if ARCH_X86_32 + SWAP 0, 7 +%endif + SCRATCH 7, 15, %4+12*%5 +%else + mova m6, [%1+ 2*%3] ; IN(2) + mova m1, [%1+ 4*%3] ; IN(4) + mova m7, [%1+ 6*%3] ; IN(6) +%if %2 <= 8 + pmulhrsw m0, m1, [pw_15137x2] ; t3 + pmulhrsw m1, [pw_6270x2] ; t2 + pmulhrsw m5, m6, [pw_16069x2] ; t7 + pmulhrsw m6, [pw_3196x2] ; t4 + pmulhrsw m4, m7, [pw_m9102x2] ; t5 + pmulhrsw m7, [pw_13623x2] ; t6 +%else + mova m4, [%1+10*%3] ; IN(10) + mova m0, [%1+12*%3] ; IN(12) + mova m5, [%1+14*%3] ; IN(14) + + VP9_UNPACK_MULSUB_2W_4X 1, 0, 15137, 6270, [pd_8192], 2, 3 ; t2, t3 + VP9_UNPACK_MULSUB_2W_4X 6, 5, 16069, 3196, [pd_8192], 2, 3 ; t4, t7 + VP9_UNPACK_MULSUB_2W_4X 4, 7, 9102, 13623, [pd_8192], 2, 3 ; t5, t6 +%endif + + SUMSUB_BA w, 4, 6, 2 ; t4, t5 + SUMSUB_BA w, 7, 5, 2 ; t7, t6 + +%if cpuflag(ssse3) && %6 == 0 + SUMSUB_BA w, 6, 5, 2 + pmulhrsw m5, [pw_11585x2] ; t5 + pmulhrsw m6, [pw_11585x2] ; t6 +%else + VP9_UNPACK_MULSUB_2W_4X 5, 6, 11585, 11585, [pd_8192], 2, 3 ; t5, t6 +%endif + + SCRATCH 5, 15, %4+10*%5 + mova m2, [%1+ 0*%3] ; IN(0) +%if %2 <= 8 + pmulhrsw m2, [pw_11585x2] ; t0 and t1 + psubw m3, m2, m0 + paddw m0, m2 + + SUMSUB_BA w, 7, 0, 5 ; t0, t7 +%else + mova m3, [%1+ 8*%3] ; IN(8) + + ; from 3 stages back +%if cpuflag(ssse3) && %6 == 0 + SUMSUB_BA w, 3, 2, 5 + pmulhrsw m3, [pw_11585x2] ; t0 + pmulhrsw m2, [pw_11585x2] ; t1 +%else + mova [%1+ 0*%3], m0 + VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 0 ; t0, t1 + mova m0, [%1+ 0*%3] +%endif + + ; from 2 stages back + SUMSUB_BA w, 0, 3, 5 ; t0, t3 + + SUMSUB_BA w, 7, 0, 5 ; t0, t7 +%endif + UNSCRATCH 5, 15, %4+10*%5 +%if ARCH_X86_32 + SWAP 0, 7 +%endif + SCRATCH 7, 15, %4+12*%5 + SUMSUB_BA w, 1, 2, 7 ; t1, t2 + + ; from 1 stage back + SUMSUB_BA w, 6, 1, 7 ; t1, t6 + SUMSUB_BA w, 5, 2, 7 ; t2, t5 +%endif + SUMSUB_BA w, 4, 3, 7 ; t3, t4 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 + SWAP 5, 13 + SWAP 6, 14 + + SUMSUB_BA w, 0, 15, 7 ; t0, t15 + SUMSUB_BA w, 1, 14, 7 ; t1, t14 + SUMSUB_BA w, 2, 13, 7 ; t2, t13 + SUMSUB_BA w, 3, 12, 7 ; t3, t12 + SUMSUB_BA w, 4, 11, 7 ; t4, t11 + SUMSUB_BA w, 5, 10, 7 ; t5, t10 +%else + SWAP 1, 6 + SWAP 2, 5 + SWAP 3, 4 + mova [%4+14*%5], m6 + +%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride + mova m6, [%4+%2*%5] + SUMSUB_BA w, 6, %1, 7 + SWAP %1, 6 + mova [%4+%3*%5], m6 +%endmacro + + %%SUMSUB_BA_STORE 0, 1, 1, %4, %5 ; t0, t15 + %%SUMSUB_BA_STORE 1, 3, 3, %4, %5 ; t1, t14 + %%SUMSUB_BA_STORE 2, 5, 5, %4, %5 ; t2, t13 + %%SUMSUB_BA_STORE 3, 7, 7, %4, %5 ; t3, t12 + %%SUMSUB_BA_STORE 4, 9, 9, %4, %5 ; t4, t11 + %%SUMSUB_BA_STORE 5, 11, 11, %4, %5 ; t5, t10 +%endif +%endmacro + +%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst +%if %2 == 1 + VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4 + +%if ARCH_X86_64 + ; backup a different register + mova m7, [tmpq+15*16] + mova [tmpq+ 1*16], m15 + + SUMSUB_BA w, 6, 9, 15 ; t6, t9 + SUMSUB_BA w, 7, 8, 15 ; t7, t8 + + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15 + mova [tmpq+ 0], m0 + mova [tmpq+ 32], m1 + mova [tmpq+ 64], m2 + mova [tmpq+ 96], m3 + mova [tmpq+128], m4 + mova [tmpq+160], m5 + mova [tmpq+192], m6 + mova [tmpq+224], m7 + + mova m15, [tmpq+ 1*16] + TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 + mova [tmpq+ 16], m8 + mova [tmpq+ 48], m9 + mova [tmpq+ 80], m10 + mova [tmpq+112], m11 + mova [tmpq+144], m12 + mova [tmpq+176], m13 + mova [tmpq+208], m14 + mova [tmpq+240], m15 +%else + mova m6, [tmpq+13*16] + mova m7, [tmpq+14*16] + SUMSUB_BA w, 6, 7 ; t6, t9 + mova [tmpq+14*16], m6 + mova [tmpq+13*16], m7 + mova m7, [tmpq+15*16] + mova m6, [tmpq+12*16] + SUMSUB_BA w, 7, 6 ; t7, t8 + mova [tmpq+15*16], m6 + + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1 + mova [tmpq+ 0*16], m0 + mova [tmpq+ 2*16], m1 + mova [tmpq+ 4*16], m2 + mova [tmpq+ 6*16], m3 + mova [tmpq+10*16], m5 + mova [tmpq+12*16], m6 + mova [tmpq+14*16], m7 + + mova m0, [tmpq+15*16] + mova m1, [tmpq+13*16] + mova m2, [tmpq+11*16] + mova m3, [tmpq+ 9*16] + mova m4, [tmpq+ 7*16] + mova m5, [tmpq+ 5*16] + mova m7, [tmpq+ 1*16] + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1 + mova [tmpq+ 1*16], m0 + mova [tmpq+ 3*16], m1 + mova [tmpq+ 5*16], m2 + mova [tmpq+ 7*16], m3 + mova [tmpq+11*16], m5 + mova [tmpq+13*16], m6 + mova [tmpq+15*16], m7 +%endif +%else ; %2 == 2 + VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4 + +%if cpuflag(ssse3) +%define ROUND_REG [pw_512] +%else +%define ROUND_REG [pw_32] +%endif + + pxor m7, m7 +%if ARCH_X86_64 + ; backup more registers + mova [%1+ 2*32], m8 + mova [%1+ 3*32], m9 + + VP9_IDCT8_WRITEx2 0, 1, 8, 9, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 2, 3, 8, 9, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 4, 5, 8, 9, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + ; restore from cache + SWAP 0, 7 ; move zero from m7 to m0 + mova m7, [%1+15*32] + mova m8, [%1+ 2*32] + mova m9, [%1+ 3*32] + + SUMSUB_BA w, 6, 9, 3 ; t6, t9 + SUMSUB_BA w, 7, 8, 3 ; t7, t8 + + VP9_IDCT8_WRITEx2 6, 7, 3, 4, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 8, 9, 3, 4, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 10, 11, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 12, 13, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 14, 15, 1, 2, 0, ROUND_REG, 6 +%else + mova [tmpq+ 0*32], m5 + + VP9_IDCT8_WRITEx2 0, 1, 5, 6, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 2, 3, 5, 6, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + SWAP 0, 7 ; move zero from m7 to m0 + mova m5, [tmpq+ 0*32] + + VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m4, [tmpq+13*32] + mova m7, [tmpq+14*32] + mova m5, [tmpq+15*32] + mova m6, [tmpq+12*32] + SUMSUB_BADC w, 4, 7, 5, 6, 1 + + VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m4, [tmpq+11*32] + mova m5, [tmpq+ 9*32] + mova m6, [tmpq+ 7*32] + mova m7, [tmpq+ 5*32] + + VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m4, [tmpq+ 3*32] + mova m5, [tmpq+ 1*32] + + VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] +%endif + +%undef ROUND_REG +%endif ; %2 == 1/2 +%endmacro + +%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride + mova m%3, [dstq] + mova m%5, [dstq+%7] + punpcklbw m%2, m%3, m%6 + punpckhbw m%3, m%6 + punpcklbw m%4, m%5, m%6 + punpckhbw m%5, m%6 + paddw m%2, m%1 + paddw m%3, m%1 + paddw m%4, m%1 + paddw m%5, m%1 + packuswb m%2, m%3 + packuswb m%4, m%5 + mova [dstq], m%2 + mova [dstq+%7], m%4 +%endmacro + +%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1 +INIT_XMM %1 +cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob +%if cpuflag(ssse3) + ; 2x2=eob=3, 4x4=eob=10 + cmp eobd, 38 + jg .idctfull + cmp eobd, 1 ; faster path for when only DC is set + jne .idct8x8 +%else + cmp eobd, 1 ; faster path for when only DC is set + jg .idctfull +%endif + + ; dc-only +%if cpuflag(ssse3) + movd m0, [blockq] + mova m1, [pw_11585x2] + pmulhrsw m0, m1 + pmulhrsw m0, m1 +%else + DEFINE_ARGS dst, stride, block, coef + movsx coefd, word [blockq] + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, (32 << 14) + 8192 + sar coefd, 14 + 6 + movd m0, coefd +%endif + SPLATW m0, m0, q0000 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_512] +%endif + pxor m5, m5 + movd [blockq], m5 +%rep 7 + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 + lea dstq, [dstq+2*strideq] +%endrep + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 + RET + + DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp +%if cpuflag(ssse3) +.idct8x8: + mov tmpq, rsp + VP9_IDCT16_1D blockq, 1, 8, 0 + + mov cntd, 2 + mov dst_bakq, dstq +.loop2_8x8: + VP9_IDCT16_1D tmpq, 2, 8, 0 + lea dstq, [dst_bakq+8] + add tmpq, 16 + dec cntd + jg .loop2_8x8 + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 32, 8, m0 + RET +%endif + +.idctfull: + mov cntd, 2 + mov tmpq, rsp +.loop1_full: + VP9_IDCT16_1D blockq, 1, 16, 0 + add blockq, 16 + add tmpq, 256 + dec cntd + jg .loop1_full + sub blockq, 32 + + mov cntd, 2 + mov tmpq, rsp + mov dst_bakq, dstq +.loop2_full: + VP9_IDCT16_1D tmpq, 2, 16, 0 + lea dstq, [dst_bakq+8] + add tmpq, 16 + dec cntd + jg .loop2_full + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endmacro + +VP9_IDCT_IDCT_16x16_ADD_XMM sse2 +VP9_IDCT_IDCT_16x16_ADD_XMM ssse3 +VP9_IDCT_IDCT_16x16_ADD_XMM avx + +%macro VP9_IDCT16_YMM_1D 0 + VP9_UNPACK_MULSUB_2W_4X 1, 15, 16305, 1606, [pd_8192], 0, 4 ; t8, t15 + VP9_UNPACK_MULSUB_2W_4X 9, 7, 10394, 12665, [pd_8192], 0, 4 ; t9, t14 + + SUMSUB_BA w, 9, 1, 0 ; t8, t9 + SUMSUB_BA w, 7, 15, 0 ; t15, t14 + + VP9_UNPACK_MULSUB_2W_4X 15, 1, 15137, 6270, [pd_8192], 0, 4 ; t9, t14 + + VP9_UNPACK_MULSUB_2W_4X 5, 11, 14449, 7723, [pd_8192], 0, 4 ; t10, t13 + VP9_UNPACK_MULSUB_2W_4X 13, 3, 4756, 15679, [pd_8192], 0, 4 ; t11, t12 + + SUMSUB_BA w, 5, 13, 0 ; t11, t10 + SUMSUB_BA w, 11, 3, 0 ; t12, t13 + + VP9_UNPACK_MULSUB_2W_4X 3, 13, 6270, m15137, [pd_8192], 0, 4 ; t10, t13 + + SUMSUB_BA w, 5, 9, 0 ; t8, t11 + SUMSUB_BA w, 3, 15, 0 ; t9, t10 + SUMSUB_BA w, 11, 7, 0 ; t15, t12 + SUMSUB_BA w, 13, 1, 0 ; t14, t13 + + SUMSUB_BA w, 15, 1, 0 + SUMSUB_BA w, 9, 7, 0 + pmulhrsw m1, [pw_11585x2] ; t10 + pmulhrsw m7, [pw_11585x2] ; t11 + pmulhrsw m9, [pw_11585x2] ; t12 + pmulhrsw m15, [pw_11585x2] ; t13 + + ; even (tx8x8) + mova m4, [blockq+128] + mova [blockq+128], m5 + VP9_UNPACK_MULSUB_2W_4X 4, 12, 15137, 6270, [pd_8192], 0, 5 ; t2, t3 + VP9_UNPACK_MULSUB_2W_4X 2, 14, 16069, 3196, [pd_8192], 0, 5 ; t4, t7 + VP9_UNPACK_MULSUB_2W_4X 10, 6, 9102, 13623, [pd_8192], 0, 5 ; t5, t6 + mova m0, [blockq+ 0] + SUMSUB_BA w, 8, 0, 5 + pmulhrsw m8, [pw_11585x2] ; t0 + pmulhrsw m0, [pw_11585x2] ; t1 + + SUMSUB_BA w, 10, 2, 5 ; t4, t5 + SUMSUB_BA w, 6, 14, 5 ; t7, t6 + SUMSUB_BA w, 12, 8, 5 ; t0, t3 + SUMSUB_BA w, 4, 0, 5 ; t1, t2 + + SUMSUB_BA w, 2, 14, 5 + pmulhrsw m14, [pw_11585x2] ; t5 + pmulhrsw m2, [pw_11585x2] ; t6 + + SUMSUB_BA w, 6, 12, 5 ; t0, t7 + SUMSUB_BA w, 2, 4, 5 ; t1, t6 + SUMSUB_BA w, 14, 0, 5 ; t2, t5 + SUMSUB_BA w, 10, 8, 5 ; t3, t4 + + ; final stage + SUMSUB_BA w, 11, 6, 5 ; out0, out15 + SUMSUB_BA w, 13, 2, 5 ; out1, out14 + SUMSUB_BA w, 15, 14, 5 ; out2, out13 + SUMSUB_BA w, 9, 10, 5 ; out3, out12 + SUMSUB_BA w, 7, 8, 5 ; out4, out11 + SUMSUB_BA w, 1, 0, 5 ; out5, out10 + SUMSUB_BA w, 3, 4, 5 ; out6, out9 + mova m5, [blockq+128] + mova [blockq+192], m3 + SUMSUB_BA w, 5, 12, 3 ; out7, out8 + + SWAP 0, 11, 8, 12, 10 + SWAP 1, 13, 14, 2, 15, 6, 3, 9, 4, 7, 5 +%endmacro + +; this is almost identical to VP9_STORE_2X, but it does two rows +; for slightly improved interleaving, and it omits vpermq since the +; input is DC so all values are identical +%macro VP9_STORE_YMM_DC_4X 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero + mova xm%2, [dstq] + mova xm%4, [dstq+strideq*2] + vinserti128 m%2, m%2, [dstq+strideq], 1 + vinserti128 m%4, m%4, [dstq+stride3q], 1 + punpckhbw m%3, m%2, m%6 + punpcklbw m%2, m%6 + punpckhbw m%5, m%4, m%6 + punpcklbw m%4, m%6 + paddw m%3, m%1 + paddw m%2, m%1 + paddw m%5, m%1 + paddw m%4, m%1 + packuswb m%2, m%3 + packuswb m%4, m%5 + mova [dstq], xm%2 + mova [dstq+strideq*2], xm%4 + vextracti128 [dstq+strideq], m%2, 1 + vextracti128 [dstq+stride3q], m%4, 1 +%endmacro + +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob + cmp eobd, 1 ; faster path for when only DC is set + jg .idctfull + + ; dc-only + mova m1, [pw_11585x2] + vpbroadcastw m0, [blockq] + pmulhrsw m0, m1 + pmulhrsw m0, m1 + pxor m5, m5 + pmulhrsw m0, [pw_512] + movd [blockq], xm5 + + DEFINE_ARGS dst, stride, stride3, cnt + mov cntd, 4 + lea stride3q, [strideq*3] +.loop_dc: + VP9_STORE_YMM_DC_4X 0, 1, 2, 3, 4, 5 + lea dstq, [dstq+4*strideq] + dec cntd + jg .loop_dc + RET + + DEFINE_ARGS dst, stride, block, eob +.idctfull: + mova m1, [blockq+ 32] + mova m2, [blockq+ 64] + mova m3, [blockq+ 96] + mova m5, [blockq+160] + mova m6, [blockq+192] + mova m7, [blockq+224] + mova m8, [blockq+256] + mova m9, [blockq+288] + mova m10, [blockq+320] + mova m11, [blockq+352] + mova m12, [blockq+384] + mova m13, [blockq+416] + mova m14, [blockq+448] + mova m15, [blockq+480] + + VP9_IDCT16_YMM_1D + TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + [blockq+192], [blockq+128], 1 + mova [blockq+ 0], m0 + VP9_IDCT16_YMM_1D + + mova [blockq+224], m7 + + ; store + VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + mova m6, [blockq+192] + mova m7, [blockq+224] + VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + pxor m0, m0 + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endif + +;--------------------------------------------------------------------------------------------- +; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +%macro VP9_IADST16_1D 2 ; src, pass +%assign %%str 16*%2 + mova m0, [%1+ 0*32] ; in0 + mova m1, [%1+15*32] ; in15 + mova m2, [%1+ 7*32] ; in7 + mova m3, [%1+ 8*32] ; in8 + + VP9_UNPACK_MULSUB_2D_4X 1, 0, 4, 5, 16364, 804 ; m1/4=t1[d], m0/5=t0[d] + VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 6, 11003, 12140 ; m2/7=t9[d], m3/6=t8[d] + SCRATCH 4, 8, tmpq+ 0*%%str + VP9_RND_SH_SUMSUB_BA 3, 0, 6, 5, 4, [pd_8192] ; m3=t0[w], m0=t8[w] + UNSCRATCH 4, 8, tmpq+ 0*%%str + VP9_RND_SH_SUMSUB_BA 2, 1, 7, 4, 5, [pd_8192] ; m2=t1[w], m1=t9[w] + + SCRATCH 0, 10, tmpq+ 0*%%str + SCRATCH 1, 11, tmpq+15*%%str + mova [tmpq+ 7*%%str], m2 + mova [tmpq+ 8*%%str], m3 + + mova m1, [%1+ 2*32] ; in2 + mova m0, [%1+13*32] ; in13 + mova m3, [%1+ 5*32] ; in5 + mova m2, [%1+10*32] ; in10 + + VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 15893, 3981 ; m0/6=t3[d], m1/7=t2[d] + VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d] + SCRATCH 4, 12, tmpq+ 2*%%str + VP9_RND_SH_SUMSUB_BA 2, 1, 5, 7, 4, [pd_8192] ; m2=t2[w], m1=t10[w] + UNSCRATCH 4, 12, tmpq+ 2*%%str + VP9_RND_SH_SUMSUB_BA 3, 0, 4, 6, 5, [pd_8192] ; m3=t3[w], m0=t11[w] + + SCRATCH 0, 12, tmpq+ 2*%%str + SCRATCH 1, 13, tmpq+13*%%str + mova [tmpq+ 5*%%str], m2 + mova [tmpq+10*%%str], m3 + + mova m2, [%1+ 4*32] ; in4 + mova m3, [%1+11*32] ; in11 + mova m0, [%1+ 3*32] ; in3 + mova m1, [%1+12*32] ; in12 + + VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 14811, 7005 ; m3/7=t5[d], m2/6=t4[d] + VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 5520, 15426 ; m0/4=t13[d], m1/5=t12[d] + SCRATCH 4, 9, tmpq+ 4*%%str + VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t4[w], m2=t12[w] + UNSCRATCH 4, 9, tmpq+ 4*%%str + VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t5[w], m3=t13[w] + + SCRATCH 0, 8, tmpq+ 4*%%str + mova [tmpq+11*%%str], m1 ; t4:m1->r11 + UNSCRATCH 0, 10, tmpq+ 0*%%str + UNSCRATCH 1, 11, tmpq+15*%%str + + ; round 2 interleaved part 1 + VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d] + VP9_UNPACK_MULSUB_2D_4X 3, 2, 5, 4, 3196, 16069 ; m3/5=t12[d], m2/4=t13[d] + SCRATCH 4, 9, tmpq+ 3*%%str + VP9_RND_SH_SUMSUB_BA 3, 1, 5, 7, 4, [pd_8192] ; m3=t8[w], m1=t12[w] + UNSCRATCH 4, 9, tmpq+ 3*%%str + VP9_RND_SH_SUMSUB_BA 2, 0, 4, 6, 5, [pd_8192] ; m2=t9[w], m0=t13[w] + + SCRATCH 0, 10, tmpq+ 0*%%str + SCRATCH 1, 11, tmpq+15*%%str + SCRATCH 2, 14, tmpq+ 3*%%str + SCRATCH 3, 15, tmpq+12*%%str + + mova m2, [%1+ 6*32] ; in6 + mova m3, [%1+ 9*32] ; in9 + mova m0, [%1+ 1*32] ; in1 + mova m1, [%1+14*32] ; in14 + + VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d] + VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 2404, 16207 ; m0/4=t15[d], m1/5=t14[d] + SCRATCH 4, 9, tmpq+ 6*%%str + VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t6[w], m2=t14[w] + UNSCRATCH 4, 9, tmpq+ 6*%%str + VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t7[w], m3=t15[w] + + ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7 + ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15 + + UNSCRATCH 4, 12, tmpq+ 2*%%str + UNSCRATCH 5, 13, tmpq+13*%%str + SCRATCH 0, 12, tmpq+ 1*%%str + SCRATCH 1, 13, tmpq+14*%%str + + ; remainder of round 2 (rest of t8-15) + VP9_UNPACK_MULSUB_2D_4X 5, 4, 6, 7, 9102, 13623 ; m5/6=t11[d], m4/7=t10[d] + VP9_UNPACK_MULSUB_2D_4X 3, 2, 1, 0, 13623, 9102 ; m3/1=t14[d], m2/0=t15[d] + SCRATCH 0, 9, tmpq+ 6*%%str + VP9_RND_SH_SUMSUB_BA 3, 4, 1, 7, 0, [pd_8192] ; m3=t10[w], m4=t14[w] + UNSCRATCH 0, 9, tmpq+ 6*%%str + VP9_RND_SH_SUMSUB_BA 2, 5, 0, 6, 1, [pd_8192] ; m2=t11[w], m5=t15[w] + + ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15 + + UNSCRATCH 6, 14, tmpq+ 3*%%str + UNSCRATCH 7, 15, tmpq+12*%%str + + SUMSUB_BA w, 3, 7, 1 + PSIGNW m3, [pw_m1] ; m3=out1[w], m7=t10[w] + SUMSUB_BA w, 2, 6, 1 ; m2=out14[w], m6=t11[w] + + ; unfortunately, the code below overflows in some cases, e.g. + ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm +%if 0; cpuflag(ssse3) + SUMSUB_BA w, 7, 6, 1 + pmulhrsw m7, [pw_11585x2] ; m7=out6[w] + pmulhrsw m6, [pw_11585x2] ; m6=out9[w] +%else + VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, 11585, [pd_8192], 1, 0 +%endif + + mova [tmpq+ 3*%%str], m6 + mova [tmpq+ 6*%%str], m7 + UNSCRATCH 6, 10, tmpq+ 0*%%str + UNSCRATCH 7, 11, tmpq+15*%%str + mova [tmpq+13*%%str], m2 + SCRATCH 3, 11, tmpq+ 9*%%str + + VP9_UNPACK_MULSUB_2D_4X 7, 6, 2, 3, 15137, 6270 ; m6/3=t13[d], m7/2=t12[d] + VP9_UNPACK_MULSUB_2D_4X 5, 4, 1, 0, 6270, 15137 ; m5/1=t14[d], m4/0=t15[d] + SCRATCH 0, 9, tmpq+ 2*%%str + VP9_RND_SH_SUMSUB_BA 5, 6, 1, 3, 0, [pd_8192] ; m5=out2[w], m6=t14[w] + UNSCRATCH 0, 9, tmpq+ 2*%%str + VP9_RND_SH_SUMSUB_BA 4, 7, 0, 2, 1, [pd_8192] + PSIGNW m4, [pw_m1] ; m4=out13[w], m7=t15[w] + + ; unfortunately, the code below overflows in some cases +%if 0; cpuflag(ssse3) + SUMSUB_BA w, 7, 6, 1 + pmulhrsw m7, [pw_m11585x2] ; m7=out5[w] + pmulhrsw m6, [pw_11585x2] ; m6=out10[w] +%else + PSIGNW m7, [pw_m1] + VP9_UNPACK_MULSUB_2W_4X 7, 6, 11585, 11585, [pd_8192], 1, 0 +%endif + + ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14 + + mova m2, [tmpq+ 8*%%str] + mova m3, [tmpq+ 7*%%str] + mova m1, [tmpq+11*%%str] + mova [tmpq+ 7*%%str], m6 + mova [tmpq+11*%%str], m4 + mova m4, [tmpq+ 5*%%str] + SCRATCH 5, 14, tmpq+ 5*%%str + SCRATCH 7, 15, tmpq+ 8*%%str + UNSCRATCH 6, 8, tmpq+ 4*%%str + UNSCRATCH 5, 12, tmpq+ 1*%%str + UNSCRATCH 7, 13, tmpq+14*%%str + + ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7 + ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14 + + SUMSUB_BA w, 1, 2, 0 ; m1=t0[w], m2=t4[w] + mova m0, [tmpq+10*%%str] + SCRATCH 1, 12, tmpq+ 1*%%str + SUMSUB_BA w, 6, 3, 1 ; m8=t1[w], m3=t5[w] + SCRATCH 6, 13, tmpq+ 4*%%str + SUMSUB_BA w, 7, 4, 1 ; m13=t2[w], m9=t6[w] + SCRATCH 7, 8, tmpq+10*%%str + SUMSUB_BA w, 5, 0, 1 ; m12=t3[w], m0=t7[w] + SCRATCH 5, 9, tmpq+14*%%str + + VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 5, 15137, 6270 ; m2/6=t5[d], m3/10=t4[d] + VP9_UNPACK_MULSUB_2D_4X 0, 4, 1, 6, 6270, 15137 ; m0/14=t6[d], m9/15=t7[d] + SCRATCH 6, 10, tmpq+ 0*%%str + VP9_RND_SH_SUMSUB_BA 0, 3, 1, 5, 6, [pd_8192] + UNSCRATCH 6, 10, tmpq+ 0*%%str + PSIGNW m0, [pw_m1] ; m0=out3[w], m3=t6[w] + VP9_RND_SH_SUMSUB_BA 4, 2, 6, 7, 5, [pd_8192] ; m9=out12[w], m2=t7[w] + + UNSCRATCH 1, 8, tmpq+10*%%str + UNSCRATCH 5, 9, tmpq+14*%%str + UNSCRATCH 6, 12, tmpq+ 1*%%str + UNSCRATCH 7, 13, tmpq+ 4*%%str + SCRATCH 4, 9, tmpq+14*%%str + + SUMSUB_BA w, 1, 6, 4 ; m13=out0[w], m1=t2[w] + SUMSUB_BA w, 5, 7, 4 + PSIGNW m5, [pw_m1] ; m12=out15[w], m8=t3[w] + + ; unfortunately, the code below overflows in some cases, e.g. + ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm +%if 0 ; cpuflag(ssse3) + SUMSUB_BA w, 7, 6, 4 + pmulhrsw m7, [pw_m11585x2] ; m8=out7[w] + pmulhrsw m6, [pw_11585x2] ; m1=out8[w] + SWAP 6, 7 + SUMSUB_BA w, 3, 2, 4 + pmulhrsw m3, [pw_11585x2] ; m3=out4[w] + pmulhrsw m2, [pw_11585x2] ; m2=out11[w] +%else + SCRATCH 5, 8, tmpq+10*%%str + VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, m11585, [pd_8192], 5, 4 + VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 4 + UNSCRATCH 5, 8, tmpq+10*%%str +%endif + + ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15 + ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14 + +%if %2 == 1 +%if ARCH_X86_64 + mova m13, [tmpq+ 6*%%str] + TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 6, 10 + mova [tmpq+ 0*16], m1 + mova [tmpq+ 2*16], m11 + mova [tmpq+ 4*16], m14 + mova [tmpq+ 6*16], m0 + mova m1, [tmpq+ 3*%%str] + mova m11, [tmpq+ 7*%%str] + mova m14, [tmpq+11*%%str] + mova m0, [tmpq+13*%%str] + mova [tmpq+ 8*16], m3 + mova [tmpq+10*16], m15 + mova [tmpq+12*16], m13 + mova [tmpq+14*16], m6 + + TRANSPOSE8x8W 7, 1, 11, 2, 9, 14, 0, 5, 10 + mova [tmpq+ 1*16], m7 + mova [tmpq+ 3*16], m1 + mova [tmpq+ 5*16], m11 + mova [tmpq+ 7*16], m2 + mova [tmpq+ 9*16], m9 + mova [tmpq+11*16], m14 + mova [tmpq+13*16], m0 + mova [tmpq+15*16], m5 +%else + mova [tmpq+12*%%str], m2 + mova [tmpq+ 1*%%str], m5 + mova [tmpq+15*%%str], m7 + mova m2, [tmpq+ 9*%%str] + mova m5, [tmpq+ 5*%%str] + mova m7, [tmpq+ 8*%%str] + TRANSPOSE8x8W 1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1 + mova [tmpq+ 0*16], m1 + mova [tmpq+ 2*16], m2 + mova [tmpq+ 4*16], m5 + mova [tmpq+ 6*16], m0 + mova [tmpq+10*16], m7 + mova m3, [tmpq+12*%%str] + mova [tmpq+12*16], m4 + mova m4, [tmpq+14*%%str] + mova [tmpq+14*16], m6 + + mova m0, [tmpq+15*%%str] + mova m1, [tmpq+ 3*%%str] + mova m2, [tmpq+ 7*%%str] + mova m5, [tmpq+11*%%str] + mova m7, [tmpq+ 1*%%str] + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1 + mova [tmpq+ 1*16], m0 + mova [tmpq+ 3*16], m1 + mova [tmpq+ 5*16], m2 + mova [tmpq+ 7*16], m3 + mova [tmpq+11*16], m5 + mova [tmpq+13*16], m6 + mova [tmpq+15*16], m7 +%endif +%else + pxor m4, m4 + +%if cpuflag(ssse3) +%define ROUND_REG [pw_512] +%else +%define ROUND_REG [pw_32] +%endif + +%if ARCH_X86_64 + mova m12, [tmpq+ 6*%%str] + VP9_IDCT8_WRITEx2 1, 11, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 14, 0, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 3, 15, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 12, 6, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m1, [tmpq+ 3*%%str] + mova m11, [tmpq+ 7*%%str] + mova m14, [tmpq+11*%%str] + mova m0, [tmpq+13*%%str] + + VP9_IDCT8_WRITEx2 7, 1, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 11, 2, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 9, 14, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 0, 5, 10, 8, 4, ROUND_REG, 6 +%else + mova [tmpq+ 0*%%str], m2 + mova [tmpq+ 1*%%str], m5 + mova [tmpq+ 2*%%str], m7 + mova m2, [tmpq+ 9*%%str] + VP9_IDCT8_WRITEx2 1, 2, 5, 7, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m5, [tmpq+ 5*%%str] + VP9_IDCT8_WRITEx2 5, 0, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m5, [tmpq+ 8*%%str] + VP9_IDCT8_WRITEx2 3, 5, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m5, [tmpq+ 6*%%str] + VP9_IDCT8_WRITEx2 5, 6, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m0, [tmpq+ 2*%%str] + mova m3, [tmpq+ 3*%%str] + VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m0, [tmpq+ 7*%%str] + mova m3, [tmpq+ 0*%%str] + VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m0, [tmpq+14*%%str] + mova m3, [tmpq+11*%%str] + VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m0, [tmpq+13*%%str] + mova m3, [tmpq+ 1*%%str] + VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 +%endif + + SWAP 0, 4 ; zero +%undef ROUND_REG +%endif +%endmacro + +%macro IADST16_FN 5 +INIT_XMM %5 +cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp + mov cntd, 2 + mov tmpq, rsp +.loop1_full: + VP9_%2_1D blockq, 1 + add blockq, 16 + add tmpq, 256 + dec cntd + jg .loop1_full + sub blockq, 32 + + mov cntd, 2 + mov tmpq, rsp + mov dst_bakq, dstq +.loop2_full: + VP9_%4_1D tmpq, 2 + lea dstq, [dst_bakq+8] + add tmpq, 16 + dec cntd + jg .loop2_full + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endmacro + +IADST16_FN idct, IDCT16, iadst, IADST16, sse2 +IADST16_FN iadst, IADST16, idct, IDCT16, sse2 +IADST16_FN iadst, IADST16, iadst, IADST16, sse2 +IADST16_FN idct, IDCT16, iadst, IADST16, ssse3 +IADST16_FN iadst, IADST16, idct, IDCT16, ssse3 +IADST16_FN iadst, IADST16, iadst, IADST16, ssse3 +IADST16_FN idct, IDCT16, iadst, IADST16, avx +IADST16_FN iadst, IADST16, idct, IDCT16, avx +IADST16_FN iadst, IADST16, iadst, IADST16, avx + +; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128] +; out: m[0-15] except m6, which is in [blockq+192] +; uses blockq as scratch space +%macro VP9_IADST16_YMM_1D 0 + mova [blockq+ 32], m3 + mova [blockq+ 64], m7 + mova [blockq+ 96], m8 + + ; first half of round 1 + VP9_UNPACK_MULSUB_2D_4X 9, 6, 0, 3, 13160, 9760 ; m9/x=t7[d], m6/x=t6[d] + VP9_UNPACK_MULSUB_2D_4X 1, 14, 4, 7, 2404, 16207 ; m1/x=t15[d], m14/x=t14[d] + VP9_RND_SH_SUMSUB_BA 14, 6, 7, 3, 8, [pd_8192] ; m14=t6[w], m6=t14[w] + VP9_RND_SH_SUMSUB_BA 1, 9, 4, 0, 8, [pd_8192] ; m1=t7[w], m9=t15[w] + + VP9_UNPACK_MULSUB_2D_4X 13, 2, 4, 7, 15893, 3981 ; m13/x=t3[d], m2/x=t2[d] + VP9_UNPACK_MULSUB_2D_4X 5, 10, 0, 3, 8423, 14053 ; m5/x=t11[d], m10/x=t10[d] + VP9_RND_SH_SUMSUB_BA 10, 2, 3, 7, 8, [pd_8192] ; m10=t2[w], m2=t10[w] + VP9_RND_SH_SUMSUB_BA 5, 13, 0, 4, 8, [pd_8192] ; m5=t3[w], m13=t11[w] + + ; half of round 2 t8-15 + VP9_UNPACK_MULSUB_2D_4X 2, 13, 4, 7, 9102, 13623 ; m2/x=t11[d], m13/x=t10[d] + VP9_UNPACK_MULSUB_2D_4X 9, 6, 3, 0, 13623, 9102 ; m9/x=t14[d], m6/x=t15[d] + VP9_RND_SH_SUMSUB_BA 9, 13, 3, 7, 8, [pd_8192] ; m9=t10[w], m13=t14[w] + VP9_RND_SH_SUMSUB_BA 6, 2, 0, 4, 8, [pd_8192] ; m6=t11[w], m2=t15[w] + + SUMSUB_BA w, 14, 10, 8 ; m14=t2, m10=t6 + SUMSUB_BA w, 1, 5, 8 ; m1=t3, m5=t7 + + mova m0, [blockq+ 0] + mova m4, [blockq+128] + mova m3, [blockq+ 32] + mova m7, [blockq+ 64] + mova m8, [blockq+ 96] + mova [blockq+ 0], m1 + mova [blockq+128], m14 + mova [blockq+ 32], m6 + mova [blockq+ 64], m9 + mova [blockq+ 96], m10 + + ; second half of round 1 + VP9_UNPACK_MULSUB_2D_4X 15, 0, 1, 9, 16364, 804 ; m15/x=t1[d], m0/x=t0[d] + VP9_UNPACK_MULSUB_2D_4X 7, 8, 10, 6, 11003, 12140 ; m7/x=t9[d], m8/x=t8[d] + VP9_RND_SH_SUMSUB_BA 8, 0, 6, 9, 14, [pd_8192] ; m8=t0[w], m0=t8[w] + VP9_RND_SH_SUMSUB_BA 7, 15, 10, 1, 14, [pd_8192] ; m7=t1[w], m15=t9[w] + + VP9_UNPACK_MULSUB_2D_4X 11, 4, 10, 6, 14811, 7005 ; m11/x=t5[d], m4/x=t4[d] + VP9_UNPACK_MULSUB_2D_4X 3, 12, 1, 9, 5520, 15426 ; m3/x=t13[d], m12/x=t12[d] + VP9_RND_SH_SUMSUB_BA 12, 4, 9, 6, 14, [pd_8192] ; m12=t4[w], m4=t12[w] + VP9_RND_SH_SUMSUB_BA 3, 11, 1, 10, 14, [pd_8192] ; m3=t5[w], m11=t13[w] + + ; second half of round 2 t8-15 + VP9_UNPACK_MULSUB_2D_4X 0, 15, 6, 10, 16069, 3196 ; m15/x=t8[d], m0/x=t9[d] + VP9_UNPACK_MULSUB_2D_4X 11, 4, 9, 1, 3196, 16069 ; m11/x=t12[d], m4/x=t13[d] + VP9_RND_SH_SUMSUB_BA 11, 15, 9, 10, 14, [pd_8192] ; m11=t8[w], m15=t12[w] + VP9_RND_SH_SUMSUB_BA 4, 0, 1, 6, 14, [pd_8192] ; m4=t9[w], m0=t13[w] + + SUMSUB_BA w, 12, 8, 14 ; m12=t0, m8=t4 + SUMSUB_BA w, 3, 7, 14 ; m3=t1, m7=t5 + + mova m10, [blockq+ 96] + mova [blockq+ 96], m12 + + ; round 3 + VP9_UNPACK_MULSUB_2D_4X 15, 0, 9, 12, 15137, 6270 ; m15/x=t13[d], m0/x=t12[d] + VP9_UNPACK_MULSUB_2D_4X 2, 13, 1, 6, 6270, 15137 ; m2/x=t14[d], m13/x=t15[d] + VP9_RND_SH_SUMSUB_BA 2, 0, 1, 12, 14, [pd_8192] ; m2=out2[w], m0=t14a[w] + VP9_RND_SH_SUMSUB_BA 13, 15, 6, 9, 14, [pd_8192] + PSIGNW m13, [pw_m1] ; m13=out13[w], m15=t15a[w] + + VP9_UNPACK_MULSUB_2D_4X 8, 7, 12, 9, 15137, 6270 ; m8/x=t5[d], m7/x=t4[d] + VP9_UNPACK_MULSUB_2D_4X 5, 10, 1, 6, 6270, 15137 ; m5/x=t6[d], m10/x=t7[d] + VP9_RND_SH_SUMSUB_BA 5, 7, 1, 9, 14, [pd_8192] + PSIGNW m5, [pw_m1] ; m5=out3[w], m7=t6[w] + VP9_RND_SH_SUMSUB_BA 10, 8, 6, 12, 14, [pd_8192] ; m10=out12[w], m8=t7[w] + + mova m1, [blockq+ 0] + mova m14, [blockq+128] + mova m6, [blockq+ 32] + mova m9, [blockq+ 64] + mova m12, [blockq+ 96] + mova [blockq+ 0], m10 + mova [blockq+128], m5 + + SUMSUB_BA w, 14, 12, 5 ; m14=out0, m12=t2a + SUMSUB_BA w, 1, 3, 5 + PSIGNW m1, [pw_m1] ; m1=out15, m3=t3a + + SUMSUB_BA w, 9, 11, 5 + PSIGNW m9, [pw_m1] ; m9=out1, m11=t10 + SUMSUB_BA w, 6, 4, 5 ; m6=out14, m4=t11 + + VP9_UNPACK_MULSUB_2W_4X 4, 11, 11585, 11585, [pd_8192], 5, 10 ; m4=out9, m11=out6 + mova m5, [blockq+128] + mova [blockq+192], m11 + PSIGNW m15, [pw_m1] + VP9_UNPACK_MULSUB_2W_4X 15, 0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10 + + PSIGNW m3, [pw_m1] + VP9_UNPACK_MULSUB_2W_4X 3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7,m12=out8 + VP9_UNPACK_MULSUB_2W_4X 8, 7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11,m7=out4 + + mova m10, [blockq+ 0] + + SWAP 0, 14, 6, 11, 8, 12, 10 + SWAP 1, 9, 15, 4, 7, 3, 5 + SWAP 5, 9, 15 +%endmacro + +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +%macro IADST16_YMM_FN 4 +INIT_YMM avx2 +cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob + mova m1, [blockq+ 32] + mova m2, [blockq+ 64] + mova m3, [blockq+ 96] + mova m5, [blockq+160] + mova m6, [blockq+192] + mova m7, [blockq+224] + mova m8, [blockq+256] + mova m9, [blockq+288] + mova m10, [blockq+320] + mova m11, [blockq+352] + mova m12, [blockq+384] + mova m13, [blockq+416] + mova m14, [blockq+448] + mova m15, [blockq+480] + + VP9_%2_YMM_1D + TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + [blockq+192], [blockq+128], 1 + mova [blockq+ 0], m0 + VP9_%4_YMM_1D + + mova [blockq+224], m7 + + ; store + VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + mova m6, [blockq+192] + mova m7, [blockq+224] + VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + pxor m0, m0 + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endmacro + +IADST16_YMM_FN idct, IDCT16, iadst, IADST16 +IADST16_YMM_FN iadst, IADST16, idct, IDCT16 +IADST16_YMM_FN iadst, IADST16, iadst, IADST16 +%endif + +;--------------------------------------------------------------------------------------------- +; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc +%if %2 == 1 +%assign %%str mmsize +%else +%assign %%str 64 +%endif + + ; first do t0-15, this can be done identical to idct16x16 + VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1 + + ; store everything on stack to make space available for t16-31 + ; we store interleaved with the output of the second half (t16-31) + ; so we don't need to allocate extra stack space + mova [tmpq+ 0*%%str], m0 ; t0 + mova [tmpq+ 4*%%str], m1 ; t1 + mova [tmpq+ 8*%%str], m2 ; t2 + mova [tmpq+12*%%str], m3 ; t3 + mova [tmpq+16*%%str], m4 ; t4 + mova [tmpq+20*%%str], m5 ; t5 +%if ARCH_X86_64 + mova [tmpq+22*%%str], m10 ; t10 + mova [tmpq+18*%%str], m11 ; t11 + mova [tmpq+14*%%str], m12 ; t12 + mova [tmpq+10*%%str], m13 ; t13 + mova [tmpq+ 6*%%str], m14 ; t14 + mova [tmpq+ 2*%%str], m15 ; t15 +%endif + + mova m0, [tmpq+ 30*%%str] + UNSCRATCH 1, 6, tmpq+26*%%str + UNSCRATCH 2, 8, tmpq+24*%%str + UNSCRATCH 3, 9, tmpq+28*%%str + SUMSUB_BA w, 1, 3, 4 ; t6, t9 + SUMSUB_BA w, 0, 2, 4 ; t7, t8 + + mova [tmpq+24*%%str], m1 ; t6 + mova [tmpq+28*%%str], m0 ; t7 + mova [tmpq+30*%%str], m2 ; t8 + mova [tmpq+26*%%str], m3 ; t9 + + ; then, secondly, do t16-31 +%if %3 <= 8 + mova m4, [%1+ 1*64] + mova m7, [%1+ 7*64] + + pmulhrsw m1, m4, [pw_16364x2] ;t31 + pmulhrsw m4, [pw_804x2] ;t16 + + VP9_UNPACK_MULSUB_2W_4X 5, 0, 1, 4, 16069, 3196, [pd_8192], 6, 2 ; t17, t30 + + pmulhrsw m3, m7, [pw_m5520x2] ;t19 + pmulhrsw m7, [pw_15426x2] ;t28 + + SCRATCH 4, 13, tmpq+ 1*%%str + SCRATCH 5, 12, tmpq+15*%%str + + VP9_UNPACK_MULSUB_2W_4X 2, 6, 7, 3, 3196, m16069, [pd_8192], 4, 5 ; t18, t29 +%else + mova m0, [%1+ 1*64] + mova m1, [%1+15*64] +%if %3 <= 16 + pmulhrsw m5, m0, [pw_16364x2] + pmulhrsw m0, [pw_804x2] + pmulhrsw m4, m1, [pw_m11003x2] + pmulhrsw m1, [pw_12140x2] +%else + mova m4, [%1+17*64] + mova m5, [%1+31*64] + + VP9_UNPACK_MULSUB_2W_4X 0, 5, 16364, 804, [pd_8192], 2, 3 ; t16, t31 + VP9_UNPACK_MULSUB_2W_4X 4, 1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30 +%endif + SUMSUB_BA w, 4, 0, 2 + SUMSUB_BA w, 1, 5, 2 + + VP9_UNPACK_MULSUB_2W_4X 5, 0, 16069, 3196, [pd_8192], 2, 3 ; t17, t30 + + SCRATCH 4, 13, tmpq+ 1*%%str + SCRATCH 5, 12, tmpq+15*%%str + + mova m2, [%1+ 7*64] + mova m3, [%1+ 9*64] +%if %3 <= 16 + pmulhrsw m7, m3, [pw_14811x2] + pmulhrsw m3, [pw_7005x2] + pmulhrsw m6, m2, [pw_m5520x2] + pmulhrsw m2, [pw_15426x2] +%else + mova m7, [%1+23*64] + mova m6, [%1+25*64] + + VP9_UNPACK_MULSUB_2W_4X 3, 7, 14811, 7005, [pd_8192], 4, 5 ; t18, t29 + VP9_UNPACK_MULSUB_2W_4X 6, 2, 5520, 15426, [pd_8192], 4, 5 ; t19, t28 +%endif + SUMSUB_BA w, 3, 6, 4 + SUMSUB_BA w, 7, 2, 4 + + VP9_UNPACK_MULSUB_2W_4X 2, 6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29 +%endif + + UNSCRATCH 5, 12, tmpq+15*%%str + SUMSUB_BA w, 6, 0, 4 + mova [tmpq+25*%%str], m6 ; t19 + UNSCRATCH 4, 13, tmpq+ 1*%%str + SUMSUB_BA w, 7, 1, 6 + SUMSUB_BA w, 3, 4, 6 + mova [tmpq+23*%%str], m3 ; t16 + SUMSUB_BA w, 2, 5, 6 + + VP9_UNPACK_MULSUB_2W_4X 0, 5, 15137, 6270, [pd_8192], 6, 3 ; t18, t29 + VP9_UNPACK_MULSUB_2W_4X 1, 4, 15137, 6270, [pd_8192], 6, 3 ; t19, t28 + + SCRATCH 0, 10, tmpq+ 1*%%str + SCRATCH 1, 11, tmpq+ 7*%%str + SCRATCH 2, 9, tmpq+ 9*%%str + SCRATCH 4, 14, tmpq+15*%%str + SCRATCH 5, 15, tmpq+17*%%str + SCRATCH 7, 13, tmpq+31*%%str + +%if %3 <= 8 + mova m0, [%1+ 5*64] + mova m3, [%1+ 3*64] + + pmulhrsw m5, m0, [pw_15893x2] ;t27 + pmulhrsw m0, [pw_3981x2] ;t20 + + VP9_UNPACK_MULSUB_2W_4X 1, 4, 5, 0, 9102, 13623, [pd_8192], 7, 2 ; t21, t26 + + pmulhrsw m6, m3, [pw_m2404x2] ;t23 + pmulhrsw m3, [pw_16207x2] ;t24 + + SCRATCH 5, 8, tmpq+ 5*%%str + SCRATCH 4, 12, tmpq+11*%%str + + VP9_UNPACK_MULSUB_2W_4X 7, 2, 3, 6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25 +%else + mova m4, [%1+ 5*64] + mova m5, [%1+11*64] +%if %3 <= 16 + pmulhrsw m1, m4, [pw_15893x2] + pmulhrsw m4, [pw_3981x2] + pmulhrsw m0, m5, [pw_m8423x2] + pmulhrsw m5, [pw_14053x2] +%else + mova m0, [%1+21*64] + mova m1, [%1+27*64] + + VP9_UNPACK_MULSUB_2W_4X 4, 1, 15893, 3981, [pd_8192], 2, 3 ; t20, t27 + VP9_UNPACK_MULSUB_2W_4X 0, 5, 8423, 14053, [pd_8192], 2, 3 ; t21, t26 +%endif + SUMSUB_BA w, 0, 4, 2 + SUMSUB_BA w, 5, 1, 2 + + VP9_UNPACK_MULSUB_2W_4X 1, 4, 9102, 13623, [pd_8192], 2, 3 ; t21, t26 + + SCRATCH 5, 8, tmpq+ 5*%%str + SCRATCH 4, 12, tmpq+11*%%str + + mova m7, [%1+ 3*64] + mova m6, [%1+13*64] +%if %3 <= 16 + pmulhrsw m3, m6, [pw_13160x2] + pmulhrsw m6, [pw_9760x2] + pmulhrsw m2, m7, [pw_m2404x2] + pmulhrsw m7, [pw_16207x2] +%else + mova m2, [%1+29*64] + mova m3, [%1+19*64] + VP9_UNPACK_MULSUB_2W_4X 6, 3, 13160, 9760, [pd_8192], 4, 5 ; t22, t25 + VP9_UNPACK_MULSUB_2W_4X 2, 7, 2404, 16207, [pd_8192], 4, 5 ; t23, t24 +%endif + SUMSUB_BA w, 6, 2, 4 + SUMSUB_BA w, 3, 7, 4 + + VP9_UNPACK_MULSUB_2W_4X 7, 2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25 +%endif + + ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23, + ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31 + + UNSCRATCH 4, 12, tmpq+11*%%str + SUMSUB_BA w, 0, 6, 5 + SUMSUB_BA w, 4, 2, 5 + UNSCRATCH 5, 8, tmpq+ 5*%%str + SCRATCH 4, 8, tmpq+11*%%str + SUMSUB_BA w, 1, 7, 4 + SUMSUB_BA w, 5, 3, 4 + SCRATCH 5, 12, tmpq+ 5*%%str + + VP9_UNPACK_MULSUB_2W_4X 3, 6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27 + VP9_UNPACK_MULSUB_2W_4X 2, 7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26 + + ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23, + ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31 + + UNSCRATCH 5, 9, tmpq+ 9*%%str + mova m4, [tmpq+23*%%str] ; t16 +%if ARCH_X86_64 + SUMSUB_BA w, 1, 5, 9 + SUMSUB_BA w, 0, 4, 9 +%else + SUMSUB_BADC w, 1, 5, 0, 4 +%endif + mova [tmpq+29*%%str], m1 ; t17 + mova [tmpq+21*%%str], m0 ; t16 + UNSCRATCH 0, 10, tmpq+ 1*%%str + UNSCRATCH 1, 11, tmpq+ 7*%%str +%if ARCH_X86_64 + SUMSUB_BA w, 2, 0, 9 + SUMSUB_BA w, 3, 1, 9 +%else + SUMSUB_BADC w, 2, 0, 3, 1 +%endif + mova [tmpq+ 9*%%str], m2 ; t18 + mova [tmpq+13*%%str], m3 ; t19 + SCRATCH 0, 10, tmpq+23*%%str + SCRATCH 1, 11, tmpq+27*%%str + + UNSCRATCH 2, 14, tmpq+15*%%str + UNSCRATCH 3, 15, tmpq+17*%%str + SUMSUB_BA w, 6, 2, 0 + SUMSUB_BA w, 7, 3, 0 + SCRATCH 6, 14, tmpq+ 3*%%str + SCRATCH 7, 15, tmpq+ 7*%%str + + UNSCRATCH 0, 8, tmpq+11*%%str + mova m1, [tmpq+25*%%str] ; t19 + UNSCRATCH 6, 12, tmpq+ 5*%%str + UNSCRATCH 7, 13, tmpq+31*%%str +%if ARCH_X86_64 + SUMSUB_BA w, 0, 1, 9 + SUMSUB_BA w, 6, 7, 9 +%else + SUMSUB_BADC w, 0, 1, 6, 7 +%endif + + ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23, + ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31 + +%if 0; cpuflag(ssse3) +%if ARCH_X86_64 + SUMSUB_BA w, 4, 7, 8 + SUMSUB_BA w, 5, 1, 8 +%else + SUMSUB_BADC w, 4, 7, 5, 1 +%endif + + pmulhrsw m7, [pw_11585x2] + pmulhrsw m4, [pw_11585x2] + pmulhrsw m1, [pw_11585x2] + pmulhrsw m5, [pw_11585x2] + + mova [tmpq+ 5*%%str], m7 ; t23 + SCRATCH 1, 13, tmpq+25*%%str + UNSCRATCH 7, 10, tmpq+23*%%str + UNSCRATCH 1, 11, tmpq+27*%%str + +%if ARCH_X86_64 + SUMSUB_BA w, 7, 3, 10 + SUMSUB_BA w, 1, 2, 10 +%else + SUMSUB_BADC w, 7, 3, 1, 2 +%endif + + pmulhrsw m3, [pw_11585x2] + pmulhrsw m7, [pw_11585x2] + pmulhrsw m2, [pw_11585x2] + pmulhrsw m1, [pw_11585x2] +%else + SCRATCH 0, 8, tmpq+15*%%str + SCRATCH 6, 9, tmpq+17*%%str + VP9_UNPACK_MULSUB_2W_4X 7, 4, 11585, 11585, [pd_8192], 0, 6 + mova [tmpq+ 5*%%str], m7 ; t23 + UNSCRATCH 7, 10, tmpq+23*%%str + VP9_UNPACK_MULSUB_2W_4X 1, 5, 11585, 11585, [pd_8192], 0, 6 + SCRATCH 1, 13, tmpq+25*%%str + UNSCRATCH 1, 11, tmpq+27*%%str + VP9_UNPACK_MULSUB_2W_4X 3, 7, 11585, 11585, [pd_8192], 0, 6 + VP9_UNPACK_MULSUB_2W_4X 2, 1, 11585, 11585, [pd_8192], 0, 6 + UNSCRATCH 0, 8, tmpq+15*%%str + UNSCRATCH 6, 9, tmpq+17*%%str +%endif + + ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23, + ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31 + + ; then do final pass to sumsub+store the two halves +%if %2 == 1 + mova [tmpq+17*%%str], m2 ; t20 + mova [tmpq+ 1*%%str], m3 ; t21 +%if ARCH_X86_64 + mova [tmpq+25*%%str], m13 ; t22 + + mova m8, [tmpq+ 0*%%str] ; t0 + mova m9, [tmpq+ 4*%%str] ; t1 + mova m12, [tmpq+ 8*%%str] ; t2 + mova m11, [tmpq+12*%%str] ; t3 + mova m2, [tmpq+16*%%str] ; t4 + mova m3, [tmpq+20*%%str] ; t5 + mova m13, [tmpq+24*%%str] ; t6 + + SUMSUB_BA w, 6, 8, 10 + mova [tmpq+ 3*%%str], m8 ; t15 + SUMSUB_BA w, 0, 9, 8 + SUMSUB_BA w, 15, 12, 8 + SUMSUB_BA w, 14, 11, 8 + SUMSUB_BA w, 1, 2, 8 + SUMSUB_BA w, 7, 3, 8 + SUMSUB_BA w, 5, 13, 8 + mova m10, [tmpq+28*%%str] ; t7 + SUMSUB_BA w, 4, 10, 8 +%if cpuflag(avx2) + ; the "shitty" about this idct is that the final pass does the outermost + ; interleave sumsubs (t0/31, t1/30, etc) but the tN for the 16x16 need + ; to be sequential, which means I need to load/store half of the sumsub + ; intermediates back to/from memory to get a 16x16 transpose going... + ; This would be easier if we had more (e.g. 32) YMM regs here. + mova [tmpq+ 7*%%str], m9 + mova [tmpq+11*%%str], m12 + mova [tmpq+15*%%str], m11 + mova [tmpq+19*%%str], m2 + mova [tmpq+23*%%str], m3 + mova [tmpq+27*%%str], m13 + mova [tmpq+31*%%str], m10 + mova [tmpq+12*%%str], m5 + + mova m13, [tmpq+30*%%str] ; t8 + mova m12, [tmpq+26*%%str] ; t9 + mova m11, [tmpq+22*%%str] ; t10 + mova m10, [tmpq+18*%%str] ; t11 + mova m9, [tmpq+17*%%str] ; t20 + mova m8, [tmpq+ 1*%%str] ; t21 + mova m3, [tmpq+25*%%str] ; t22 + mova m2, [tmpq+ 5*%%str] ; t23 + + SUMSUB_BA w, 9, 10, 5 + SUMSUB_BA w, 8, 11, 5 + SUMSUB_BA w, 3, 12, 5 + SUMSUB_BA w, 2, 13, 5 + mova [tmpq+ 1*%%str], m10 + mova [tmpq+ 5*%%str], m11 + mova [tmpq+17*%%str], m12 + mova [tmpq+25*%%str], m13 + + mova m13, [tmpq+14*%%str] ; t12 + mova m12, [tmpq+10*%%str] ; t13 + mova m11, [tmpq+ 9*%%str] ; t18 + mova m10, [tmpq+13*%%str] ; t19 + + SUMSUB_BA w, 11, 12, 5 + SUMSUB_BA w, 10, 13, 5 + mova [tmpq+ 9*%%str], m13 + mova [tmpq+13*%%str], m12 + mova [tmpq+10*%%str], m10 + mova [tmpq+14*%%str], m11 + + mova m13, [tmpq+ 6*%%str] ; t14 + mova m12, [tmpq+ 2*%%str] ; t15 + mova m11, [tmpq+21*%%str] ; t16 + mova m10, [tmpq+29*%%str] ; t17 + SUMSUB_BA w, 11, 12, 5 + SUMSUB_BA w, 10, 13, 5 + mova [tmpq+21*%%str], m12 + mova [tmpq+29*%%str], m13 + mova m12, [tmpq+10*%%str] + mova m13, [tmpq+14*%%str] + + TRANSPOSE16x16W 6, 0, 15, 14, 1, 7, 5, 4, \ + 2, 3, 8, 9, 12, 13, 10, 11, \ + [tmpq+12*%%str], [tmpq+ 8*%%str], 1 + mova [tmpq+ 0*%%str], m6 + mova [tmpq+ 2*%%str], m0 + mova [tmpq+ 4*%%str], m15 + mova [tmpq+ 6*%%str], m14 + mova [tmpq+10*%%str], m7 + mova [tmpq+12*%%str], m5 + mova [tmpq+14*%%str], m4 + mova [tmpq+16*%%str], m2 + mova [tmpq+18*%%str], m3 + mova [tmpq+20*%%str], m8 + mova [tmpq+22*%%str], m9 + mova [tmpq+24*%%str], m12 + mova [tmpq+26*%%str], m13 + mova [tmpq+28*%%str], m10 + mova [tmpq+30*%%str], m11 + + mova m0, [tmpq+21*%%str] + mova m1, [tmpq+29*%%str] + mova m2, [tmpq+13*%%str] + mova m3, [tmpq+ 9*%%str] + mova m4, [tmpq+ 1*%%str] + mova m5, [tmpq+ 5*%%str] + mova m7, [tmpq+25*%%str] + mova m8, [tmpq+31*%%str] + mova m9, [tmpq+27*%%str] + mova m10, [tmpq+23*%%str] + mova m11, [tmpq+19*%%str] + mova m12, [tmpq+15*%%str] + mova m13, [tmpq+11*%%str] + mova m14, [tmpq+ 7*%%str] + mova m15, [tmpq+ 3*%%str] + TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15, \ + [tmpq+17*%%str], [tmpq+ 9*%%str], 1 + mova [tmpq+ 1*%%str], m0 + mova [tmpq+ 3*%%str], m1 + mova [tmpq+ 5*%%str], m2 + mova [tmpq+ 7*%%str], m3 + mova [tmpq+11*%%str], m5 + mova [tmpq+13*%%str], m6 + mova [tmpq+15*%%str], m7 + mova [tmpq+17*%%str], m8 + mova [tmpq+19*%%str], m9 + mova [tmpq+21*%%str], m10 + mova [tmpq+23*%%str], m11 + mova [tmpq+25*%%str], m12 + mova [tmpq+27*%%str], m13 + mova [tmpq+29*%%str], m14 + mova [tmpq+31*%%str], m15 +%else ; !avx2 + TRANSPOSE8x8W 6, 0, 15, 14, 1, 7, 5, 4, 8 + mova [tmpq+ 0*%%str], m6 + mova [tmpq+ 4*%%str], m0 + mova [tmpq+ 8*%%str], m15 + mova [tmpq+12*%%str], m14 + mova [tmpq+16*%%str], m1 + mova [tmpq+20*%%str], m7 + mova [tmpq+24*%%str], m5 + mova [tmpq+28*%%str], m4 + + mova m8, [tmpq+ 3*%%str] ; t15 + TRANSPOSE8x8W 10, 13, 3, 2, 11, 12, 9, 8, 0 + mova [tmpq+ 3*%%str], m10 + mova [tmpq+ 7*%%str], m13 + mova [tmpq+11*%%str], m3 + mova [tmpq+15*%%str], m2 + mova [tmpq+19*%%str], m11 + mova [tmpq+23*%%str], m12 + mova [tmpq+27*%%str], m9 + mova [tmpq+31*%%str], m8 + + mova m15, [tmpq+30*%%str] ; t8 + mova m14, [tmpq+26*%%str] ; t9 + mova m13, [tmpq+22*%%str] ; t10 + mova m12, [tmpq+18*%%str] ; t11 + mova m11, [tmpq+14*%%str] ; t12 + mova m10, [tmpq+10*%%str] ; t13 + mova m9, [tmpq+ 6*%%str] ; t14 + mova m8, [tmpq+ 2*%%str] ; t15 + mova m7, [tmpq+21*%%str] ; t16 + mova m6, [tmpq+29*%%str] ; t17 + mova m5, [tmpq+ 9*%%str] ; t18 + mova m4, [tmpq+13*%%str] ; t19 + mova m3, [tmpq+17*%%str] ; t20 + mova m2, [tmpq+ 1*%%str] ; t21 + mova m1, [tmpq+25*%%str] ; t22 + + SUMSUB_BA w, 7, 8, 0 + mova [tmpq+ 2*%%str], m8 + mova m0, [tmpq+ 5*%%str] ; t23 + SUMSUB_BA w, 6, 9, 8 + SUMSUB_BA w, 5, 10, 8 + SUMSUB_BA w, 4, 11, 8 + SUMSUB_BA w, 3, 12, 8 + SUMSUB_BA w, 2, 13, 8 + SUMSUB_BA w, 1, 14, 8 + SUMSUB_BA w, 0, 15, 8 + + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + mova [tmpq+ 1*%%str], m0 + mova [tmpq+ 5*%%str], m1 + mova [tmpq+ 9*%%str], m2 + mova [tmpq+13*%%str], m3 + mova [tmpq+17*%%str], m4 + mova [tmpq+21*%%str], m5 + mova [tmpq+25*%%str], m6 + mova [tmpq+29*%%str], m7 + + mova m8, [tmpq+ 2*%%str] + TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 + mova [tmpq+ 2*%%str], m8 + mova [tmpq+ 6*%%str], m9 + mova [tmpq+10*%%str], m10 + mova [tmpq+14*%%str], m11 + mova [tmpq+18*%%str], m12 + mova [tmpq+22*%%str], m13 + mova [tmpq+26*%%str], m14 + mova [tmpq+30*%%str], m15 +%endif ; avx2 +%else + mova m2, [tmpq+24*%%str] ; t6 + mova m3, [tmpq+28*%%str] ; t7 + SUMSUB_BADC w, 5, 2, 4, 3 + mova [tmpq+24*%%str], m5 + mova [tmpq+23*%%str], m2 + mova [tmpq+28*%%str], m4 + mova [tmpq+19*%%str], m3 + + mova m2, [tmpq+16*%%str] ; t4 + mova m3, [tmpq+20*%%str] ; t5 + SUMSUB_BA w, 1, 2, 5 + SUMSUB_BA w, 7, 3, 5 + mova [tmpq+15*%%str], m2 + mova [tmpq+11*%%str], m3 + + mova m2, [tmpq+ 0*%%str] ; t0 + mova m3, [tmpq+ 4*%%str] ; t1 + SUMSUB_BA w, 6, 2, 5 + SUMSUB_BA w, 0, 3, 5 + mova [tmpq+31*%%str], m2 + mova [tmpq+27*%%str], m3 + + mova m2, [tmpq+ 8*%%str] ; t2 + mova m3, [tmpq+12*%%str] ; t3 + mova m5, [tmpq+ 7*%%str] + mova m4, [tmpq+ 3*%%str] + SUMSUB_BADC w, 5, 2, 4, 3 + mova [tmpq+ 7*%%str], m2 + mova [tmpq+ 3*%%str], m3 + + mova m3, [tmpq+28*%%str] + TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1 + mova [tmpq+ 0*%%str], m6 + mova [tmpq+ 4*%%str], m0 + mova [tmpq+ 8*%%str], m5 + mova [tmpq+12*%%str], m4 + mova [tmpq+20*%%str], m7 + mova [tmpq+24*%%str], m2 + mova [tmpq+28*%%str], m3 + + mova m6, [tmpq+19*%%str] + mova m0, [tmpq+23*%%str] + mova m5, [tmpq+11*%%str] + mova m4, [tmpq+15*%%str] + mova m1, [tmpq+ 3*%%str] + mova m7, [tmpq+ 7*%%str] + mova m3, [tmpq+31*%%str] + TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1 + mova [tmpq+ 3*%%str], m6 + mova [tmpq+ 7*%%str], m0 + mova [tmpq+11*%%str], m5 + mova [tmpq+15*%%str], m4 + mova [tmpq+23*%%str], m7 + mova [tmpq+27*%%str], m2 + mova [tmpq+31*%%str], m3 + + mova m1, [tmpq+ 6*%%str] ; t14 + mova m0, [tmpq+ 2*%%str] ; t15 + mova m7, [tmpq+21*%%str] ; t16 + mova m6, [tmpq+29*%%str] ; t17 + SUMSUB_BA w, 7, 0, 2 + SUMSUB_BA w, 6, 1, 2 + mova [tmpq+29*%%str], m7 + mova [tmpq+ 2*%%str], m0 + mova [tmpq+21*%%str], m6 + mova [tmpq+ 6*%%str], m1 + + mova m1, [tmpq+14*%%str] ; t12 + mova m0, [tmpq+10*%%str] ; t13 + mova m5, [tmpq+ 9*%%str] ; t18 + mova m4, [tmpq+13*%%str] ; t19 + SUMSUB_BA w, 5, 0, 2 + SUMSUB_BA w, 4, 1, 2 + mova [tmpq+10*%%str], m0 + mova [tmpq+14*%%str], m1 + + mova m1, [tmpq+22*%%str] ; t10 + mova m0, [tmpq+18*%%str] ; t11 + mova m3, [tmpq+17*%%str] ; t20 + mova m2, [tmpq+ 1*%%str] ; t21 + SUMSUB_BA w, 3, 0, 6 + SUMSUB_BA w, 2, 1, 6 + mova [tmpq+18*%%str], m0 + mova [tmpq+22*%%str], m1 + + mova m7, [tmpq+30*%%str] ; t8 + mova m6, [tmpq+26*%%str] ; t9 + mova m1, [tmpq+25*%%str] ; t22 + mova m0, [tmpq+ 5*%%str] ; t23 + SUMSUB_BADC w, 1, 6, 0, 7 + mova [tmpq+26*%%str], m6 + mova [tmpq+30*%%str], m7 + + mova m7, [tmpq+29*%%str] + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1 + mova [tmpq+ 1*%%str], m0 + mova [tmpq+ 5*%%str], m1 + mova [tmpq+ 9*%%str], m2 + mova [tmpq+13*%%str], m3 + mova [tmpq+21*%%str], m5 + mova [tmpq+25*%%str], m6 + mova [tmpq+29*%%str], m7 + + mova m0, [tmpq+ 2*%%str] + mova m1, [tmpq+ 6*%%str] + mova m2, [tmpq+10*%%str] + mova m3, [tmpq+14*%%str] + mova m4, [tmpq+18*%%str] + mova m5, [tmpq+22*%%str] + mova m7, [tmpq+30*%%str] + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1 + mova [tmpq+ 2*%%str], m0 + mova [tmpq+ 6*%%str], m1 + mova [tmpq+10*%%str], m2 + mova [tmpq+14*%%str], m3 + mova [tmpq+22*%%str], m5 + mova [tmpq+26*%%str], m6 + mova [tmpq+30*%%str], m7 +%endif +%else + ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str] + ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str] + ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str] + ; t20-22 is in m4-6 + ; t24-31 is in m8-15 + +%if cpuflag(ssse3) +%define ROUND_REG [pw_512] +%else +%define ROUND_REG [pw_32] +%endif + +%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs + SUMSUB_BA w, %4, %1, %5 + SUMSUB_BA w, %3, %2, %5 + VP9_IDCT8_WRITEx2 %4, %3, %5, %6, %7, ROUND_REG, 6 +%if %8 == 1 + add dstq, stride2q +%endif + VP9_IDCT8_WRITEx2 %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq +%if %8 == 1 + sub dst_endq, stride2q +%endif +%endmacro + +%if ARCH_X86_64 + pxor m10, m10 + + ; store t0-1 and t30-31 + mova m8, [tmpq+ 0*%%str] + mova m9, [tmpq+ 4*%%str] + %%STORE_2X2 8, 9, 0, 6, 12, 11, 10 + + ; store t2-3 and t28-29 + mova m8, [tmpq+ 8*%%str] + mova m9, [tmpq+12*%%str] + %%STORE_2X2 8, 9, 14, 15, 12, 11, 10 + + ; store t4-5 and t26-27 + mova m8, [tmpq+16*%%str] + mova m9, [tmpq+20*%%str] + %%STORE_2X2 8, 9, 7, 1, 12, 11, 10 + + ; store t6-7 and t24-25 + mova m8, [tmpq+24*%%str] + mova m9, [tmpq+28*%%str] + %%STORE_2X2 8, 9, 4, 5, 12, 11, 10 + + ; store t8-9 and t22-23 + mova m8, [tmpq+30*%%str] + mova m9, [tmpq+26*%%str] + mova m0, [tmpq+ 5*%%str] + %%STORE_2X2 8, 9, 13, 0, 12, 11, 10 + + ; store t10-11 and t20-21 + mova m8, [tmpq+22*%%str] + mova m9, [tmpq+18*%%str] + %%STORE_2X2 8, 9, 2, 3, 12, 11, 10 + + ; store t12-13 and t18-19 + mova m8, [tmpq+14*%%str] + mova m9, [tmpq+10*%%str] + mova m5, [tmpq+13*%%str] + mova m4, [tmpq+ 9*%%str] + %%STORE_2X2 8, 9, 4, 5, 12, 11, 10 + + ; store t14-17 + mova m8, [tmpq+ 6*%%str] + mova m9, [tmpq+ 2*%%str] + mova m5, [tmpq+29*%%str] + mova m4, [tmpq+21*%%str] + %%STORE_2X2 8, 9, 4, 5, 12, 11, 10, 0 + + SWAP 1, 10 ; zero +%else + mova [tmpq+ 1*%%str], m1 + mova [tmpq+11*%%str], m2 + mova [tmpq+15*%%str], m3 + mova [tmpq+17*%%str], m4 + mova [tmpq+19*%%str], m5 + pxor m1, m1 + + ; store t0-1 and t30-31 + mova m2, [tmpq+ 0*%%str] + mova m3, [tmpq+ 4*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 + + ; store t2-3 and t28-29 + mova m2, [tmpq+ 8*%%str] + mova m3, [tmpq+12*%%str] + mova m0, [tmpq+ 3*%%str] + mova m6, [tmpq+ 7*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 + + ; store t4-5 and t26-27 + mova m2, [tmpq+16*%%str] + mova m3, [tmpq+20*%%str] + mova m0, [tmpq+ 1*%%str] + %%STORE_2X2 2, 3, 7, 0, 4, 5, 1 + + ; store t6-7 and t24-25 + mova m2, [tmpq+24*%%str] + mova m3, [tmpq+28*%%str] + mova m0, [tmpq+17*%%str] + mova m6, [tmpq+19*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 + + ; store t8-9 and t22-23 + mova m2, [tmpq+30*%%str] + mova m3, [tmpq+26*%%str] + mova m0, [tmpq+25*%%str] + mova m6, [tmpq+ 5*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 + + ; store t10-11 and t20-21 + mova m2, [tmpq+22*%%str] + mova m3, [tmpq+18*%%str] + mova m0, [tmpq+11*%%str] + mova m6, [tmpq+15*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 + + ; store t12-13 and t18-19 + mova m2, [tmpq+14*%%str] + mova m3, [tmpq+10*%%str] + mova m6, [tmpq+13*%%str] + mova m0, [tmpq+ 9*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 + + ; store t14-17 + mova m2, [tmpq+ 6*%%str] + mova m3, [tmpq+ 2*%%str] + mova m6, [tmpq+29*%%str] + mova m0, [tmpq+21*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1, 0 +%endif +%undef ROUND_REG +%endif +%endmacro + +%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1 +INIT_XMM %1 +cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob + movifnidn eobd, dword eobm +%if cpuflag(ssse3) + cmp eobd, 135 + jg .idctfull + cmp eobd, 34 + jg .idct16x16 + cmp eobd, 1 + jg .idct8x8 +%else + cmp eobd, 1 + jg .idctfull +%endif + + ; dc-only case + movifnidn blockq, blockmp + movifnidn dstq, dstmp + movifnidn strideq, stridemp +%if cpuflag(ssse3) + movd m0, [blockq] + mova m1, [pw_11585x2] + pmulhrsw m0, m1 + pmulhrsw m0, m1 +%else + DEFINE_ARGS dst, stride, block, coef + movsx coefd, word [blockq] + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, (32 << 14) + 8192 + sar coefd, 14 + 6 + movd m0, coefd +%endif + SPLATW m0, m0, q0000 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_512] +%endif + pxor m5, m5 + movd [blockq], m5 +%rep 31 + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize + add dstq, strideq +%endrep + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize + RET + +%if ARCH_X86_64 + DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp +%else +%define dst_bakq r0mp +%endif +%if cpuflag(ssse3) +.idct8x8: +%if ARCH_X86_32 + DEFINE_ARGS block, u1, u2, u3, u4, tmp + mov blockq, r2mp +%endif + mov tmpq, rsp + VP9_IDCT32_1D blockq, 1, 8 + +%if ARCH_X86_32 + DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp + mov strideq, r1mp +%define cntd dword r3m +%endif + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 4 + sub stride30q, stride2q ; stride*30 +.loop2_8x8: + mov dstq, dst_bakq + lea dst_endq, [dstq+stride30q] + VP9_IDCT32_1D tmpq, 2, 8 + add dst_bakq, 8 + add tmpq, 16 + dec cntd + jg .loop2_8x8 + + ; at the end of the loop, m7 should still be zero + ; use that to zero out block coefficients +%if ARCH_X86_32 + DEFINE_ARGS block + mov blockq, r2mp +%endif + ZERO_BLOCK blockq, 64, 8, m1 + RET + +.idct16x16: +%if ARCH_X86_32 + DEFINE_ARGS block, tmp, cnt + mov blockq, r2mp +%endif + mov cntd, 2 + mov tmpq, rsp +.loop1_16x16: + VP9_IDCT32_1D blockq, 1, 16 + add blockq, 16 + add tmpq, 512 + dec cntd + jg .loop1_16x16 + +%if ARCH_X86_64 + sub blockq, 32 +%else + DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp + mov strideq, r1mp +%define cntd dword r3m +%endif + + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 4 + mov tmpq, rsp + sub stride30q, stride2q ; stride*30 +.loop2_16x16: + mov dstq, dst_bakq + lea dst_endq, [dstq+stride30q] + VP9_IDCT32_1D tmpq, 2, 16 + add dst_bakq, 8 + add tmpq, 16 + dec cntd + jg .loop2_16x16 + + ; at the end of the loop, m7 should still be zero + ; use that to zero out block coefficients +%if ARCH_X86_32 + DEFINE_ARGS block + mov blockq, r2mp +%endif + ZERO_BLOCK blockq, 64, 16, m1 + RET +%endif + +.idctfull: +%if ARCH_X86_32 + DEFINE_ARGS block, tmp, cnt + mov blockq, r2mp +%endif + mov cntd, 4 + mov tmpq, rsp +.loop1_full: + VP9_IDCT32_1D blockq, 1 + add blockq, 16 + add tmpq, 512 + dec cntd + jg .loop1_full + +%if ARCH_X86_64 + sub blockq, 64 +%else + DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp + mov strideq, r1mp +%define cntd dword r3m +%endif + + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 4 + mov tmpq, rsp + sub stride30q, stride2q ; stride*30 +.loop2_full: + mov dstq, dst_bakq + lea dst_endq, [dstq+stride30q] + VP9_IDCT32_1D tmpq, 2 + add dst_bakq, 8 + add tmpq, 16 + dec cntd + jg .loop2_full + + ; at the end of the loop, m7 should still be zero + ; use that to zero out block coefficients +%if ARCH_X86_32 + DEFINE_ARGS block + mov blockq, r2mp +%endif + ZERO_BLOCK blockq, 64, 32, m1 + RET +%endmacro + +VP9_IDCT_IDCT_32x32_ADD_XMM sse2 +VP9_IDCT_IDCT_32x32_ADD_XMM ssse3 +VP9_IDCT_IDCT_32x32_ADD_XMM avx + +; this is almost identical to VP9_STORE_2X, but it does two rows +; for slightly improved interleaving, and it omits vpermq since the +; input is DC so all values are identical +%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero + mova m%2, [dstq] + mova m%4, [dstq+strideq] + punpckhbw m%3, m%2, m%6 + punpcklbw m%2, m%6 + punpckhbw m%5, m%4, m%6 + punpcklbw m%4, m%6 + paddw m%3, m%1 + paddw m%2, m%1 + paddw m%5, m%1 + paddw m%4, m%1 + packuswb m%2, m%3 + packuswb m%4, m%5 + mova [dstq+strideq*0], m%2 + mova [dstq+strideq*1], m%4 +%endmacro + +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob + cmp eobd, 135 + jg .idctfull + cmp eobd, 1 + jg .idct16x16 + + ; dc-only case + mova m1, [pw_11585x2] + vpbroadcastw m0, [blockq] + pmulhrsw m0, m1 + pmulhrsw m0, m1 + pxor m5, m5 + pmulhrsw m0, [pw_512] + movd [blockq], xm5 + + DEFINE_ARGS dst, stride, cnt + mov cntd, 16 +.loop_dc: + VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5 + lea dstq, [dstq+2*strideq] + dec cntd + jg .loop_dc + RET + + DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp +.idct16x16: + mov tmpq, rsp + VP9_IDCT32_1D blockq, 1, 16 + + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 2 + sub stride30q, stride2q ; stride*30 +.loop2_16x16: + mov dstq, dst_bakq + lea dst_endq, [dstq+stride30q] + VP9_IDCT32_1D tmpq, 2, 16 + add dst_bakq, 16 + add tmpq, 32 + dec cntd + jg .loop2_16x16 + + ; at the end of the loop, m1 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 64, 16, m1 + RET + +.idctfull: + mov cntd, 2 + mov tmpq, rsp +.loop1_full: + VP9_IDCT32_1D blockq, 1 + add blockq, 32 + add tmpq, 1024 + dec cntd + jg .loop1_full + + sub blockq, 64 + + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 2 + mov tmpq, rsp + sub stride30q, stride2q ; stride*30 +.loop2_full: + mov dstq, dst_bakq + lea dst_endq, [dstq+stride30q] + VP9_IDCT32_1D tmpq, 2 + add dst_bakq, 16 + add tmpq, 32 + dec cntd + jg .loop2_full + + ; at the end of the loop, m1 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 64, 32, m1 + RET +%endif diff --git a/media/ffvpx/libavcodec/x86/vp9itxfm_16bpp.asm b/media/ffvpx/libavcodec/x86/vp9itxfm_16bpp.asm new file mode 100644 index 0000000000..902685edf6 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9itxfm_16bpp.asm @@ -0,0 +1,2044 @@ +;****************************************************************************** +;* VP9 inverse transform x86 SIMD optimizations +;* +;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" +%include "vp9itxfm_template.asm" + +SECTION_RODATA + +cextern pw_8 +cextern pw_1023 +cextern pw_2048 +cextern pw_4095 +cextern pw_m1 +cextern pd_1 +cextern pd_16 +cextern pd_32 +cextern pd_8192 + +pd_8: times 4 dd 8 +pd_3fff: times 4 dd 0x3fff + +cextern pw_11585x2 + +cextern pw_5283_13377 +cextern pw_9929_13377 +cextern pw_15212_m13377 +cextern pw_15212_9929 +cextern pw_m5283_m15212 +cextern pw_13377x2 +cextern pw_m13377_13377 +cextern pw_13377_0 + +pw_9929_m5283: times 4 dw 9929, -5283 + +%macro COEF_PAIR 2-3 +cextern pw_m%1_%2 +cextern pw_%2_%1 +%if %0 == 3 +cextern pw_m%1_m%2 +%if %1 != %2 +cextern pw_m%2_%1 +cextern pw_%1_%2 +%endif +%endif +%endmacro + +COEF_PAIR 2404, 16207 +COEF_PAIR 3196, 16069, 1 +COEF_PAIR 4756, 15679 +COEF_PAIR 5520, 15426 +COEF_PAIR 6270, 15137, 1 +COEF_PAIR 8423, 14053 +COEF_PAIR 10394, 12665 +COEF_PAIR 11003, 12140 +COEF_PAIR 11585, 11585, 1 +COEF_PAIR 13160, 9760 +COEF_PAIR 13623, 9102, 1 +COEF_PAIR 14449, 7723 +COEF_PAIR 14811, 7005 +COEF_PAIR 15893, 3981 +COEF_PAIR 16305, 1606 +COEF_PAIR 16364, 804 + +default_8x8: +times 12 db 1 +times 52 db 2 +row_8x8: +times 18 db 1 +times 46 db 2 +col_8x8: +times 6 db 1 +times 58 db 2 +default_16x16: +times 10 db 1 +times 28 db 2 +times 51 db 3 +times 167 db 4 +row_16x16: +times 21 db 1 +times 45 db 2 +times 60 db 3 +times 130 db 4 +col_16x16: +times 5 db 1 +times 12 db 2 +times 25 db 3 +times 214 db 4 +default_32x32: +times 9 db 1 +times 25 db 2 +times 36 db 3 +times 65 db 4 +times 105 db 5 +times 96 db 6 +times 112 db 7 +times 576 db 8 + +SECTION .text + +%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst + mova m%3, [%7] + mova m%4, [%7+strideq] + paddw m%3, m%1 + paddw m%4, m%2 + pmaxsw m%3, m%5 + pmaxsw m%4, m%5 + pminsw m%3, m%6 + pminsw m%4, m%6 + mova [%7], m%3 + mova [%7+strideq], m%4 +%endmacro + +%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg +%assign %%y 0 +%rep %3 +%assign %%x 0 +%rep %3*4/mmsize + mova [%1+%%y+%%x], %4 +%assign %%x (%%x+mmsize) +%endrep +%assign %%y (%%y+%2) +%endrep +%endmacro + +; the input coefficients are scaled up by 2 bit (which we downscale immediately +; in the iwht), and is otherwise orthonormally increased by 1 bit per iwht_1d. +; therefore, a diff of 10-12+sign bit will fit in 12-14+sign bit after scaling, +; i.e. everything can be done in 15+1bpp words. Since the quant fractional bits +; add 2 bits, we need to scale before converting to word in 12bpp, since the +; input will be 16+sign bit which doesn't fit in 15+sign words, but in 10bpp +; we can scale after converting to words (which is half the instructions), +; since the input is only 14+sign bit, which fits in 15+sign words directly. + +%macro IWHT4_FN 2 ; bpp, max +cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob + mova m7, [pw_%2] + mova m0, [blockq+0*16+0] + mova m1, [blockq+1*16+0] +%if %1 >= 12 + mova m4, [blockq+0*16+8] + mova m5, [blockq+1*16+8] + psrad m0, 2 + psrad m1, 2 + psrad m4, 2 + psrad m5, 2 + packssdw m0, m4 + packssdw m1, m5 +%else + packssdw m0, [blockq+0*16+8] + packssdw m1, [blockq+1*16+8] + psraw m0, 2 + psraw m1, 2 +%endif + mova m2, [blockq+2*16+0] + mova m3, [blockq+3*16+0] +%if %1 >= 12 + mova m4, [blockq+2*16+8] + mova m5, [blockq+3*16+8] + psrad m2, 2 + psrad m3, 2 + psrad m4, 2 + psrad m5, 2 + packssdw m2, m4 + packssdw m3, m5 +%else + packssdw m2, [blockq+2*16+8] + packssdw m3, [blockq+3*16+8] + psraw m2, 2 + psraw m3, 2 +%endif + + VP9_IWHT4_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IWHT4_1D + + pxor m6, m6 + VP9_STORE_2X 0, 1, 4, 5, 6, 7 + lea dstq, [dstq+strideq*2] + VP9_STORE_2X 2, 3, 4, 5, 6, 7 + ZERO_BLOCK blockq, 16, 4, m6 + RET +%endmacro + +INIT_MMX mmxext +IWHT4_FN 10, 1023 +INIT_MMX mmxext +IWHT4_FN 12, 4095 + +%macro VP9_IDCT4_WRITEOUT 0 +%if cpuflag(ssse3) + mova m5, [pw_2048] + pmulhrsw m0, m5 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + pmulhrsw m3, m5 +%else + mova m5, [pw_8] + paddw m0, m5 + paddw m1, m5 + paddw m2, m5 + paddw m3, m5 + psraw m0, 4 + psraw m1, 4 + psraw m2, 4 + psraw m3, 4 +%endif + mova m5, [pw_1023] + VP9_STORE_2X 0, 1, 6, 7, 4, 5 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 2, 3, 6, 7, 4, 5 +%endmacro + +%macro DC_ONLY 2 ; shift, zero + mov coefd, dword [blockq] + movd [blockq], %2 + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, ((1 << (%1 - 1)) << 14) + 8192 + sar coefd, 14 + %1 +%endmacro + +; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits +; in 15+1 words without additional effort, since the coefficients are 15bpp. + +%macro IDCT4_10_FN 0 +cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob + cmp eobd, 1 + jg .idctfull + + ; dc-only + pxor m4, m4 +%if cpuflag(ssse3) + movd m0, [blockq] + movd [blockq], m4 + mova m5, [pw_11585x2] + pmulhrsw m0, m5 + pmulhrsw m0, m5 +%else + DEFINE_ARGS dst, stride, block, coef + DC_ONLY 4, m4 + movd m0, coefd +%endif + pshufw m0, m0, 0 + mova m5, [pw_1023] +%if cpuflag(ssse3) + pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 +%endif + VP9_STORE_2X 0, 0, 6, 7, 4, 5 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 0, 0, 6, 7, 4, 5 + RET + +.idctfull: + mova m0, [blockq+0*16+0] + mova m1, [blockq+1*16+0] + packssdw m0, [blockq+0*16+8] + packssdw m1, [blockq+1*16+8] + mova m2, [blockq+2*16+0] + mova m3, [blockq+3*16+0] + packssdw m2, [blockq+2*16+8] + packssdw m3, [blockq+3*16+8] + +%if cpuflag(ssse3) + mova m6, [pw_11585x2] +%endif + mova m7, [pd_8192] ; rounding + VP9_IDCT4_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IDCT4_1D + + pxor m4, m4 + ZERO_BLOCK blockq, 16, 4, m4 + VP9_IDCT4_WRITEOUT + RET +%endmacro + +INIT_MMX mmxext +IDCT4_10_FN +INIT_MMX ssse3 +IDCT4_10_FN + +%macro IADST4_FN 4 +cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob +%if WIN64 && notcpuflag(ssse3) + WIN64_SPILL_XMM 8 +%endif + movdqa xmm5, [pd_8192] + mova m0, [blockq+0*16+0] + mova m1, [blockq+1*16+0] + packssdw m0, [blockq+0*16+8] + packssdw m1, [blockq+1*16+8] + mova m2, [blockq+2*16+0] + mova m3, [blockq+3*16+0] + packssdw m2, [blockq+2*16+8] + packssdw m3, [blockq+3*16+8] + +%if cpuflag(ssse3) + mova m6, [pw_11585x2] +%endif +%ifnidn %1%3, iadstiadst + movdq2q m7, xmm5 +%endif + VP9_%2_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_%4_1D + + pxor m4, m4 + ZERO_BLOCK blockq, 16, 4, m4 + VP9_IDCT4_WRITEOUT + RET +%endmacro + +INIT_MMX sse2 +IADST4_FN idct, IDCT4, iadst, IADST4 +IADST4_FN iadst, IADST4, idct, IDCT4 +IADST4_FN iadst, IADST4, iadst, IADST4 + +INIT_MMX ssse3 +IADST4_FN idct, IDCT4, iadst, IADST4 +IADST4_FN iadst, IADST4, idct, IDCT4 +IADST4_FN iadst, IADST4, iadst, IADST4 + +; inputs and outputs are dwords, coefficients are words +; +; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14 +; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14 +%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask + pand m%3, m%1, %8 + pand m%4, m%2, %8 + psrad m%1, 14 + psrad m%2, 14 + packssdw m%4, m%2 + packssdw m%3, m%1 + punpckhwd m%2, m%4, m%3 + punpcklwd m%4, m%3 + pmaddwd m%3, m%4, [pw_%6_%5] + pmaddwd m%1, m%2, [pw_%6_%5] + pmaddwd m%4, [pw_m%5_%6] + pmaddwd m%2, [pw_m%5_%6] + paddd m%3, %7 + paddd m%4, %7 + psrad m%3, 14 + psrad m%4, 14 + paddd m%1, m%3 + paddd m%2, m%4 +%endmacro + +%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1 + SUMSUB_MUL %3, %5, %7, %8, 11585, 11585, %1, %2 + SUMSUB_MUL %4, %6, %7, %8, 15137, 6270, %1, %2 + SUMSUB_BA d, %4, %3, %7 + SUMSUB_BA d, %6, %5, %7 + SWAP %4, %6, %3 +%endmacro + +%macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max + movh m%1, [dstq+strideq*0] + movh m%2, [dstq+strideq*2] + movhps m%1, [dstq+strideq*1] + movhps m%2, [dstq+stride3q ] + paddw m%1, m%3 + paddw m%2, m%4 + pmaxsw m%1, %5 + pmaxsw m%2, %5 + pminsw m%1, %6 + pminsw m%2, %6 + movh [dstq+strideq*0], m%1 + movhps [dstq+strideq*1], m%1 + movh [dstq+strideq*2], m%2 + movhps [dstq+stride3q ], m%2 +%endmacro + +%macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift + paddd m%1, %7 + paddd m%2, %7 + paddd m%3, %7 + paddd m%4, %7 + psrad m%1, %8 + psrad m%2, %8 + psrad m%3, %8 + psrad m%4, %8 + packssdw m%1, m%2 + packssdw m%3, m%4 + STORE_4x4 %2, %4, %1, %3, %5, %6 +%endmacro + +INIT_XMM sse2 +cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob + cmp eobd, 1 + jg .idctfull + + ; dc-only - this is special, since for 4x4 12bpp, the max coef size is + ; 17+sign bpp. Since the multiply is with 11585, which is 14bpp, the + ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a + ; dword. After the final shift (4), the result is 13+sign bits, so we + ; don't need any additional processing to fit it in a word + DEFINE_ARGS dst, stride, block, coef + pxor m4, m4 + DC_ONLY 4, m4 + movd m0, coefd + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova m5, [pw_4095] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + STORE_4x4 1, 3, 0, 0, m4, m5 + RET + +.idctfull: + DEFINE_ARGS dst, stride, block, eob + mova m0, [blockq+0*16] + mova m1, [blockq+1*16] + mova m2, [blockq+2*16] + mova m3, [blockq+3*16] + mova m6, [pd_8192] + mova m7, [pd_3fff] + + IDCT4_12BPP_1D m6, m7 + TRANSPOSE4x4D 0, 1, 2, 3, 4 + IDCT4_12BPP_1D m6, m7 + + pxor m4, m4 + ZERO_BLOCK blockq, 16, 4, m4 + + ; writeout + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova m5, [pw_4095] + mova m6, [pd_8] + ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4 + RET + +%macro SCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%if %0 == 4 +%define reg_%4 m%2 +%endif +%else + mova [%3], m%1 +%if %0 == 4 +%define reg_%4 [%3] +%endif +%endif +%endmacro + +%macro UNSCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%if %0 == 4 +%undef reg_%4 +%endif +%endmacro + +%macro PRELOAD 2-3 +%if ARCH_X86_64 + mova m%1, [%2] +%if %0 == 3 +%define reg_%3 m%1 +%endif +%elif %0 == 3 +%define reg_%3 [%2] +%endif +%endmacro + +; out0 = 5283 * in0 + 13377 + in1 + 15212 * in2 + 9929 * in3 + rnd >> 14 +; out1 = 9929 * in0 + 13377 * in1 - 5283 * in2 - 15282 * in3 + rnd >> 14 +; out2 = 13377 * in0 - 13377 * in2 + 13377 * in3 + rnd >> 14 +; out3 = 15212 * in0 - 13377 * in1 + 9929 * in2 - 5283 * in3 + rnd >> 14 +%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask + pand m4, m0, %2 + pand m5, m1, %2 + psrad m0, 14 + psrad m1, 14 + packssdw m5, m1 + packssdw m4, m0 + punpckhwd m1, m4, m5 + punpcklwd m4, m5 + pand m5, m2, %2 + pand m6, m3, %2 + psrad m2, 14 + psrad m3, 14 + packssdw m6, m3 + packssdw m5, m2 + punpckhwd m3, m5, m6 + punpcklwd m5, m6 + SCRATCH 1, 8, rsp+0*mmsize, a + SCRATCH 5, 9, rsp+1*mmsize, b + + ; m1/3 have the high bits of 0,1,2,3 + ; m4/5 have the low bits of 0,1,2,3 + ; m0/2/6/7 are free + + mova m2, [pw_15212_9929] + mova m0, [pw_5283_13377] + pmaddwd m7, m2, reg_b + pmaddwd m6, m4, m0 + pmaddwd m2, m3 + pmaddwd m0, reg_a + paddd m6, m7 + paddd m0, m2 + mova m1, [pw_m13377_13377] + mova m5, [pw_13377_0] + pmaddwd m7, m1, reg_b + pmaddwd m2, m4, m5 + pmaddwd m1, m3 + pmaddwd m5, reg_a + paddd m2, m7 + paddd m1, m5 + paddd m6, %1 + paddd m2, %1 + psrad m6, 14 + psrad m2, 14 + paddd m0, m6 ; t0 + paddd m2, m1 ; t2 + + mova m7, [pw_m5283_m15212] + mova m5, [pw_9929_13377] + pmaddwd m1, m7, reg_b + pmaddwd m6, m4, m5 + pmaddwd m7, m3 + pmaddwd m5, reg_a + paddd m6, m1 + paddd m7, m5 + UNSCRATCH 5, 9, rsp+1*mmsize, b + pmaddwd m5, [pw_9929_m5283] + pmaddwd m4, [pw_15212_m13377] + pmaddwd m3, [pw_9929_m5283] + UNSCRATCH 1, 8, rsp+0*mmsize, a + pmaddwd m1, [pw_15212_m13377] + paddd m4, m5 + paddd m3, m1 + paddd m6, %1 + paddd m4, %1 + psrad m6, 14 + psrad m4, 14 + paddd m7, m6 ; t1 + paddd m3, m4 ; t3 + + SWAP 1, 7 +%endmacro + +%macro IADST4_12BPP_FN 4 +cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob + mova m0, [blockq+0*16] + mova m1, [blockq+1*16] + mova m2, [blockq+2*16] + mova m3, [blockq+3*16] + + PRELOAD 10, pd_8192, rnd + PRELOAD 11, pd_3fff, mask + %2_12BPP_1D reg_rnd, reg_mask + TRANSPOSE4x4D 0, 1, 2, 3, 4 + %4_12BPP_1D reg_rnd, reg_mask + + pxor m4, m4 + ZERO_BLOCK blockq, 16, 4, m4 + + ; writeout + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova m5, [pw_4095] + mova m6, [pd_8] + ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4 + RET +%endmacro + +INIT_XMM sse2 +IADST4_12BPP_FN idct, IDCT4, iadst, IADST4 +IADST4_12BPP_FN iadst, IADST4, idct, IDCT4 +IADST4_12BPP_FN iadst, IADST4, iadst, IADST4 + +; the following line has not been executed at the end of this macro: +; UNSCRATCH 6, 8, rsp+%3*mmsize +%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset + mova m0, [%1+0*%4] + mova m2, [%1+2*%4] + mova m4, [%1+4*%4] + mova m6, [%1+6*%4] + IDCT4_12BPP_1D %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3 + SCRATCH 4, 8, rsp+(%5+0)*mmsize + SCRATCH 6, 9, rsp+(%5+1)*mmsize + mova m1, [%1+1*%4] + mova m3, [%1+3*%4] + mova m5, [%1+5*%4] + mova m7, [%1+7*%4] + SUMSUB_MUL 1, 7, 4, 6, 16069, 3196, %2, %3 ; m1=t7a, m7=t4a + SUMSUB_MUL 5, 3, 4, 6, 9102, 13623, %2, %3 ; m5=t6a, m3=t5a + SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a + SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a + SUMSUB_MUL 1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5 + SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7 + SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6 + UNSCRATCH 4, 8, rsp+(%5+0)*mmsize + UNSCRATCH 6, 9, rsp+(%5+1)*mmsize + SCRATCH 2, 8, rsp+(%5+0)*mmsize + SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5 + SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4 + SWAP 0, 5, 4, 6, 2, 7 +%endmacro + +%macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max + mova m%1, [%6+%7*0] + mova m%2, [%6+%7*1] + paddw m%1, m%3 + paddw m%2, m%3 + pmaxsw m%1, %4 + pmaxsw m%2, %4 + pminsw m%1, %5 + pminsw m%2, %5 + mova [%6+%7*0], m%1 + mova [%6+%7*1], m%2 +%endmacro + +; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp +; storage also instead of allocating two more stack spaces. This doesn't +; matter much but it's something... +INIT_XMM sse2 +cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \ + 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_1023] + cmp eobd, 1 + jg .idctfull + + ; dc-only - the 10bit version can be done entirely in 32bit, since the max + ; coef values are 16+sign bit, and the coef is 14bit, so 30+sign easily + ; fits in 32bit + DEFINE_ARGS dst, stride, block, coef + pxor m2, m2 + DC_ONLY 5, m2 + movd m1, coefd + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 4 +.loop_dc: + STORE_2x8 3, 4, 1, m2, m0 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop_dc + RET + +.idctfull: + SCRATCH 0, 12, rsp+16*mmsize, max + DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak +%if ARCH_X86_64 + mov dstbakq, dstq + movsxd cntq, cntd +%endif +%ifdef PIC + lea ptrq, [default_8x8] + movzx cntd, byte [ptrq+cntq-1] +%else + movzx cntd, byte [default_8x8+cntq-1] +%endif + mov skipd, 2 + sub skipd, cntd + mov ptrq, rsp + PRELOAD 10, pd_8192, rnd + PRELOAD 11, pd_3fff, mask + PRELOAD 13, pd_16, srnd +.loop_1: + IDCT8_1D blockq, reg_rnd, reg_mask + + TRANSPOSE4x4D 0, 1, 2, 3, 6 + mova [ptrq+ 0*mmsize], m0 + mova [ptrq+ 2*mmsize], m1 + mova [ptrq+ 4*mmsize], m2 + mova [ptrq+ 6*mmsize], m3 + UNSCRATCH 6, 8, rsp+17*mmsize + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 1*mmsize], m4 + mova [ptrq+ 3*mmsize], m5 + mova [ptrq+ 5*mmsize], m6 + mova [ptrq+ 7*mmsize], m7 + add ptrq, 8 * mmsize + add blockq, mmsize + dec cntd + jg .loop_1 + + ; zero-pad the remainder (skipped cols) + test skipd, skipd + jz .end + add skipd, skipd + lea blockq, [blockq+skipq*(mmsize/2)] + pxor m0, m0 +.loop_z: + mova [ptrq+mmsize*0], m0 + mova [ptrq+mmsize*1], m0 + mova [ptrq+mmsize*2], m0 + mova [ptrq+mmsize*3], m0 + add ptrq, 4 * mmsize + dec skipd + jg .loop_z +.end: + + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea stride3q, [strideq*3] + mov cntd, 2 + mov ptrq, rsp +.loop_2: + IDCT8_1D ptrq, reg_rnd, reg_mask + + pxor m6, m6 + ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5 + lea dstq, [dstq+strideq*4] + UNSCRATCH 0, 8, rsp+17*mmsize + UNSCRATCH 1, 12, rsp+16*mmsize, max + UNSCRATCH 2, 13, pd_16, srnd + ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5 + add ptrq, 16 +%if ARCH_X86_64 + lea dstq, [dstbakq+8] +%else + mov dstq, dstm + add dstq, 8 +%endif + dec cntd + jg .loop_2 + + ; m6 is still zero + ZERO_BLOCK blockq-2*mmsize, 32, 8, m6 + RET + +%macro DC_ONLY_64BIT 2 ; shift, zero +%if ARCH_X86_64 + movsxd coefq, dword [blockq] + movd [blockq], %2 + imul coefq, 11585 + add coefq, 8192 + sar coefq, 14 + imul coefq, 11585 + add coefq, ((1 << (%1 - 1)) << 14) + 8192 + sar coefq, 14 + %1 +%else + mov coefd, dword [blockq] + movd [blockq], %2 + DEFINE_ARGS dst, stride, cnt, coef, coefl + mov cntd, 2 +.loop_dc_calc: + mov coefld, coefd + sar coefd, 14 + and coefld, 0x3fff + imul coefd, 11585 + imul coefld, 11585 + add coefld, 8192 + sar coefld, 14 + add coefd, coefld + dec cntd + jg .loop_dc_calc + add coefd, 1 << (%1 - 1) + sar coefd, %1 +%endif +%endmacro + +INIT_XMM sse2 +cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \ + 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_4095] + cmp eobd, 1 + jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull + + ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign + ; bpp, and 18+14+sign does not fit in 32bit, so we do 2-stage multiplies + DEFINE_ARGS dst, stride, block, coef, coefl + pxor m2, m2 + DC_ONLY_64BIT 5, m2 + movd m1, coefd + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 4 +.loop_dc: + STORE_2x8 3, 4, 1, m2, m0 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop_dc + RET + +; inputs and outputs are dwords, coefficients are words +; +; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2 +; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1 +%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask + pand m%3, m%1, %7 + pand m%4, m%2, %7 + psrad m%1, 14 + psrad m%2, 14 + packssdw m%4, m%2 + packssdw m%3, m%1 + punpckhwd m%2, m%4, m%3 + punpcklwd m%4, m%3 + pmaddwd m%3, m%4, [pw_%6_%5] + pmaddwd m%1, m%2, [pw_%6_%5] + pmaddwd m%4, [pw_m%5_%6] + pmaddwd m%2, [pw_m%5_%6] +%endmacro + +; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14 +; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14 +%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd + SUMSUB_BA d, %1, %2, %5 + SUMSUB_BA d, %3, %4, %5 + paddd m%3, %6 + paddd m%4, %6 + psrad m%3, 14 + psrad m%4, 14 + paddd m%1, m%3 + paddd m%2, m%4 +%endmacro + +%macro NEGD 1 +%if cpuflag(ssse3) + psignd %1, [pw_m1] +%else + pxor %1, [pw_m1] + paddd %1, [pd_1] +%endif +%endmacro + +; the following line has not been executed at the end of this macro: +; UNSCRATCH 6, 8, rsp+17*mmsize +%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask + mova m0, [%1+ 0*mmsize] + mova m3, [%1+ 6*mmsize] + mova m4, [%1+ 8*mmsize] + mova m7, [%1+14*mmsize] + SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606, %3 ; m7/1=t0a, m0/2=t1a + SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a + SCRATCH 0, 8, rsp+17*mmsize + SUMSUB_PACK_D 3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4 + UNSCRATCH 0, 8, rsp+17*mmsize + SUMSUB_PACK_D 4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5 + + SCRATCH 3, 8, rsp+17*mmsize + SCRATCH 4, 9, rsp+18*mmsize + SCRATCH 7, 10, rsp+19*mmsize + SCRATCH 0, 11, rsp+20*mmsize + + mova m1, [%1+ 2*mmsize] + mova m2, [%1+ 4*mmsize] + mova m5, [%1+10*mmsize] + mova m6, [%1+12*mmsize] + SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723, %3 ; m5/8=t2a, m2/9=t3a + SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679, %3 ; m1/10=t6a, m6/11=t7a + SCRATCH 2, 12, rsp+21*mmsize + SUMSUB_PACK_D 1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6 + UNSCRATCH 2, 12, rsp+21*mmsize + SUMSUB_PACK_D 6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7 + + UNSCRATCH 7, 10, rsp+19*mmsize + UNSCRATCH 0, 11, rsp+20*mmsize + SCRATCH 1, 10, rsp+19*mmsize + SCRATCH 6, 11, rsp+20*mmsize + + SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270, %3 ; m7/8=t4a, m0/9=t5a + SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137, %3 ; m2/10=t7a, m5/11=t6a + SCRATCH 2, 12, rsp+21*mmsize + SUMSUB_PACK_D 5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6 + UNSCRATCH 2, 12, rsp+21*mmsize + NEGD m5 ; m5=out1 + SUMSUB_PACK_D 2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7 + SUMSUB_MUL 7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5 + NEGD m0 ; m0=out5 + + UNSCRATCH 3, 8, rsp+17*mmsize + UNSCRATCH 4, 9, rsp+18*mmsize + UNSCRATCH 1, 10, rsp+19*mmsize + UNSCRATCH 6, 11, rsp+20*mmsize + SCRATCH 2, 8, rsp+17*mmsize + SCRATCH 0, 9, rsp+18*mmsize + + SUMSUB_BA d, 1, 3, 2 ; m1=out0, m3=t2 + SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3 + NEGD m6 ; m6=out7 + SUMSUB_MUL 3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4 + NEGD m3 ; m3=out3 + + UNSCRATCH 0, 9, rsp+18*mmsize + + SWAP 0, 1, 5 + SWAP 2, 7, 6 +%endmacro + +%macro IADST8_FN 5 +cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \ + 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_1023] + +.body: + SCRATCH 0, 13, rsp+16*mmsize, max + DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak +%if ARCH_X86_64 + mov dstbakq, dstq + movsxd cntq, cntd +%endif +%ifdef PIC + lea ptrq, [%5_8x8] + movzx cntd, byte [ptrq+cntq-1] +%else + movzx cntd, byte [%5_8x8+cntq-1] +%endif + mov skipd, 2 + sub skipd, cntd + mov ptrq, rsp + PRELOAD 14, pd_8192, rnd + PRELOAD 15, pd_3fff, mask +.loop_1: + %2_1D blockq, reg_rnd, reg_mask + + TRANSPOSE4x4D 0, 1, 2, 3, 6 + mova [ptrq+ 0*mmsize], m0 + mova [ptrq+ 2*mmsize], m1 + mova [ptrq+ 4*mmsize], m2 + mova [ptrq+ 6*mmsize], m3 + UNSCRATCH 6, 8, rsp+17*mmsize + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 1*mmsize], m4 + mova [ptrq+ 3*mmsize], m5 + mova [ptrq+ 5*mmsize], m6 + mova [ptrq+ 7*mmsize], m7 + add ptrq, 8 * mmsize + add blockq, mmsize + dec cntd + jg .loop_1 + + ; zero-pad the remainder (skipped cols) + test skipd, skipd + jz .end + add skipd, skipd + lea blockq, [blockq+skipq*(mmsize/2)] + pxor m0, m0 +.loop_z: + mova [ptrq+mmsize*0], m0 + mova [ptrq+mmsize*1], m0 + mova [ptrq+mmsize*2], m0 + mova [ptrq+mmsize*3], m0 + add ptrq, 4 * mmsize + dec skipd + jg .loop_z +.end: + + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea stride3q, [strideq*3] + mov cntd, 2 + mov ptrq, rsp +.loop_2: + %4_1D ptrq, reg_rnd, reg_mask + + pxor m6, m6 + PRELOAD 9, pd_16, srnd + ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5 + lea dstq, [dstq+strideq*4] + UNSCRATCH 0, 8, rsp+17*mmsize + UNSCRATCH 1, 13, rsp+16*mmsize, max + UNSCRATCH 2, 9, pd_16, srnd + ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5 + add ptrq, 16 +%if ARCH_X86_64 + lea dstq, [dstbakq+8] +%else + mov dstq, dstm + add dstq, 8 +%endif + dec cntd + jg .loop_2 + + ; m6 is still zero + ZERO_BLOCK blockq-2*mmsize, 32, 8, m6 + RET + +cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \ + 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body +%endmacro + +INIT_XMM sse2 +IADST8_FN idct, IDCT8, iadst, IADST8, row +IADST8_FN iadst, IADST8, idct, IDCT8, col +IADST8_FN iadst, IADST8, iadst, IADST8, default + +%macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset + IDCT8_1D %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7 + ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6 + SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a + SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a + SCRATCH 2, 13, rsp+(%4+5)*mmsize ; t2a + SCRATCH 3, 12, rsp+(%4+4)*mmsize ; t3a + SCRATCH 4, 11, rsp+(%4+3)*mmsize ; t4 + mova [rsp+(%3+0)*mmsize], m5 ; t5 + mova [rsp+(%3+1)*mmsize], m7 ; t7 + + mova m0, [%1+ 1*%2] ; in1 + mova m3, [%1+ 7*%2] ; in7 + mova m4, [%1+ 9*%2] ; in9 + mova m7, [%1+15*%2] ; in15 + + SUMSUB_MUL 0, 7, 1, 2, 16305, 1606 ; m0=t15a, m7=t8a + SUMSUB_MUL 4, 3, 1, 2, 10394, 12665 ; m4=t14a, m3=t9a + SUMSUB_BA d, 3, 7, 1 ; m3=t8, m7=t9 + SUMSUB_BA d, 4, 0, 1 ; m4=t15,m0=t14 + SUMSUB_MUL 0, 7, 1, 2, 15137, 6270 ; m0=t14a, m7=t9a + + mova m1, [%1+ 3*%2] ; in3 + mova m2, [%1+ 5*%2] ; in5 + mova m5, [%1+11*%2] ; in11 + mova m6, [%1+13*%2] ; in13 + + SCRATCH 0, 9, rsp+(%4+1)*mmsize + SCRATCH 7, 10, rsp+(%4+2)*mmsize + + SUMSUB_MUL 2, 5, 0, 7, 14449, 7723 ; m2=t13a, m5=t10a + SUMSUB_MUL 6, 1, 0, 7, 4756, 15679 ; m6=t12a, m1=t11a + SUMSUB_BA d, 5, 1, 0 ; m5=t11,m1=t10 + SUMSUB_BA d, 2, 6, 0 ; m2=t12,m6=t13 + NEGD m1 ; m1=-t10 + SUMSUB_MUL 1, 6, 0, 7, 15137, 6270 ; m1=t13a, m6=t10a + + UNSCRATCH 7, 10, rsp+(%4+2)*mmsize + SUMSUB_BA d, 5, 3, 0 ; m5=t8a, m3=t11a + SUMSUB_BA d, 6, 7, 0 ; m6=t9, m7=t10 + SUMSUB_BA d, 2, 4, 0 ; m2=t15a,m4=t12a + SCRATCH 5, 10, rsp+(%4+2)*mmsize + SUMSUB_MUL 4, 3, 0, 5, 11585, 11585 ; m4=t12, m3=t11 + UNSCRATCH 0, 9, rsp+(%4+1)*mmsize + SUMSUB_BA d, 1, 0, 5 ; m1=t14, m0=t13 + SCRATCH 6, 9, rsp+(%4+1)*mmsize + SUMSUB_MUL 0, 7, 6, 5, 11585, 11585 ; m0=t13a,m7=t10a + + ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2 + ; free: 6,5 + + UNSCRATCH 5, 15, rsp+(%4+7)*mmsize + SUMSUB_BA d, 2, 5, 6 ; m2=out0, m5=out15 + SCRATCH 5, 15, rsp+(%4+7)*mmsize + UNSCRATCH 5, 14, rsp+(%4+6)*mmsize + SUMSUB_BA d, 1, 5, 6 ; m1=out1, m5=out14 + SCRATCH 5, 14, rsp+(%4+6)*mmsize + UNSCRATCH 5, 13, rsp+(%4+5)*mmsize + SUMSUB_BA d, 0, 5, 6 ; m0=out2, m5=out13 + SCRATCH 5, 13, rsp+(%4+5)*mmsize + UNSCRATCH 5, 12, rsp+(%4+4)*mmsize + SUMSUB_BA d, 4, 5, 6 ; m4=out3, m5=out12 + SCRATCH 5, 12, rsp+(%4+4)*mmsize + UNSCRATCH 5, 11, rsp+(%4+3)*mmsize + SUMSUB_BA d, 3, 5, 6 ; m3=out4, m5=out11 + SCRATCH 4, 11, rsp+(%4+3)*mmsize + mova m4, [rsp+(%3+0)*mmsize] + SUMSUB_BA d, 7, 4, 6 ; m7=out5, m4=out10 + mova [rsp+(%3+0)*mmsize], m5 + UNSCRATCH 5, 8, rsp+(%4+0)*mmsize + UNSCRATCH 6, 9, rsp+(%4+1)*mmsize + SCRATCH 2, 8, rsp+(%4+0)*mmsize + SCRATCH 1, 9, rsp+(%4+1)*mmsize + UNSCRATCH 1, 10, rsp+(%4+2)*mmsize + SCRATCH 0, 10, rsp+(%4+2)*mmsize + mova m0, [rsp+(%3+1)*mmsize] + SUMSUB_BA d, 6, 5, 2 ; m6=out6, m5=out9 + SUMSUB_BA d, 1, 0, 2 ; m1=out7, m0=out8 + + SWAP 0, 3, 1, 7, 2, 6, 4 + + ; output order: 8-11|r67-70=out0-3 + ; 0-6,r65=out4-11 + ; 12-15|r71-74=out12-15 +%endmacro + +INIT_XMM sse2 +cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \ + 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_1023] + cmp eobd, 1 + jg .idctfull + + ; dc-only - the 10bit version can be done entirely in 32bit, since the max + ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily + ; fits in 32bit + DEFINE_ARGS dst, stride, block, coef + pxor m2, m2 + DC_ONLY 6, m2 + movd m1, coefd + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 +.loop_dc: + STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize + STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop_dc + RET + +.idctfull: + mova [rsp+64*mmsize], m0 + DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak +%if ARCH_X86_64 + mov dstbakq, dstq + movsxd cntq, cntd +%endif +%ifdef PIC + lea ptrq, [default_16x16] + movzx cntd, byte [ptrq+cntq-1] +%else + movzx cntd, byte [default_16x16+cntq-1] +%endif + mov skipd, 4 + sub skipd, cntd + mov ptrq, rsp +.loop_1: + IDCT16_1D blockq + + TRANSPOSE4x4D 0, 1, 2, 3, 7 + mova [ptrq+ 1*mmsize], m0 + mova [ptrq+ 5*mmsize], m1 + mova [ptrq+ 9*mmsize], m2 + mova [ptrq+13*mmsize], m3 + mova m7, [rsp+65*mmsize] + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 2*mmsize], m4 + mova [ptrq+ 6*mmsize], m5 + mova [ptrq+10*mmsize], m6 + mova [ptrq+14*mmsize], m7 + UNSCRATCH 0, 8, rsp+67*mmsize + UNSCRATCH 1, 9, rsp+68*mmsize + UNSCRATCH 2, 10, rsp+69*mmsize + UNSCRATCH 3, 11, rsp+70*mmsize + TRANSPOSE4x4D 0, 1, 2, 3, 7 + mova [ptrq+ 0*mmsize], m0 + mova [ptrq+ 4*mmsize], m1 + mova [ptrq+ 8*mmsize], m2 + mova [ptrq+12*mmsize], m3 + UNSCRATCH 4, 12, rsp+71*mmsize + UNSCRATCH 5, 13, rsp+72*mmsize + UNSCRATCH 6, 14, rsp+73*mmsize + UNSCRATCH 7, 15, rsp+74*mmsize + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 3*mmsize], m4 + mova [ptrq+ 7*mmsize], m5 + mova [ptrq+11*mmsize], m6 + mova [ptrq+15*mmsize], m7 + add ptrq, 16 * mmsize + add blockq, mmsize + dec cntd + jg .loop_1 + + ; zero-pad the remainder (skipped cols) + test skipd, skipd + jz .end + add skipd, skipd + lea blockq, [blockq+skipq*(mmsize/2)] + pxor m0, m0 +.loop_z: + mova [ptrq+mmsize*0], m0 + mova [ptrq+mmsize*1], m0 + mova [ptrq+mmsize*2], m0 + mova [ptrq+mmsize*3], m0 + mova [ptrq+mmsize*4], m0 + mova [ptrq+mmsize*5], m0 + mova [ptrq+mmsize*6], m0 + mova [ptrq+mmsize*7], m0 + add ptrq, 8 * mmsize + dec skipd + jg .loop_z +.end: + + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea stride3q, [strideq*3] + mov cntd, 4 + mov ptrq, rsp +.loop_2: + IDCT16_1D ptrq + + pxor m7, m7 + lea dstq, [dstq+strideq*4] + ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6 + lea dstq, [dstq+strideq*4] + mova m0, [rsp+65*mmsize] + mova m1, [rsp+64*mmsize] + mova m2, [pd_32] + ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 + +%if ARCH_X86_64 + DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst +%else + mov dstq, dstm +%endif + UNSCRATCH 0, 8, rsp+67*mmsize + UNSCRATCH 4, 9, rsp+68*mmsize + UNSCRATCH 5, 10, rsp+69*mmsize + UNSCRATCH 3, 11, rsp+70*mmsize + ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6 +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea dstq, [dstbakq+stride3q*4] +%else + lea dstq, [dstq+stride3q*4] +%endif + UNSCRATCH 4, 12, rsp+71*mmsize + UNSCRATCH 5, 13, rsp+72*mmsize + UNSCRATCH 6, 14, rsp+73*mmsize + UNSCRATCH 0, 15, rsp+74*mmsize + ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 + + add ptrq, mmsize +%if ARCH_X86_64 + add dstbakq, 8 + mov dstq, dstbakq +%else + add dword dstm, 8 + mov dstq, dstm +%endif + dec cntd + jg .loop_2 + + ; m7 is still zero + ZERO_BLOCK blockq-4*mmsize, 64, 16, m7 + RET + +INIT_XMM sse2 +cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \ + 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_4095] + cmp eobd, 1 + jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull + + ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign + ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies + DEFINE_ARGS dst, stride, block, coef, coefl + pxor m2, m2 + DC_ONLY_64BIT 6, m2 + movd m1, coefd + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 +.loop_dc: + STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize + STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop_dc + RET + +; r65-69 are available for spills +; r70-77 are available on x86-32 only (x86-64 should use m8-15) +; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77 +%macro IADST16_1D 1 ; src + mova m0, [%1+ 0*4*mmsize] ; in0 + mova m1, [%1+ 7*4*mmsize] ; in7 + mova m2, [%1+ 8*4*mmsize] ; in8 + mova m3, [%1+15*4*mmsize] ; in15 + SUMSUB_MUL_D 3, 0, 4, 5, 16364, 804 ; m3/4=t0, m0/5=t1 + SUMSUB_MUL_D 1, 2, 6, 7, 11003, 12140 ; m1/6=t8, m2/7=t9 + SCRATCH 0, 8, rsp+70*mmsize + SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t0a, m3=t8a + UNSCRATCH 0, 8, rsp+70*mmsize + SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t1a, m0=t9a + mova [rsp+67*mmsize], m1 + SCRATCH 2, 9, rsp+71*mmsize + SCRATCH 3, 12, rsp+74*mmsize + SCRATCH 0, 13, rsp+75*mmsize + + mova m0, [%1+ 3*4*mmsize] ; in3 + mova m1, [%1+ 4*4*mmsize] ; in4 + mova m2, [%1+11*4*mmsize] ; in11 + mova m3, [%1+12*4*mmsize] ; in12 + SUMSUB_MUL_D 2, 1, 4, 5, 14811, 7005 ; m2/4=t4, m1/5=t5 + SUMSUB_MUL_D 0, 3, 6, 7, 5520, 15426 ; m0/6=t12, m3/7=t13 + SCRATCH 1, 10, rsp+72*mmsize + SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t4a, m2=t12a + UNSCRATCH 1, 10, rsp+72*mmsize + SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t5a, m1=t13a + SCRATCH 0, 15, rsp+77*mmsize + SCRATCH 3, 11, rsp+73*mmsize + + UNSCRATCH 0, 12, rsp+74*mmsize ; t8a + UNSCRATCH 3, 13, rsp+75*mmsize ; t9a + SUMSUB_MUL_D 0, 3, 4, 5, 16069, 3196 ; m0/4=t8, m3/5=t9 + SUMSUB_MUL_D 1, 2, 6, 7, 3196, 16069 ; m1/6=t13, m2/7=t12 + SCRATCH 1, 12, rsp+74*mmsize + SUMSUB_PACK_D 2, 0, 7, 4, 1 ; m2=t8a, m0=t12a + UNSCRATCH 1, 12, rsp+74*mmsize + SUMSUB_PACK_D 1, 3, 6, 5, 4 ; m1=t9a, m3=t13a + mova [rsp+65*mmsize], m2 + mova [rsp+66*mmsize], m1 + SCRATCH 0, 8, rsp+70*mmsize + SCRATCH 3, 12, rsp+74*mmsize + + mova m0, [%1+ 2*4*mmsize] ; in2 + mova m1, [%1+ 5*4*mmsize] ; in5 + mova m2, [%1+10*4*mmsize] ; in10 + mova m3, [%1+13*4*mmsize] ; in13 + SUMSUB_MUL_D 3, 0, 4, 5, 15893, 3981 ; m3/4=t2, m0/5=t3 + SUMSUB_MUL_D 1, 2, 6, 7, 8423, 14053 ; m1/6=t10, m2/7=t11 + SCRATCH 0, 10, rsp+72*mmsize + SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t2a, m3=t10a + UNSCRATCH 0, 10, rsp+72*mmsize + SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t3a, m0=t11a + mova [rsp+68*mmsize], m1 + mova [rsp+69*mmsize], m2 + SCRATCH 3, 13, rsp+75*mmsize + SCRATCH 0, 14, rsp+76*mmsize + + mova m0, [%1+ 1*4*mmsize] ; in1 + mova m1, [%1+ 6*4*mmsize] ; in6 + mova m2, [%1+ 9*4*mmsize] ; in9 + mova m3, [%1+14*4*mmsize] ; in14 + SUMSUB_MUL_D 2, 1, 4, 5, 13160, 9760 ; m2/4=t6, m1/5=t7 + SUMSUB_MUL_D 0, 3, 6, 7, 2404, 16207 ; m0/6=t14, m3/7=t15 + SCRATCH 1, 10, rsp+72*mmsize + SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t6a, m2=t14a + UNSCRATCH 1, 10, rsp+72*mmsize + SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t7a, m1=t15a + + UNSCRATCH 4, 13, rsp+75*mmsize ; t10a + UNSCRATCH 5, 14, rsp+76*mmsize ; t11a + SCRATCH 0, 13, rsp+75*mmsize + SCRATCH 3, 14, rsp+76*mmsize + SUMSUB_MUL_D 4, 5, 6, 7, 9102, 13623 ; m4/6=t10, m5/7=t11 + SUMSUB_MUL_D 1, 2, 0, 3, 13623, 9102 ; m1/0=t15, m2/3=t14 + SCRATCH 0, 10, rsp+72*mmsize + SUMSUB_PACK_D 2, 4, 3, 6, 0 ; m2=t10a, m4=t14a + UNSCRATCH 0, 10, rsp+72*mmsize + SUMSUB_PACK_D 1, 5, 0, 7, 6 ; m1=t11a, m5=t15a + + UNSCRATCH 0, 8, rsp+70*mmsize ; t12a + UNSCRATCH 3, 12, rsp+74*mmsize ; t13a + SCRATCH 2, 8, rsp+70*mmsize + SCRATCH 1, 12, rsp+74*mmsize + SUMSUB_MUL_D 0, 3, 1, 2, 15137, 6270 ; m0/1=t12, m3/2=t13 + SUMSUB_MUL_D 5, 4, 7, 6, 6270, 15137 ; m5/7=t15, m4/6=t14 + SCRATCH 2, 10, rsp+72*mmsize + SUMSUB_PACK_D 4, 0, 6, 1, 2 ; m4=out2, m0=t14a + UNSCRATCH 2, 10, rsp+72*mmsize + SUMSUB_PACK_D 5, 3, 7, 2, 1 ; m5=-out13, m3=t15a + NEGD m5 ; m5=out13 + + UNSCRATCH 1, 9, rsp+71*mmsize ; t1a + mova m2, [rsp+68*mmsize] ; t2a + UNSCRATCH 6, 13, rsp+75*mmsize ; t6a + UNSCRATCH 7, 14, rsp+76*mmsize ; t7a + SCRATCH 4, 10, rsp+72*mmsize + SCRATCH 5, 13, rsp+75*mmsize + UNSCRATCH 4, 15, rsp+77*mmsize ; t4a + UNSCRATCH 5, 11, rsp+73*mmsize ; t5a + SCRATCH 0, 14, rsp+76*mmsize + SCRATCH 3, 15, rsp+77*mmsize + mova m0, [rsp+67*mmsize] ; t0a + SUMSUB_BA d, 4, 0, 3 ; m4=t0, m0=t4 + SUMSUB_BA d, 5, 1, 3 ; m5=t1, m1=t5 + SUMSUB_BA d, 6, 2, 3 ; m6=t2, m2=t6 + SCRATCH 4, 9, rsp+71*mmsize + mova m3, [rsp+69*mmsize] ; t3a + SUMSUB_BA d, 7, 3, 4 ; m7=t3, m3=t7 + + mova [rsp+67*mmsize], m5 + mova [rsp+68*mmsize], m6 + mova [rsp+69*mmsize], m7 + SUMSUB_MUL_D 0, 1, 4, 5, 15137, 6270 ; m0/4=t4a, m1/5=t5a + SUMSUB_MUL_D 3, 2, 7, 6, 6270, 15137 ; m3/7=t7a, m2/6=t6a + SCRATCH 1, 11, rsp+73*mmsize + SUMSUB_PACK_D 2, 0, 6, 4, 1 ; m2=-out3, m0=t6 + NEGD m2 ; m2=out3 + UNSCRATCH 1, 11, rsp+73*mmsize + SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=out12, m1=t7 + SCRATCH 2, 11, rsp+73*mmsize + UNSCRATCH 2, 12, rsp+74*mmsize ; t11a + SCRATCH 3, 12, rsp+74*mmsize + + UNSCRATCH 3, 8, rsp+70*mmsize ; t10a + mova m4, [rsp+65*mmsize] ; t8a + mova m5, [rsp+66*mmsize] ; t9a + SUMSUB_BA d, 3, 4, 6 ; m3=-out1, m4=t10 + NEGD m3 ; m3=out1 + SUMSUB_BA d, 2, 5, 6 ; m2=out14, m5=t11 + UNSCRATCH 6, 9, rsp+71*mmsize ; t0 + UNSCRATCH 7, 14, rsp+76*mmsize ; t14a + SCRATCH 3, 9, rsp+71*mmsize + SCRATCH 2, 14, rsp+76*mmsize + + SUMSUB_MUL 1, 0, 2, 3, 11585, 11585 ; m1=out4, m0=out11 + mova [rsp+65*mmsize], m0 + SUMSUB_MUL 5, 4, 2, 3, 11585, 11585 ; m5=out6, m4=out9 + UNSCRATCH 0, 15, rsp+77*mmsize ; t15a + SUMSUB_MUL 7, 0, 2, 3, 11585, m11585 ; m7=out10, m0=out5 + + mova m2, [rsp+68*mmsize] ; t2 + SUMSUB_BA d, 2, 6, 3 ; m2=out0, m6=t2a + SCRATCH 2, 8, rsp+70*mmsize + mova m2, [rsp+67*mmsize] ; t1 + mova m3, [rsp+69*mmsize] ; t3 + mova [rsp+67*mmsize], m7 + SUMSUB_BA d, 3, 2, 7 ; m3=-out15, m2=t3a + NEGD m3 ; m3=out15 + SCRATCH 3, 15, rsp+77*mmsize + SUMSUB_MUL 6, 2, 7, 3, 11585, m11585 ; m6=out8, m2=out7 + mova m7, [rsp+67*mmsize] + + SWAP 0, 1 + SWAP 2, 5, 4, 6, 7, 3 +%endmacro + +%macro IADST16_FN 7 +cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \ + 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_1023] + +.body: + mova [rsp+64*mmsize], m0 + DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak +%if ARCH_X86_64 + mov dstbakq, dstq + movsxd cntq, cntd +%endif +%ifdef PIC + lea ptrq, [%7_16x16] + movzx cntd, byte [ptrq+cntq-1] +%else + movzx cntd, byte [%7_16x16+cntq-1] +%endif + mov skipd, 4 + sub skipd, cntd + mov ptrq, rsp +.loop_1: + %2_1D blockq + + TRANSPOSE4x4D 0, 1, 2, 3, 7 + mova [ptrq+ 1*mmsize], m0 + mova [ptrq+ 5*mmsize], m1 + mova [ptrq+ 9*mmsize], m2 + mova [ptrq+13*mmsize], m3 + mova m7, [rsp+65*mmsize] + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 2*mmsize], m4 + mova [ptrq+ 6*mmsize], m5 + mova [ptrq+10*mmsize], m6 + mova [ptrq+14*mmsize], m7 + UNSCRATCH 0, 8, rsp+(%3+0)*mmsize + UNSCRATCH 1, 9, rsp+(%3+1)*mmsize + UNSCRATCH 2, 10, rsp+(%3+2)*mmsize + UNSCRATCH 3, 11, rsp+(%3+3)*mmsize + TRANSPOSE4x4D 0, 1, 2, 3, 7 + mova [ptrq+ 0*mmsize], m0 + mova [ptrq+ 4*mmsize], m1 + mova [ptrq+ 8*mmsize], m2 + mova [ptrq+12*mmsize], m3 + UNSCRATCH 4, 12, rsp+(%3+4)*mmsize + UNSCRATCH 5, 13, rsp+(%3+5)*mmsize + UNSCRATCH 6, 14, rsp+(%3+6)*mmsize + UNSCRATCH 7, 15, rsp+(%3+7)*mmsize + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 3*mmsize], m4 + mova [ptrq+ 7*mmsize], m5 + mova [ptrq+11*mmsize], m6 + mova [ptrq+15*mmsize], m7 + add ptrq, 16 * mmsize + add blockq, mmsize + dec cntd + jg .loop_1 + + ; zero-pad the remainder (skipped cols) + test skipd, skipd + jz .end + add skipd, skipd + lea blockq, [blockq+skipq*(mmsize/2)] + pxor m0, m0 +.loop_z: + mova [ptrq+mmsize*0], m0 + mova [ptrq+mmsize*1], m0 + mova [ptrq+mmsize*2], m0 + mova [ptrq+mmsize*3], m0 + mova [ptrq+mmsize*4], m0 + mova [ptrq+mmsize*5], m0 + mova [ptrq+mmsize*6], m0 + mova [ptrq+mmsize*7], m0 + add ptrq, 8 * mmsize + dec skipd + jg .loop_z +.end: + + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea stride3q, [strideq*3] + mov cntd, 4 + mov ptrq, rsp +.loop_2: + %5_1D ptrq + + pxor m7, m7 + lea dstq, [dstq+strideq*4] + ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6 + lea dstq, [dstq+strideq*4] + mova m0, [rsp+65*mmsize] + mova m1, [rsp+64*mmsize] + mova m2, [pd_32] + ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 + +%if ARCH_X86_64 + DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst +%else + mov dstq, dstm +%endif + UNSCRATCH 0, 8, rsp+(%6+0)*mmsize + UNSCRATCH 4, 9, rsp+(%6+1)*mmsize + UNSCRATCH 5, 10, rsp+(%6+2)*mmsize + UNSCRATCH 3, 11, rsp+(%6+3)*mmsize + ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6 +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea dstq, [dstbakq+stride3q*4] +%else + lea dstq, [dstq+stride3q*4] +%endif + UNSCRATCH 4, 12, rsp+(%6+4)*mmsize + UNSCRATCH 5, 13, rsp+(%6+5)*mmsize + UNSCRATCH 6, 14, rsp+(%6+6)*mmsize + UNSCRATCH 0, 15, rsp+(%6+7)*mmsize + ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 + + add ptrq, mmsize +%if ARCH_X86_64 + add dstbakq, 8 + mov dstq, dstbakq +%else + add dword dstm, 8 + mov dstq, dstm +%endif + dec cntd + jg .loop_2 + + ; m7 is still zero + ZERO_BLOCK blockq-4*mmsize, 64, 16, m7 + RET + +cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \ + 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body +%endmacro + +INIT_XMM sse2 +IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70, row +IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67, col +IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default + +%macro IDCT32_1D 2-3 8 * mmsize; pass[1/2], src, src_stride + IDCT16_1D %2, 2 * %3, 272, 257 +%if ARCH_X86_64 + mova [rsp+257*mmsize], m8 + mova [rsp+258*mmsize], m9 + mova [rsp+259*mmsize], m10 + mova [rsp+260*mmsize], m11 + mova [rsp+261*mmsize], m12 + mova [rsp+262*mmsize], m13 + mova [rsp+263*mmsize], m14 + mova [rsp+264*mmsize], m15 +%endif + mova [rsp+265*mmsize], m0 + mova [rsp+266*mmsize], m1 + mova [rsp+267*mmsize], m2 + mova [rsp+268*mmsize], m3 + mova [rsp+269*mmsize], m4 + mova [rsp+270*mmsize], m5 + mova [rsp+271*mmsize], m6 + + ; r257-260: t0-3 + ; r265-272: t4/5a/6a/7/8/9a/10/11a + ; r261-264: t12a/13/14a/15 + ; r273-274 is free as scratch space, and 275-282 mirrors m8-15 on 32bit + + mova m0, [%2+ 1*%3] ; in1 + mova m1, [%2+15*%3] ; in15 + mova m2, [%2+17*%3] ; in17 + mova m3, [%2+31*%3] ; in31 + SUMSUB_MUL 0, 3, 4, 5, 16364, 804 ; m0=t31a, m3=t16a + SUMSUB_MUL 2, 1, 4, 5, 11003, 12140 ; m2=t30a, m1=t17a + SUMSUB_BA d, 1, 3, 4 ; m1=t16, m3=t17 + SUMSUB_BA d, 2, 0, 4 ; m2=t31, m0=t30 + SUMSUB_MUL 0, 3, 4, 5, 16069, 3196 ; m0=t30a, m3=t17a + SCRATCH 0, 8, rsp+275*mmsize + SCRATCH 2, 9, rsp+276*mmsize + + ; end of stage 1-3 first quart + + mova m0, [%2+ 7*%3] ; in7 + mova m2, [%2+ 9*%3] ; in9 + mova m4, [%2+23*%3] ; in23 + mova m5, [%2+25*%3] ; in25 + SUMSUB_MUL 2, 4, 6, 7, 14811, 7005 ; m2=t29a, m4=t18a + SUMSUB_MUL 5, 0, 6, 7, 5520, 15426 ; m5=t28a, m0=t19a + SUMSUB_BA d, 4, 0, 6 ; m4=t19, m0=t18 + SUMSUB_BA d, 2, 5, 6 ; m2=t28, m5=t29 + SUMSUB_MUL 5, 0, 6, 7, 3196, m16069 ; m5=t29a, m0=t18a + + ; end of stage 1-3 second quart + + SUMSUB_BA d, 4, 1, 6 ; m4=t16a, m1=t19a + SUMSUB_BA d, 0, 3, 6 ; m0=t17, m3=t18 + UNSCRATCH 6, 8, rsp+275*mmsize ; t30a + UNSCRATCH 7, 9, rsp+276*mmsize ; t31 + mova [rsp+273*mmsize], m4 + mova [rsp+274*mmsize], m0 + SUMSUB_BA d, 2, 7, 0 ; m2=t31a, m7=t28a + SUMSUB_BA d, 5, 6, 0 ; m5=t30, m6=t29 + SUMSUB_MUL 6, 3, 0, 4, 15137, 6270 ; m6=t29a, m3=t18a + SUMSUB_MUL 7, 1, 0, 4, 15137, 6270 ; m7=t28, m1=t19 + SCRATCH 3, 10, rsp+277*mmsize + SCRATCH 1, 11, rsp+278*mmsize + SCRATCH 7, 12, rsp+279*mmsize + SCRATCH 6, 13, rsp+280*mmsize + SCRATCH 5, 14, rsp+281*mmsize + SCRATCH 2, 15, rsp+282*mmsize + + ; end of stage 4-5 first half + + mova m0, [%2+ 5*%3] ; in5 + mova m1, [%2+11*%3] ; in11 + mova m2, [%2+21*%3] ; in21 + mova m3, [%2+27*%3] ; in27 + SUMSUB_MUL 0, 3, 4, 5, 15893, 3981 ; m0=t27a, m3=t20a + SUMSUB_MUL 2, 1, 4, 5, 8423, 14053 ; m2=t26a, m1=t21a + SUMSUB_BA d, 1, 3, 4 ; m1=t20, m3=t21 + SUMSUB_BA d, 2, 0, 4 ; m2=t27, m0=t26 + SUMSUB_MUL 0, 3, 4, 5, 9102, 13623 ; m0=t26a, m3=t21a + SCRATCH 0, 8, rsp+275*mmsize + SCRATCH 2, 9, rsp+276*mmsize + + ; end of stage 1-3 third quart + + mova m0, [%2+ 3*%3] ; in3 + mova m2, [%2+13*%3] ; in13 + mova m4, [%2+19*%3] ; in19 + mova m5, [%2+29*%3] ; in29 + SUMSUB_MUL 2, 4, 6, 7, 13160, 9760 ; m2=t25a, m4=t22a + SUMSUB_MUL 5, 0, 6, 7, 2404, 16207 ; m5=t24a, m0=t23a + SUMSUB_BA d, 4, 0, 6 ; m4=t23, m0=t22 + SUMSUB_BA d, 2, 5, 6 ; m2=t24, m5=t25 + SUMSUB_MUL 5, 0, 6, 7, 13623, m9102 ; m5=t25a, m0=t22a + + ; end of stage 1-3 fourth quart + + SUMSUB_BA d, 1, 4, 6 ; m1=t23a, m4=t20a + SUMSUB_BA d, 3, 0, 6 ; m3=t22, m0=t21 + UNSCRATCH 6, 8, rsp+275*mmsize ; t26a + UNSCRATCH 7, 9, rsp+276*mmsize ; t27 + SCRATCH 3, 8, rsp+275*mmsize + SCRATCH 1, 9, rsp+276*mmsize + SUMSUB_BA d, 7, 2, 1 ; m7=t24a, m2=t27a + SUMSUB_BA d, 6, 5, 1 ; m6=t25, m5=t26 + SUMSUB_MUL 2, 4, 1, 3, 6270, m15137 ; m2=t27, m4=t20 + SUMSUB_MUL 5, 0, 1, 3, 6270, m15137 ; m5=t26a, m0=t21a + + ; end of stage 4-5 second half + + UNSCRATCH 1, 12, rsp+279*mmsize ; t28 + UNSCRATCH 3, 13, rsp+280*mmsize ; t29a + SCRATCH 4, 12, rsp+279*mmsize + SCRATCH 0, 13, rsp+280*mmsize + SUMSUB_BA d, 5, 3, 0 ; m5=t29, m3=t26 + SUMSUB_BA d, 2, 1, 0 ; m2=t28a, m1=t27a + UNSCRATCH 0, 14, rsp+281*mmsize ; t30 + UNSCRATCH 4, 15, rsp+282*mmsize ; t31a + SCRATCH 2, 14, rsp+281*mmsize + SCRATCH 5, 15, rsp+282*mmsize + SUMSUB_BA d, 6, 0, 2 ; m6=t30a, m0=t25a + SUMSUB_BA d, 7, 4, 2 ; m7=t31, m4=t24 + + mova m2, [rsp+273*mmsize] ; t16a + mova m5, [rsp+274*mmsize] ; t17 + mova [rsp+273*mmsize], m6 + mova [rsp+274*mmsize], m7 + UNSCRATCH 6, 10, rsp+277*mmsize ; t18a + UNSCRATCH 7, 11, rsp+278*mmsize ; t19 + SCRATCH 4, 10, rsp+277*mmsize + SCRATCH 0, 11, rsp+278*mmsize + UNSCRATCH 4, 12, rsp+279*mmsize ; t20 + UNSCRATCH 0, 13, rsp+280*mmsize ; t21a + SCRATCH 3, 12, rsp+279*mmsize + SCRATCH 1, 13, rsp+280*mmsize + SUMSUB_BA d, 0, 6, 1 ; m0=t18, m6=t21 + SUMSUB_BA d, 4, 7, 1 ; m4=t19a, m7=t20a + UNSCRATCH 3, 8, rsp+275*mmsize ; t22 + UNSCRATCH 1, 9, rsp+276*mmsize ; t23a + SCRATCH 0, 8, rsp+275*mmsize + SCRATCH 4, 9, rsp+276*mmsize + SUMSUB_BA d, 3, 5, 0 ; m3=t17a, m5=t22a + SUMSUB_BA d, 1, 2, 0 ; m1=t16, m2=t23 + + ; end of stage 6 + + UNSCRATCH 0, 10, rsp+277*mmsize ; t24 + UNSCRATCH 4, 11, rsp+278*mmsize ; t25a + SCRATCH 1, 10, rsp+277*mmsize + SCRATCH 3, 11, rsp+278*mmsize + SUMSUB_MUL 0, 2, 1, 3, 11585, 11585 ; m0=t24a, m2=t23a + SUMSUB_MUL 4, 5, 1, 3, 11585, 11585 ; m4=t25, m5=t22 + UNSCRATCH 1, 12, rsp+279*mmsize ; t26 + UNSCRATCH 3, 13, rsp+280*mmsize ; t27a + SCRATCH 0, 12, rsp+279*mmsize + SCRATCH 4, 13, rsp+280*mmsize + SUMSUB_MUL 3, 7, 0, 4, 11585, 11585 ; m3=t27, m7=t20 + SUMSUB_MUL 1, 6, 0, 4, 11585, 11585 ; m1=t26a, m6=t21a + + ; end of stage 7 + + mova m0, [rsp+269*mmsize] ; t8 + mova m4, [rsp+270*mmsize] ; t9a + mova [rsp+269*mmsize], m1 ; t26a + mova [rsp+270*mmsize], m3 ; t27 + mova m3, [rsp+271*mmsize] ; t10 + SUMSUB_BA d, 2, 0, 1 ; m2=out8, m0=out23 + SUMSUB_BA d, 5, 4, 1 ; m5=out9, m4=out22 + SUMSUB_BA d, 6, 3, 1 ; m6=out10, m3=out21 + mova m1, [rsp+272*mmsize] ; t11a + mova [rsp+271*mmsize], m0 + SUMSUB_BA d, 7, 1, 0 ; m7=out11, m1=out20 + +%if %1 == 1 + TRANSPOSE4x4D 2, 5, 6, 7, 0 + mova [ptrq+ 2*mmsize], m2 + mova [ptrq+10*mmsize], m5 + mova [ptrq+18*mmsize], m6 + mova [ptrq+26*mmsize], m7 +%else ; %1 == 2 + pxor m0, m0 + lea dstq, [dstq+strideq*8] + ROUND_AND_STORE_4x4 2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6 +%endif + mova m2, [rsp+271*mmsize] +%if %1 == 1 + TRANSPOSE4x4D 1, 3, 4, 2, 0 + mova [ptrq+ 5*mmsize], m1 + mova [ptrq+13*mmsize], m3 + mova [ptrq+21*mmsize], m4 + mova [ptrq+29*mmsize], m2 +%else ; %1 == 2 + lea dstq, [dstq+stride3q*4] + ROUND_AND_STORE_4x4 1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6 +%endif + + ; end of last stage + store for out8-11 and out20-23 + + UNSCRATCH 0, 9, rsp+276*mmsize ; t19a + UNSCRATCH 1, 8, rsp+275*mmsize ; t18 + UNSCRATCH 2, 11, rsp+278*mmsize ; t17a + UNSCRATCH 3, 10, rsp+277*mmsize ; t16 + mova m7, [rsp+261*mmsize] ; t12a + mova m6, [rsp+262*mmsize] ; t13 + mova m5, [rsp+263*mmsize] ; t14a + SUMSUB_BA d, 0, 7, 4 ; m0=out12, m7=out19 + SUMSUB_BA d, 1, 6, 4 ; m1=out13, m6=out18 + SUMSUB_BA d, 2, 5, 4 ; m2=out14, m5=out17 + mova m4, [rsp+264*mmsize] ; t15 + SCRATCH 7, 8, rsp+275*mmsize + SUMSUB_BA d, 3, 4, 7 ; m3=out15, m4=out16 + +%if %1 == 1 + TRANSPOSE4x4D 0, 1, 2, 3, 7 + mova [ptrq+ 3*mmsize], m0 + mova [ptrq+11*mmsize], m1 + mova [ptrq+19*mmsize], m2 + mova [ptrq+27*mmsize], m3 +%else ; %1 == 2 +%if ARCH_X86_64 + SWAP 7, 9 + lea dstq, [dstbakq+stride3q*4] +%else ; x86-32 + pxor m7, m7 + mov dstq, dstm + lea dstq, [dstq+stride3q*4] +%endif + ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6 +%endif + UNSCRATCH 0, 8, rsp+275*mmsize ; out19 +%if %1 == 1 + TRANSPOSE4x4D 4, 5, 6, 0, 7 + mova [ptrq+ 4*mmsize], m4 + mova [ptrq+12*mmsize], m5 + mova [ptrq+20*mmsize], m6 + mova [ptrq+28*mmsize], m0 +%else ; %1 == 2 + lea dstq, [dstq+strideq*4] + ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6 +%endif + + ; end of last stage + store for out12-19 + +%if ARCH_X86_64 + SWAP 7, 8 +%endif + mova m7, [rsp+257*mmsize] ; t0 + mova m6, [rsp+258*mmsize] ; t1 + mova m5, [rsp+259*mmsize] ; t2 + mova m4, [rsp+260*mmsize] ; t3 + mova m0, [rsp+274*mmsize] ; t31 + mova m1, [rsp+273*mmsize] ; t30a + UNSCRATCH 2, 15, rsp+282*mmsize ; t29 + SUMSUB_BA d, 0, 7, 3 ; m0=out0, m7=out31 + SUMSUB_BA d, 1, 6, 3 ; m1=out1, m6=out30 + SUMSUB_BA d, 2, 5, 3 ; m2=out2, m5=out29 + SCRATCH 0, 9, rsp+276*mmsize + UNSCRATCH 3, 14, rsp+281*mmsize ; t28a + SUMSUB_BA d, 3, 4, 0 ; m3=out3, m4=out28 + +%if %1 == 1 + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 7*mmsize], m4 + mova [ptrq+15*mmsize], m5 + mova [ptrq+23*mmsize], m6 + mova [ptrq+31*mmsize], m7 +%else ; %1 == 2 +%if ARCH_X86_64 + SWAP 0, 8 +%else ; x86-32 + pxor m0, m0 +%endif + lea dstq, [dstq+stride3q*4] + ROUND_AND_STORE_4x4 4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6 +%endif + UNSCRATCH 7, 9, rsp+276*mmsize ; out0 +%if %1 == 1 + TRANSPOSE4x4D 7, 1, 2, 3, 0 + mova [ptrq+ 0*mmsize], m7 + mova [ptrq+ 8*mmsize], m1 + mova [ptrq+16*mmsize], m2 + mova [ptrq+24*mmsize], m3 +%else ; %1 == 2 +%if ARCH_X86_64 + DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst +%else ; x86-32 + mov dstq, dstm +%endif + ROUND_AND_STORE_4x4 7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6 +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak +%endif +%endif + + ; end of last stage + store for out0-3 and out28-31 + +%if ARCH_X86_64 + SWAP 0, 8 +%endif + mova m7, [rsp+265*mmsize] ; t4 + mova m6, [rsp+266*mmsize] ; t5a + mova m5, [rsp+267*mmsize] ; t6a + mova m4, [rsp+268*mmsize] ; t7 + mova m0, [rsp+270*mmsize] ; t27 + mova m1, [rsp+269*mmsize] ; t26a + UNSCRATCH 2, 13, rsp+280*mmsize ; t25 + SUMSUB_BA d, 0, 7, 3 ; m0=out4, m7=out27 + SUMSUB_BA d, 1, 6, 3 ; m1=out5, m6=out26 + SUMSUB_BA d, 2, 5, 3 ; m2=out6, m5=out25 + UNSCRATCH 3, 12, rsp+279*mmsize ; t24a + SCRATCH 7, 9, rsp+276*mmsize + SUMSUB_BA d, 3, 4, 7 ; m3=out7, m4=out24 + +%if %1 == 1 + TRANSPOSE4x4D 0, 1, 2, 3, 7 + mova [ptrq+ 1*mmsize], m0 + mova [ptrq+ 9*mmsize], m1 + mova [ptrq+17*mmsize], m2 + mova [ptrq+25*mmsize], m3 +%else ; %1 == 2 +%if ARCH_X86_64 + SWAP 7, 8 + lea dstq, [dstbakq+strideq*4] +%else ; x86-32 + pxor m7, m7 + lea dstq, [dstq+strideq*4] +%endif + ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6 +%endif + UNSCRATCH 0, 9, rsp+276*mmsize ; out27 +%if %1 == 1 + TRANSPOSE4x4D 4, 5, 6, 0, 7 + mova [ptrq+ 6*mmsize], m4 + mova [ptrq+14*mmsize], m5 + mova [ptrq+22*mmsize], m6 + mova [ptrq+30*mmsize], m0 +%else ; %1 == 2 +%if ARCH_X86_64 + lea dstq, [dstbakq+stride3q*8] +%else + mov dstq, dstm + lea dstq, [dstq+stride3q*8] +%endif + ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6 +%endif + + ; end of last stage + store for out4-7 and out24-27 +%endmacro + +INIT_XMM sse2 +cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \ + 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_1023] + cmp eobd, 1 + jg .idctfull + + ; dc-only - the 10bit version can be done entirely in 32bit, since the max + ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily + ; fits in 32bit + DEFINE_ARGS dst, stride, block, coef + pxor m2, m2 + DC_ONLY 6, m2 + movd m1, coefd + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 32 +.loop_dc: + STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize + STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize + add dstq, strideq + dec cntd + jg .loop_dc + RET + +.idctfull: + mova [rsp+256*mmsize], m0 + DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak +%if ARCH_X86_64 + mov dstbakq, dstq + movsxd cntq, cntd +%endif +%ifdef PIC + lea ptrq, [default_32x32] + movzx cntd, byte [ptrq+cntq-1] +%else + movzx cntd, byte [default_32x32+cntq-1] +%endif + mov skipd, 8 + sub skipd, cntd + mov ptrq, rsp +.loop_1: + IDCT32_1D 1, blockq + + add ptrq, 32 * mmsize + add blockq, mmsize + dec cntd + jg .loop_1 + + ; zero-pad the remainder (skipped cols) + test skipd, skipd + jz .end + shl skipd, 2 + lea blockq, [blockq+skipq*(mmsize/4)] + pxor m0, m0 +.loop_z: + mova [ptrq+mmsize*0], m0 + mova [ptrq+mmsize*1], m0 + mova [ptrq+mmsize*2], m0 + mova [ptrq+mmsize*3], m0 + mova [ptrq+mmsize*4], m0 + mova [ptrq+mmsize*5], m0 + mova [ptrq+mmsize*6], m0 + mova [ptrq+mmsize*7], m0 + add ptrq, 8 * mmsize + dec skipd + jg .loop_z +.end: + + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea stride3q, [strideq*3] + mov cntd, 8 + mov ptrq, rsp +.loop_2: + IDCT32_1D 2, ptrq + + add ptrq, mmsize +%if ARCH_X86_64 + add dstbakq, 8 + mov dstq, dstbakq +%else + add dword dstm, 8 + mov dstq, dstm +%endif + dec cntd + jg .loop_2 + + ; m7 is still zero + ZERO_BLOCK blockq-8*mmsize, 128, 32, m7 + RET + +INIT_XMM sse2 +cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \ + 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_4095] + cmp eobd, 1 + jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull + + ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign + ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies + DEFINE_ARGS dst, stride, block, coef, coefl + pxor m2, m2 + DC_ONLY_64BIT 6, m2 + movd m1, coefd + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 32 +.loop_dc: + STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize + STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize + add dstq, strideq + dec cntd + jg .loop_dc + RET diff --git a/media/ffvpx/libavcodec/x86/vp9itxfm_template.asm b/media/ffvpx/libavcodec/x86/vp9itxfm_template.asm new file mode 100644 index 0000000000..d2f2257d84 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9itxfm_template.asm @@ -0,0 +1,142 @@ +;****************************************************************************** +;* VP9 IDCT SIMD optimizations +;* +;* Copyright (C) 2013 Clément Bœsch <u pkh me> +;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%macro VP9_IWHT4_1D 0 + SWAP 1, 2, 3 + paddw m0, m2 + psubw m3, m1 + psubw m4, m0, m3 + psraw m4, 1 + psubw m5, m4, m1 + SWAP 5, 1 + psubw m4, m2 + SWAP 4, 2 + psubw m0, m1 + paddw m3, m2 + SWAP 3, 2, 1 +%endmacro + +; (a*x + b*y + round) >> shift +%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2 + pmaddwd m%1, m%2, %4 + pmaddwd m%2, %5 + paddd m%1, %3 + paddd m%2, %3 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2 + VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3] + VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2 +%if %0 == 7 + punpckhwd m%6, m%2, m%1 + punpcklwd m%2, m%1 + VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7 +%else + punpckhwd m%8, m%4, m%3 + punpcklwd m%2, m%4, m%3 + VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9 +%endif +%endmacro + +%macro VP9_IDCT4_1D_FINALIZE 0 + SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0 + SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1 + SWAP 0, 3, 2 ; 3102 -> 0123 +%endmacro + +%macro VP9_IDCT4_1D 0 +%if cpuflag(ssse3) + SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2) + pmulhrsw m2, m6 ; m2=t0 + pmulhrsw m0, m6 ; m0=t1 +%else ; <= sse2 + VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m1=t0 +%endif + VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3 + VP9_IDCT4_1D_FINALIZE +%endmacro + +%macro VP9_IADST4_1D 0 + movq2dq xmm0, m0 + movq2dq xmm1, m1 + movq2dq xmm2, m2 + movq2dq xmm3, m3 +%if cpuflag(ssse3) + paddw m3, m0 +%endif + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + pmaddwd xmm1, xmm0, [pw_5283_13377] + pmaddwd xmm4, xmm0, [pw_9929_13377] +%if notcpuflag(ssse3) + pmaddwd xmm6, xmm0, [pw_13377_0] +%endif + pmaddwd xmm0, [pw_15212_m13377] + pmaddwd xmm3, xmm2, [pw_15212_9929] +%if notcpuflag(ssse3) + pmaddwd xmm7, xmm2, [pw_m13377_13377] +%endif + pmaddwd xmm2, [pw_m5283_m15212] +%if cpuflag(ssse3) + psubw m3, m2 +%else + paddd xmm6, xmm7 +%endif + paddd xmm0, xmm2 + paddd xmm3, xmm5 + paddd xmm2, xmm5 +%if notcpuflag(ssse3) + paddd xmm6, xmm5 +%endif + paddd xmm1, xmm3 + paddd xmm0, xmm3 + paddd xmm4, xmm2 + psrad xmm1, 14 + psrad xmm0, 14 + psrad xmm4, 14 +%if cpuflag(ssse3) + pmulhrsw m3, [pw_13377x2] ; out2 +%else + psrad xmm6, 14 +%endif + packssdw xmm0, xmm0 + packssdw xmm1, xmm1 + packssdw xmm4, xmm4 +%if notcpuflag(ssse3) + packssdw xmm6, xmm6 +%endif + movdq2q m0, xmm0 ; out3 + movdq2q m1, xmm1 ; out0 + movdq2q m2, xmm4 ; out1 +%if notcpuflag(ssse3) + movdq2q m3, xmm6 ; out2 +%endif + SWAP 0, 1, 2, 3 +%endmacro diff --git a/media/ffvpx/libavcodec/x86/vp9lpf.asm b/media/ffvpx/libavcodec/x86/vp9lpf.asm new file mode 100644 index 0000000000..4e7ede2235 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9lpf.asm @@ -0,0 +1,1211 @@ +;****************************************************************************** +;* VP9 loop filter SIMD optimizations +;* +;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me> +;* Copyright (C) 2014 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pb_3 +cextern pb_80 + +pb_4: times 16 db 0x04 +pb_10: times 16 db 0x10 +pb_40: times 16 db 0x40 +pb_81: times 16 db 0x81 +pb_f8: times 16 db 0xf8 +pb_fe: times 16 db 0xfe +pb_ff: times 16 db 0xff + +cextern pw_4 +cextern pw_8 + +; with mix functions, two 8-bit thresholds are stored in a 16-bit storage, +; the following mask is used to splat both in the same register +mask_mix: times 8 db 0 + times 8 db 1 + +mask_mix84: times 8 db 0xff + times 8 db 0x00 +mask_mix48: times 8 db 0x00 + times 8 db 0xff + +SECTION .text + +%macro SCRATCH 3 +%ifdef m8 + SWAP %1, %2 +%else + mova [%3], m%1 +%endif +%endmacro + +%macro UNSCRATCH 3 +%ifdef m8 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%endmacro + +; %1 = abs(%2-%3) +%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp +%ifdef m8 + psubusb %1, %3, %2 + psubusb %4, %2, %3 +%else + mova %1, %3 + mova %4, %2 + psubusb %1, %2 + psubusb %4, %3 +%endif + por %1, %4 +%endmacro + +; %1 = %1>%2 +%macro CMP_GT 2-3 ; src/dst, cmp, pb_80 +%if %0 == 3 + pxor %1, %3 +%endif + pcmpgtb %1, %2 +%endmacro + +; %1 = abs(%2-%3) > %4 +%macro ABSSUB_GT 5-6 [pb_80]; dst, src1, src2, cmp, tmp, [pb_80] + ABSSUB %1, %2, %3, %5 ; dst = abs(src1-src2) + CMP_GT %1, %4, %6 ; dst > cmp +%endmacro + +%macro MASK_APPLY 4 ; %1=new_data/dst %2=old_data %3=mask %4=tmp + pand %1, %3 ; new &= mask + pandn %4, %3, %2 ; tmp = ~mask & old + por %1, %4 ; new&mask | old&~mask +%endmacro + +%macro UNPACK 4 +%ifdef m8 + punpck%1bw %2, %3, %4 +%else + mova %2, %3 + punpck%1bw %2, %4 +%endif +%endmacro + +%macro FILTER_SUBx2_ADDx2 11 ; %1=dst %2=h/l %3=cache %4=stack_off %5=sub1 %6=sub2 %7=add1 + ; %8=add2 %9=rshift, [unpack], [unpack_is_mem_on_x86_32] + psubw %3, [rsp+%4+%5*mmsize*2] + psubw %3, [rsp+%4+%6*mmsize*2] + paddw %3, [rsp+%4+%7*mmsize*2] +%ifnidn %10, "" +%if %11 == 0 + punpck%2bw %1, %10, m0 +%else + UNPACK %2, %1, %10, m0 +%endif + mova [rsp+%4+%8*mmsize*2], %1 + paddw %3, %1 +%else + paddw %3, [rsp+%4+%8*mmsize*2] +%endif + psraw %1, %3, %9 +%endmacro + +; FIXME interleave l/h better (for instruction pairing) +%macro FILTER_INIT 9 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, filterid, mask, source + FILTER%7_INIT %1, l, %3, %6 + 0 + FILTER%7_INIT %2, h, %4, %6 + mmsize + packuswb %1, %2 + MASK_APPLY %1, %9, %8, %2 + mova %5, %1 +%endmacro + + +%macro FILTER_UPDATE 12-16 "", "", "", 0 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, -, -, +, +, rshift, + ; mask, [source], [unpack + src], [unpack_is_mem_on_x86_32] +; FIXME interleave this properly with the subx2/addx2 +%ifnidn %15, "" +%if %16 == 0 || ARCH_X86_64 + mova %14, %15 +%endif +%endif + FILTER_SUBx2_ADDx2 %1, l, %3, %6 + 0, %7, %8, %9, %10, %11, %14, %16 + FILTER_SUBx2_ADDx2 %2, h, %4, %6 + mmsize, %7, %8, %9, %10, %11, %14, %16 + packuswb %1, %2 +%ifnidn %13, "" + MASK_APPLY %1, %13, %12, %2 +%else + MASK_APPLY %1, %5, %12, %2 +%endif + mova %5, %1 +%endmacro + +%macro SRSHIFT3B_2X 4 ; reg1, reg2, [pb_10], tmp + mova %4, [pb_f8] + pand %1, %4 + pand %2, %4 + psrlq %1, 3 + psrlq %2, 3 + pxor %1, %3 + pxor %2, %3 + psubb %1, %3 + psubb %2, %3 +%endmacro + +%macro EXTRACT_POS_NEG 3 ; i8, neg, pos + pxor %3, %3 + pxor %2, %2 + pcmpgtb %3, %1 ; i8 < 0 mask + psubb %2, %1 ; neg values (only the originally - will be kept) + pand %2, %3 ; negative values of i8 (but stored as +) + pandn %3, %1 ; positive values of i8 +%endmacro + +; clip_u8(u8 + i8) +%macro SIGN_ADD 4 ; dst, u8, i8, tmp1 + EXTRACT_POS_NEG %3, %4, %1 + paddusb %1, %2 ; add the positives + psubusb %1, %4 ; sub the negatives +%endmacro + +; clip_u8(u8 - i8) +%macro SIGN_SUB 4 ; dst, u8, i8, tmp1 + EXTRACT_POS_NEG %3, %1, %4 + paddusb %1, %2 ; add the negatives + psubusb %1, %4 ; sub the positives +%endmacro + +%macro FILTER6_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off + UNPACK %2, %1, rp3, m0 ; p3: B->W + mova [rsp+%4+0*mmsize*2], %1 + paddw %3, %1, %1 ; p3*2 + paddw %3, %1 ; p3*3 + punpck%2bw %1, m1, m0 ; p2: B->W + mova [rsp+%4+1*mmsize*2], %1 + paddw %3, %1 ; p3*3 + p2 + paddw %3, %1 ; p3*3 + p2*2 + UNPACK %2, %1, rp1, m0 ; p1: B->W + mova [rsp+%4+2*mmsize*2], %1 + paddw %3, %1 ; p3*3 + p2*2 + p1 + UNPACK %2, %1, rp0, m0 ; p0: B->W + mova [rsp+%4+3*mmsize*2], %1 + paddw %3, %1 ; p3*3 + p2*2 + p1 + p0 + UNPACK %2, %1, rq0, m0 ; q0: B->W + mova [rsp+%4+4*mmsize*2], %1 + paddw %3, %1 ; p3*3 + p2*2 + p1 + p0 + q0 + paddw %3, [pw_4] ; p3*3 + p2*2 + p1 + p0 + q0 + 4 + psraw %1, %3, 3 ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3 +%endmacro + +%macro FILTER14_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off + punpck%2bw %1, m2, m0 ; p7: B->W + mova [rsp+%4+ 8*mmsize*2], %1 + psllw %3, %1, 3 ; p7*8 + psubw %3, %1 ; p7*7 + punpck%2bw %1, m3, m0 ; p6: B->W + mova [rsp+%4+ 9*mmsize*2], %1 + paddw %3, %1 ; p7*7 + p6 + paddw %3, %1 ; p7*7 + p6*2 + UNPACK %2, %1, rp5, m0 ; p5: B->W + mova [rsp+%4+10*mmsize*2], %1 + paddw %3, %1 ; p7*7 + p6*2 + p5 + UNPACK %2, %1, rp4, m0 ; p4: B->W + mova [rsp+%4+11*mmsize*2], %1 + paddw %3, %1 ; p7*7 + p6*2 + p5 + p4 + paddw %3, [rsp+%4+ 0*mmsize*2] ; p7*7 + p6*2 + p5 + p4 + p3 + paddw %3, [rsp+%4+ 1*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p2 + paddw %3, [rsp+%4+ 2*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p1 + paddw %3, [rsp+%4+ 3*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p0 + paddw %3, [rsp+%4+ 4*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + paddw %3, [pw_8] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8 + psraw %1, %3, 4 ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4 +%endmacro + +%macro TRANSPOSE16x16B 17 + mova %17, m%16 + SBUTTERFLY bw, %1, %2, %16 + SBUTTERFLY bw, %3, %4, %16 + SBUTTERFLY bw, %5, %6, %16 + SBUTTERFLY bw, %7, %8, %16 + SBUTTERFLY bw, %9, %10, %16 + SBUTTERFLY bw, %11, %12, %16 + SBUTTERFLY bw, %13, %14, %16 + mova m%16, %17 + mova %17, m%14 + SBUTTERFLY bw, %15, %16, %14 + SBUTTERFLY wd, %1, %3, %14 + SBUTTERFLY wd, %2, %4, %14 + SBUTTERFLY wd, %5, %7, %14 + SBUTTERFLY wd, %6, %8, %14 + SBUTTERFLY wd, %9, %11, %14 + SBUTTERFLY wd, %10, %12, %14 + SBUTTERFLY wd, %13, %15, %14 + mova m%14, %17 + mova %17, m%12 + SBUTTERFLY wd, %14, %16, %12 + SBUTTERFLY dq, %1, %5, %12 + SBUTTERFLY dq, %2, %6, %12 + SBUTTERFLY dq, %3, %7, %12 + SBUTTERFLY dq, %4, %8, %12 + SBUTTERFLY dq, %9, %13, %12 + SBUTTERFLY dq, %10, %14, %12 + SBUTTERFLY dq, %11, %15, %12 + mova m%12, %17 + mova %17, m%8 + SBUTTERFLY dq, %12, %16, %8 + SBUTTERFLY qdq, %1, %9, %8 + SBUTTERFLY qdq, %2, %10, %8 + SBUTTERFLY qdq, %3, %11, %8 + SBUTTERFLY qdq, %4, %12, %8 + SBUTTERFLY qdq, %5, %13, %8 + SBUTTERFLY qdq, %6, %14, %8 + SBUTTERFLY qdq, %7, %15, %8 + mova m%8, %17 + mova %17, m%1 + SBUTTERFLY qdq, %8, %16, %1 + mova m%1, %17 + SWAP %2, %9 + SWAP %3, %5 + SWAP %4, %13 + SWAP %6, %11 + SWAP %8, %15 + SWAP %12, %14 +%endmacro + +%macro TRANSPOSE8x8B 13 + SBUTTERFLY bw, %1, %2, %7 + movdq%10 m%7, %9 + movdqa %11, m%2 + SBUTTERFLY bw, %3, %4, %2 + SBUTTERFLY bw, %5, %6, %2 + SBUTTERFLY bw, %7, %8, %2 + SBUTTERFLY wd, %1, %3, %2 + movdqa m%2, %11 + movdqa %11, m%3 + SBUTTERFLY wd, %2, %4, %3 + SBUTTERFLY wd, %5, %7, %3 + SBUTTERFLY wd, %6, %8, %3 + SBUTTERFLY dq, %1, %5, %3 + SBUTTERFLY dq, %2, %6, %3 + movdqa m%3, %11 + movh %12, m%2 + movhps %13, m%2 + SBUTTERFLY dq, %3, %7, %2 + SBUTTERFLY dq, %4, %8, %2 + SWAP %2, %5 + SWAP %4, %7 +%endmacro + +%macro DEFINE_REAL_P7_TO_Q7 0-1 0 +%define P7 dstq + 4*mstrideq + %1 +%define P6 dstq + mstride3q + %1 +%define P5 dstq + 2*mstrideq + %1 +%define P4 dstq + mstrideq + %1 +%define P3 dstq + %1 +%define P2 dstq + strideq + %1 +%define P1 dstq + 2* strideq + %1 +%define P0 dstq + stride3q + %1 +%define Q0 dstq + 4* strideq + %1 +%define Q1 dst2q + mstride3q + %1 +%define Q2 dst2q + 2*mstrideq + %1 +%define Q3 dst2q + mstrideq + %1 +%define Q4 dst2q + %1 +%define Q5 dst2q + strideq + %1 +%define Q6 dst2q + 2* strideq + %1 +%define Q7 dst2q + stride3q + %1 +%endmacro + +%macro DEFINE_TRANSPOSED_P7_TO_Q7 0-1 0 +%define P3 rsp + 0*mmsize + %1 +%define P2 rsp + 1*mmsize + %1 +%define P1 rsp + 2*mmsize + %1 +%define P0 rsp + 3*mmsize + %1 +%define Q0 rsp + 4*mmsize + %1 +%define Q1 rsp + 5*mmsize + %1 +%define Q2 rsp + 6*mmsize + %1 +%define Q3 rsp + 7*mmsize + %1 +%if mmsize == 16 +%define P7 rsp + 8*mmsize + %1 +%define P6 rsp + 9*mmsize + %1 +%define P5 rsp + 10*mmsize + %1 +%define P4 rsp + 11*mmsize + %1 +%define Q4 rsp + 12*mmsize + %1 +%define Q5 rsp + 13*mmsize + %1 +%define Q6 rsp + 14*mmsize + %1 +%define Q7 rsp + 15*mmsize + %1 +%endif +%endmacro + +; ..............AB -> AAAAAAAABBBBBBBB +%macro SPLATB_MIX 1-2 [mask_mix] +%if cpuflag(ssse3) + pshufb %1, %2 +%else + punpcklbw %1, %1 + punpcklwd %1, %1 + punpckldq %1, %1 +%endif +%endmacro + +%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=mmx/32bit stack only +%assign %%ext 0 +%if ARCH_X86_32 || mmsize == 8 +%assign %%ext %5 +%endif + +%if UNIX64 +cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 5, 9, 16, %3 + %4 + %%ext, dst, stride, E, I, H, mstride, dst2, stride3, mstride3 +%else +%if WIN64 +cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 4, 8, 16, %3 + %4 + %%ext, dst, stride, E, I, mstride, dst2, stride3, mstride3 +%else +cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 2, 6, 16, %3 + %4 + %%ext, dst, stride, mstride, dst2, stride3, mstride3 +%define Ed dword r2m +%define Id dword r3m +%endif +%define Hd dword r4m +%endif + + mov mstrideq, strideq + neg mstrideq + + lea stride3q, [strideq*3] + lea mstride3q, [mstrideq*3] + +%ifidn %1, h +%if %2 != 16 +%if mmsize == 16 +%define movx movh +%else +%define movx mova +%endif + lea dstq, [dstq + 4*strideq - 4] +%else +%define movx movu + lea dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos) +%endif +%else + lea dstq, [dstq + 4*mstrideq] +%endif + ; FIXME we shouldn't need two dts registers if mmsize == 8 + lea dst2q, [dstq + 8*strideq] + + DEFINE_REAL_P7_TO_Q7 + +%ifidn %1, h + movx m0, [P7] + movx m1, [P6] + movx m2, [P5] + movx m3, [P4] + movx m4, [P3] + movx m5, [P2] +%if (ARCH_X86_64 && mmsize == 16) || %2 > 16 + movx m6, [P1] +%endif + movx m7, [P0] +%ifdef m8 + movx m8, [Q0] + movx m9, [Q1] + movx m10, [Q2] + movx m11, [Q3] + movx m12, [Q4] + movx m13, [Q5] + movx m14, [Q6] + movx m15, [Q7] + DEFINE_TRANSPOSED_P7_TO_Q7 +%if %2 == 16 + TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp] + mova [P7], m0 + mova [P6], m1 + mova [P5], m2 + mova [P4], m3 +%else ; %2 == 44/48/84/88 + ; 8x16 transpose + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 + punpcklbw m8, m9 + punpcklbw m10, m11 + punpcklbw m12, m13 + punpcklbw m14, m15 + TRANSPOSE8x8W 0, 2, 4, 6, 8, 10, 12, 14, 15 + SWAP 0, 4 + SWAP 2, 5 + SWAP 0, 6 + SWAP 0, 7 + SWAP 10, 9 + SWAP 12, 10 + SWAP 14, 11 +%endif ; %2 + mova [P3], m4 + mova [P2], m5 + mova [P1], m6 + mova [P0], m7 + mova [Q0], m8 + mova [Q1], m9 + mova [Q2], m10 + mova [Q3], m11 +%if %2 == 16 + mova [Q4], m12 + mova [Q5], m13 + mova [Q6], m14 + mova [Q7], m15 +%endif ; %2 +%else ; x86-32 +%if %2 == 16 + TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [P1], u, [rsp+%3+%4], [rsp+64], [rsp+80] + DEFINE_TRANSPOSED_P7_TO_Q7 + movh [P7], m0 + movh [P5], m1 + movh [P3], m2 + movh [P1], m3 + movh [Q2], m5 + movh [Q4], m6 + movh [Q6], m7 + movhps [P6], m0 + movhps [P4], m1 + movhps [P2], m2 + movhps [P0], m3 + movhps [Q3], m5 + movhps [Q5], m6 + movhps [Q7], m7 + DEFINE_REAL_P7_TO_Q7 + movx m0, [Q0] + movx m1, [Q1] + movx m2, [Q2] + movx m3, [Q3] + movx m4, [Q4] + movx m5, [Q5] + movx m7, [Q7] + TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [Q6], u, [rsp+%3+%4], [rsp+72], [rsp+88] + DEFINE_TRANSPOSED_P7_TO_Q7 8 + movh [P7], m0 + movh [P5], m1 + movh [P3], m2 + movh [P1], m3 + movh [Q2], m5 + movh [Q4], m6 + movh [Q6], m7 + movhps [P6], m0 + movhps [P4], m1 + movhps [P2], m2 + movhps [P0], m3 + movhps [Q3], m5 + movhps [Q5], m6 + movhps [Q7], m7 + DEFINE_TRANSPOSED_P7_TO_Q7 +%elif %2 > 16 ; %2 == 44/48/84/88 + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 + movx m1, [Q0] + movx m3, [Q1] + movx m5, [Q2] + movx m7, [Q3] + punpcklbw m1, m3 + punpcklbw m5, m7 + movx m3, [Q4] + movx m7, [Q5] + punpcklbw m3, m7 + mova [rsp], m3 + movx m3, [Q6] + movx m7, [Q7] + punpcklbw m3, m7 + DEFINE_TRANSPOSED_P7_TO_Q7 + TRANSPOSE8x8W 0, 2, 4, 6, 1, 5, 7, 3, [rsp], [Q0], 1 + mova [P3], m0 + mova [P2], m2 + mova [P1], m4 + mova [P0], m6 + mova [Q1], m5 + mova [Q2], m7 + mova [Q3], m3 +%else ; %2 == 4 || %2 == 8 + SBUTTERFLY bw, 0, 1, 6 + SBUTTERFLY bw, 2, 3, 6 + SBUTTERFLY bw, 4, 5, 6 + mova [rsp+4*mmsize], m5 + mova m6, [P1] + SBUTTERFLY bw, 6, 7, 5 + DEFINE_TRANSPOSED_P7_TO_Q7 + TRANSPOSE4x4W 0, 2, 4, 6, 5 + mova [P3], m0 + mova [P2], m2 + mova [P1], m4 + mova [P0], m6 + mova m5, [rsp+4*mmsize] + TRANSPOSE4x4W 1, 3, 5, 7, 0 + mova [Q0], m1 + mova [Q1], m3 + mova [Q2], m5 + mova [Q3], m7 +%endif ; %2 +%endif ; x86-32/64 +%endif ; %1 == h + + ; calc fm mask +%if %2 == 16 || mmsize == 8 +%if cpuflag(ssse3) + pxor m0, m0 +%endif + SPLATB_REG m2, I, m0 ; I I I I ... + SPLATB_REG m3, E, m0 ; E E E E ... +%else +%if cpuflag(ssse3) + mova m0, [mask_mix] +%endif + movd m2, Id + movd m3, Ed + SPLATB_MIX m2, m0 + SPLATB_MIX m3, m0 +%endif + mova m0, [pb_80] + pxor m2, m0 + pxor m3, m0 +%ifdef m8 +%ifidn %1, v + mova m8, [P3] + mova m9, [P2] + mova m10, [P1] + mova m11, [P0] + mova m12, [Q0] + mova m13, [Q1] + mova m14, [Q2] + mova m15, [Q3] +%else + ; In case of horizontal, P3..Q3 are already present in some registers due + ; to the previous transpose, so we just swap registers. + SWAP 8, 4, 12 + SWAP 9, 5, 13 + SWAP 10, 6, 14 + SWAP 11, 7, 15 +%endif +%define rp3 m8 +%define rp2 m9 +%define rp1 m10 +%define rp0 m11 +%define rq0 m12 +%define rq1 m13 +%define rq2 m14 +%define rq3 m15 +%else +%define rp3 [P3] +%define rp2 [P2] +%define rp1 [P1] +%define rp0 [P0] +%define rq0 [Q0] +%define rq1 [Q1] +%define rq2 [Q2] +%define rq3 [Q3] +%endif + ABSSUB_GT m5, rp3, rp2, m2, m7, m0 ; m5 = abs(p3-p2) <= I + ABSSUB_GT m1, rp2, rp1, m2, m7, m0 ; m1 = abs(p2-p1) <= I + por m5, m1 + ABSSUB_GT m1, rp1, rp0, m2, m7, m0 ; m1 = abs(p1-p0) <= I + por m5, m1 + ABSSUB_GT m1, rq0, rq1, m2, m7, m0 ; m1 = abs(q1-q0) <= I + por m5, m1 + ABSSUB_GT m1, rq1, rq2, m2, m7, m0 ; m1 = abs(q2-q1) <= I + por m5, m1 + ABSSUB_GT m1, rq2, rq3, m2, m7, m0 ; m1 = abs(q3-q2) <= I + por m5, m1 + ABSSUB m1, rp0, rq0, m7 ; abs(p0-q0) + paddusb m1, m1 ; abs(p0-q0) * 2 + ABSSUB m2, rp1, rq1, m7 ; abs(p1-q1) + pand m2, [pb_fe] ; drop lsb so shift can work + psrlq m2, 1 ; abs(p1-q1)/2 + paddusb m1, m2 ; abs(p0-q0)*2 + abs(p1-q1)/2 + pxor m1, m0 + pcmpgtb m1, m3 + por m1, m5 ; fm final value + SWAP 1, 3 + pxor m3, [pb_ff] + + ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3) + ; calc flat8in (if not 44_16) and hev masks +%if %2 != 44 && %2 != 4 + mova m6, [pb_81] ; [1 1 1 1 ...] ^ 0x80 + ABSSUB_GT m2, rp3, rp0, m6, m5 ; abs(p3 - p0) <= 1 +%ifdef m8 + mova m8, [pb_80] +%define rb80 m8 +%else +%define rb80 [pb_80] +%endif + ABSSUB_GT m1, rp2, rp0, m6, m5, rb80 ; abs(p2 - p0) <= 1 + por m2, m1 + ABSSUB m4, rp1, rp0, m5 ; abs(p1 - p0) +%if %2 <= 16 +%if cpuflag(ssse3) + pxor m0, m0 +%endif + SPLATB_REG m7, H, m0 ; H H H H ... +%else + movd m7, Hd + SPLATB_MIX m7 +%endif + pxor m7, rb80 + pxor m4, rb80 + pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition) + CMP_GT m4, m6 ; abs(p1 - p0) <= 1 + por m2, m4 ; (flat8in) + ABSSUB m4, rq1, rq0, m1 ; abs(q1 - q0) + pxor m4, rb80 + pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition) + por m0, m5 ; hev final value + CMP_GT m4, m6 ; abs(q1 - q0) <= 1 + por m2, m4 ; (flat8in) + ABSSUB_GT m1, rq2, rq0, m6, m5, rb80 ; abs(q2 - q0) <= 1 + por m2, m1 + ABSSUB_GT m1, rq3, rq0, m6, m5, rb80 ; abs(q3 - q0) <= 1 + por m2, m1 ; flat8in final value + pxor m2, [pb_ff] +%if %2 == 84 || %2 == 48 + pand m2, [mask_mix%2] +%endif +%else + mova m6, [pb_80] +%if %2 == 44 + movd m7, Hd + SPLATB_MIX m7 +%else +%if cpuflag(ssse3) + pxor m0, m0 +%endif + SPLATB_REG m7, H, m0 ; H H H H ... +%endif + pxor m7, m6 + ABSSUB m4, rp1, rp0, m1 ; abs(p1 - p0) + pxor m4, m6 + pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition) + ABSSUB m4, rq1, rq0, m1 ; abs(q1 - q0) + pxor m4, m6 + pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition) + por m0, m5 ; hev final value +%endif + +%if %2 == 16 + ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3) + ; calc flat8out mask +%ifdef m8 + mova m8, [P7] + mova m9, [P6] +%define rp7 m8 +%define rp6 m9 +%else +%define rp7 [P7] +%define rp6 [P6] +%endif + ABSSUB_GT m1, rp7, rp0, m6, m5 ; abs(p7 - p0) <= 1 + ABSSUB_GT m7, rp6, rp0, m6, m5 ; abs(p6 - p0) <= 1 + por m1, m7 +%ifdef m8 + mova m8, [P5] + mova m9, [P4] +%define rp5 m8 +%define rp4 m9 +%else +%define rp5 [P5] +%define rp4 [P4] +%endif + ABSSUB_GT m7, rp5, rp0, m6, m5 ; abs(p5 - p0) <= 1 + por m1, m7 + ABSSUB_GT m7, rp4, rp0, m6, m5 ; abs(p4 - p0) <= 1 + por m1, m7 +%ifdef m8 + mova m14, [Q4] + mova m15, [Q5] +%define rq4 m14 +%define rq5 m15 +%else +%define rq4 [Q4] +%define rq5 [Q5] +%endif + ABSSUB_GT m7, rq4, rq0, m6, m5 ; abs(q4 - q0) <= 1 + por m1, m7 + ABSSUB_GT m7, rq5, rq0, m6, m5 ; abs(q5 - q0) <= 1 + por m1, m7 +%ifdef m8 + mova m14, [Q6] + mova m15, [Q7] +%define rq6 m14 +%define rq7 m15 +%else +%define rq6 [Q6] +%define rq7 [Q7] +%endif + ABSSUB_GT m7, rq6, rq0, m6, m5 ; abs(q4 - q0) <= 1 + por m1, m7 + ABSSUB_GT m7, rq7, rq0, m6, m5 ; abs(q5 - q0) <= 1 + por m1, m7 ; flat8out final value + pxor m1, [pb_ff] +%endif + + ; if (fm) { + ; if (out && in) filter_14() + ; else if (in) filter_6() + ; else if (hev) filter_2() + ; else filter_4() + ; } + ; + ; f14: fm & out & in + ; f6: fm & ~f14 & in => fm & ~(out & in) & in => fm & ~out & in + ; f2: fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev => fm & ~in & hev + ; f4: fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm & ~in & ~hev + + ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7) + ; filter2() +%if %2 != 44 && %2 != 4 + mova m6, [pb_80] ; already in m6 if 44_16 + SCRATCH 2, 15, rsp+%3+%4 +%if %2 == 16 + SCRATCH 1, 8, rsp+%3+%4+16 +%endif +%endif + pxor m2, m6, rq0 ; q0 ^ 0x80 + pxor m4, m6, rp0 ; p0 ^ 0x80 + psubsb m2, m4 ; (signed) q0 - p0 + pxor m4, m6, rp1 ; p1 ^ 0x80 + pxor m5, m6, rq1 ; q1 ^ 0x80 + psubsb m4, m5 ; (signed) p1 - q1 + paddsb m4, m2 ; (q0 - p0) + (p1 - q1) + paddsb m4, m2 ; 2*(q0 - p0) + (p1 - q1) + paddsb m4, m2 ; 3*(q0 - p0) + (p1 - q1) + paddsb m6, m4, [pb_4] ; m6: f1 = clip(f + 4, 127) + paddsb m4, [pb_3] ; m4: f2 = clip(f + 3, 127) +%ifdef m8 + mova m14, [pb_10] ; will be reused in filter4() +%define rb10 m14 +%else +%define rb10 [pb_10] +%endif + SRSHIFT3B_2X m6, m4, rb10, m7 ; f1 and f2 sign byte shift by 3 + SIGN_SUB m7, rq0, m6, m5 ; m7 = q0 - f1 + SIGN_ADD m1, rp0, m4, m5 ; m1 = p0 + f2 +%if %2 != 44 && %2 != 4 +%ifdef m8 + pandn m6, m15, m3 ; ~mask(in) & mask(fm) +%else + mova m6, [rsp+%3+%4] + pandn m6, m3 +%endif + pand m6, m0 ; (~mask(in) & mask(fm)) & mask(hev) +%else + pand m6, m3, m0 +%endif + MASK_APPLY m7, rq0, m6, m5 ; m7 = filter2(q0) & mask / we write it in filter4() + MASK_APPLY m1, rp0, m6, m5 ; m1 = filter2(p0) & mask / we write it in filter4() + + ; (m0: hev, m1: p0', m2: q0-p0, m3: fm, m7: q0', [m8: flat8out], m10..13: p1 p0 q0 q1, m14: pb_10, [m15: flat8in], ) + ; filter4() + mova m4, m2 + paddsb m2, m4 ; 2 * (q0 - p0) + paddsb m2, m4 ; 3 * (q0 - p0) + paddsb m6, m2, [pb_4] ; m6: f1 = clip(f + 4, 127) + paddsb m2, [pb_3] ; m2: f2 = clip(f + 3, 127) + SRSHIFT3B_2X m6, m2, rb10, m4 ; f1 and f2 sign byte shift by 3 +%if %2 != 44 && %2 != 4 +%ifdef m8 + pandn m5, m15, m3 ; ~mask(in) & mask(fm) +%else + mova m5, [rsp+%3+%4] + pandn m5, m3 +%endif + pandn m0, m5 ; ~mask(hev) & (~mask(in) & mask(fm)) +%else + pandn m0, m3 +%endif + SIGN_SUB m5, rq0, m6, m4 ; q0 - f1 + MASK_APPLY m5, m7, m0, m4 ; filter4(q0) & mask + mova [Q0], m5 + SIGN_ADD m7, rp0, m2, m4 ; p0 + f2 + MASK_APPLY m7, m1, m0, m4 ; filter4(p0) & mask + mova [P0], m7 + paddb m6, [pb_80] ; + pxor m1, m1 ; f=(f1+1)>>1 + pavgb m6, m1 ; + psubb m6, [pb_40] ; + SIGN_ADD m1, rp1, m6, m2 ; p1 + f + SIGN_SUB m4, rq1, m6, m2 ; q1 - f + MASK_APPLY m1, rp1, m0, m2 ; m1 = filter4(p1) + MASK_APPLY m4, rq1, m0, m2 ; m4 = filter4(q1) + mova [P1], m1 + mova [Q1], m4 + +%if %2 != 44 && %2 != 4 + UNSCRATCH 2, 15, rsp+%3+%4 +%endif + + ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1) + ; filter6() +%if %2 != 44 && %2 != 4 + pxor m0, m0 +%if %2 != 16 + pand m3, m2 +%else + pand m2, m3 ; mask(fm) & mask(in) +%ifdef m8 + pandn m3, m8, m2 ; ~mask(out) & (mask(fm) & mask(in)) +%else + mova m3, [rsp+%3+%4+16] + pandn m3, m2 +%endif +%endif +%ifdef m8 + mova m14, [P3] + mova m9, [Q3] +%define rp3 m14 +%define rq3 m9 +%else +%define rp3 [P3] +%define rq3 [Q3] +%endif + mova m1, [P2] + FILTER_INIT m4, m5, m6, m7, [P2], %4, 6, m3, m1 ; [p2] + mova m1, [Q2] + FILTER_UPDATE m4, m5, m6, m7, [P1], %4, 0, 1, 2, 5, 3, m3, "", rq1, "", 1 ; [p1] -p3 -p2 +p1 +q1 + FILTER_UPDATE m4, m5, m6, m7, [P0], %4, 0, 2, 3, 6, 3, m3, "", m1 ; [p0] -p3 -p1 +p0 +q2 + FILTER_UPDATE m4, m5, m6, m7, [Q0], %4, 0, 3, 4, 7, 3, m3, "", rq3, "", 1 ; [q0] -p3 -p0 +q0 +q3 + FILTER_UPDATE m4, m5, m6, m7, [Q1], %4, 1, 4, 5, 7, 3, m3, "" ; [q1] -p2 -q0 +q1 +q3 + FILTER_UPDATE m4, m5, m6, m7, [Q2], %4, 2, 5, 6, 7, 3, m3, m1 ; [q2] -p1 -q1 +q2 +q3 +%endif + +%if %2 == 16 + UNSCRATCH 1, 8, rsp+%3+%4+16 +%endif + + ; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2) + ; filter14() + ; + ; m2 m3 m8 m9 m14 m15 m10 m11 m12 m13 + ; + ; q2 q3 p3 p2 p1 p0 q0 q1 + ; p6 -7 p7 p6 p5 p4 . . . . . + ; p5 -6 -p7 -p6 +p5 +q1 . . . . + ; p4 -5 -p7 -p5 +p4 +q2 . . . q2 + ; p3 -4 -p7 -p4 +p3 +q3 . . . q3 + ; p2 -3 -p7 -p3 +p2 +q4 . . . q4 + ; p1 -2 -p7 -p2 +p1 +q5 . . . q5 + ; p0 -1 -p7 -p1 +p0 +q6 . . . q6 + ; q0 +0 -p7 -p0 +q0 +q7 . . . q7 + ; q1 +1 -p6 -q0 +q1 +q7 q1 . . . + ; q2 +2 -p5 -q1 +q2 +q7 . q2 . . + ; q3 +3 -p4 -q2 +q3 +q7 . q3 . . + ; q4 +4 -p3 -q3 +q4 +q7 . q4 . . + ; q5 +5 -p2 -q4 +q5 +q7 . q5 . . + ; q6 +6 -p1 -q5 +q6 +q7 . q6 . . + +%if %2 == 16 + pand m1, m2 ; mask(out) & (mask(fm) & mask(in)) + mova m2, [P7] + mova m3, [P6] +%ifdef m8 + mova m8, [P5] + mova m9, [P4] +%define rp5 m8 +%define rp4 m9 +%define rp5s m8 +%define rp4s m9 +%define rp3s m14 +%define rq4 m8 +%define rq5 m9 +%define rq6 m14 +%define rq7 m15 +%define rq4s m8 +%define rq5s m9 +%define rq6s m14 +%else +%define rp5 [P5] +%define rp4 [P4] +%define rp5s "" +%define rp4s "" +%define rp3s "" +%define rq4 [Q4] +%define rq5 [Q5] +%define rq6 [Q6] +%define rq7 [Q7] +%define rq4s "" +%define rq5s "" +%define rq6s "" +%endif + FILTER_INIT m4, m5, m6, m7, [P6], %4, 14, m1, m3 ; [p6] + FILTER_UPDATE m4, m5, m6, m7, [P5], %4, 8, 9, 10, 5, 4, m1, rp5s ; [p5] -p7 -p6 +p5 +q1 + FILTER_UPDATE m4, m5, m6, m7, [P4], %4, 8, 10, 11, 6, 4, m1, rp4s ; [p4] -p7 -p5 +p4 +q2 + FILTER_UPDATE m4, m5, m6, m7, [P3], %4, 8, 11, 0, 7, 4, m1, rp3s ; [p3] -p7 -p4 +p3 +q3 + FILTER_UPDATE m4, m5, m6, m7, [P2], %4, 8, 0, 1, 12, 4, m1, "", rq4, [Q4], 1 ; [p2] -p7 -p3 +p2 +q4 + FILTER_UPDATE m4, m5, m6, m7, [P1], %4, 8, 1, 2, 13, 4, m1, "", rq5, [Q5], 1 ; [p1] -p7 -p2 +p1 +q5 + FILTER_UPDATE m4, m5, m6, m7, [P0], %4, 8, 2, 3, 14, 4, m1, "", rq6, [Q6], 1 ; [p0] -p7 -p1 +p0 +q6 + FILTER_UPDATE m4, m5, m6, m7, [Q0], %4, 8, 3, 4, 15, 4, m1, "", rq7, [Q7], 1 ; [q0] -p7 -p0 +q0 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q1], %4, 9, 4, 5, 15, 4, m1, "" ; [q1] -p6 -q0 +q1 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q2], %4, 10, 5, 6, 15, 4, m1, "" ; [q2] -p5 -q1 +q2 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q3], %4, 11, 6, 7, 15, 4, m1, "" ; [q3] -p4 -q2 +q3 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q4], %4, 0, 7, 12, 15, 4, m1, rq4s ; [q4] -p3 -q3 +q4 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q5], %4, 1, 12, 13, 15, 4, m1, rq5s ; [q5] -p2 -q4 +q5 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q6], %4, 2, 13, 14, 15, 4, m1, rq6s ; [q6] -p1 -q5 +q6 +q7 +%endif + +%ifidn %1, h +%if %2 == 16 + mova m0, [P7] + mova m1, [P6] + mova m2, [P5] + mova m3, [P4] + mova m4, [P3] + mova m5, [P2] +%if ARCH_X86_64 + mova m6, [P1] +%endif + mova m7, [P0] +%if ARCH_X86_64 + mova m8, [Q0] + mova m9, [Q1] + mova m10, [Q2] + mova m11, [Q3] + mova m12, [Q4] + mova m13, [Q5] + mova m14, [Q6] + mova m15, [Q7] + TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp] + DEFINE_REAL_P7_TO_Q7 + movu [P7], m0 + movu [P6], m1 + movu [P5], m2 + movu [P4], m3 + movu [P3], m4 + movu [P2], m5 + movu [P1], m6 + movu [P0], m7 + movu [Q0], m8 + movu [Q1], m9 + movu [Q2], m10 + movu [Q3], m11 + movu [Q4], m12 + movu [Q5], m13 + movu [Q6], m14 + movu [Q7], m15 +%else + DEFINE_REAL_P7_TO_Q7 + TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+32], a, [rsp+%3+%4], [Q0], [Q1] + movh [P7], m0 + movh [P5], m1 + movh [P3], m2 + movh [P1], m3 + movh [Q2], m5 + movh [Q4], m6 + movh [Q6], m7 + movhps [P6], m0 + movhps [P4], m1 + movhps [P2], m2 + movhps [P0], m3 + movhps [Q3], m5 + movhps [Q5], m6 + movhps [Q7], m7 + DEFINE_TRANSPOSED_P7_TO_Q7 + mova m0, [Q0] + mova m1, [Q1] + mova m2, [Q2] + mova m3, [Q3] + mova m4, [Q4] + mova m5, [Q5] + mova m7, [Q7] + DEFINE_REAL_P7_TO_Q7 8 + TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+224], a, [rsp+%3+%4], [Q0], [Q1] + movh [P7], m0 + movh [P5], m1 + movh [P3], m2 + movh [P1], m3 + movh [Q2], m5 + movh [Q4], m6 + movh [Q6], m7 + movhps [P6], m0 + movhps [P4], m1 + movhps [P2], m2 + movhps [P0], m3 + movhps [Q3], m5 + movhps [Q5], m6 + movhps [Q7], m7 +%endif +%elif %2 == 44 || %2 == 4 + SWAP 0, 1 ; m0 = p1 + SWAP 1, 7 ; m1 = p0 + SWAP 2, 5 ; m2 = q0 + SWAP 3, 4 ; m3 = q1 + DEFINE_REAL_P7_TO_Q7 2 + SBUTTERFLY bw, 0, 1, 4 + SBUTTERFLY bw, 2, 3, 4 + SBUTTERFLY wd, 0, 2, 4 + SBUTTERFLY wd, 1, 3, 4 +%if mmsize == 16 + movd [P7], m0 + movd [P3], m2 + movd [Q0], m1 + movd [Q4], m3 + psrldq m0, 4 + psrldq m1, 4 + psrldq m2, 4 + psrldq m3, 4 + movd [P6], m0 + movd [P2], m2 + movd [Q1], m1 + movd [Q5], m3 + psrldq m0, 4 + psrldq m1, 4 + psrldq m2, 4 + psrldq m3, 4 + movd [P5], m0 + movd [P1], m2 + movd [Q2], m1 + movd [Q6], m3 + psrldq m0, 4 + psrldq m1, 4 + psrldq m2, 4 + psrldq m3, 4 + movd [P4], m0 + movd [P0], m2 + movd [Q3], m1 + movd [Q7], m3 +%else + movd [P7], m0 + movd [P5], m2 + movd [P3], m1 + movd [P1], m3 + psrlq m0, 32 + psrlq m2, 32 + psrlq m1, 32 + psrlq m3, 32 + movd [P6], m0 + movd [P4], m2 + movd [P2], m1 + movd [P0], m3 +%endif +%else + ; the following code do a transpose of 8 full lines to 16 half + ; lines (high part). It is inlined to avoid the need of a staging area + mova m0, [P3] + mova m1, [P2] + mova m2, [P1] + mova m3, [P0] + mova m4, [Q0] + mova m5, [Q1] +%ifdef m8 + mova m6, [Q2] +%endif + mova m7, [Q3] + DEFINE_REAL_P7_TO_Q7 +%ifdef m8 + SBUTTERFLY bw, 0, 1, 8 + SBUTTERFLY bw, 2, 3, 8 + SBUTTERFLY bw, 4, 5, 8 + SBUTTERFLY bw, 6, 7, 8 + SBUTTERFLY wd, 0, 2, 8 + SBUTTERFLY wd, 1, 3, 8 + SBUTTERFLY wd, 4, 6, 8 + SBUTTERFLY wd, 5, 7, 8 + SBUTTERFLY dq, 0, 4, 8 + SBUTTERFLY dq, 1, 5, 8 + SBUTTERFLY dq, 2, 6, 8 + SBUTTERFLY dq, 3, 7, 8 +%else + SBUTTERFLY bw, 0, 1, 6 + mova [rsp+mmsize*4], m1 + mova m6, [rsp+mmsize*6] + SBUTTERFLY bw, 2, 3, 1 + SBUTTERFLY bw, 4, 5, 1 + SBUTTERFLY bw, 6, 7, 1 + SBUTTERFLY wd, 0, 2, 1 + mova [rsp+mmsize*6], m2 + mova m1, [rsp+mmsize*4] + SBUTTERFLY wd, 1, 3, 2 + SBUTTERFLY wd, 4, 6, 2 + SBUTTERFLY wd, 5, 7, 2 + SBUTTERFLY dq, 0, 4, 2 + SBUTTERFLY dq, 1, 5, 2 +%if mmsize == 16 + movh [Q0], m1 + movhps [Q1], m1 +%else + mova [P3], m1 +%endif + mova m2, [rsp+mmsize*6] + SBUTTERFLY dq, 2, 6, 1 + SBUTTERFLY dq, 3, 7, 1 +%endif + SWAP 3, 6 + SWAP 1, 4 +%if mmsize == 16 + movh [P7], m0 + movhps [P6], m0 + movh [P5], m1 + movhps [P4], m1 + movh [P3], m2 + movhps [P2], m2 + movh [P1], m3 + movhps [P0], m3 +%ifdef m8 + movh [Q0], m4 + movhps [Q1], m4 +%endif + movh [Q2], m5 + movhps [Q3], m5 + movh [Q4], m6 + movhps [Q5], m6 + movh [Q6], m7 + movhps [Q7], m7 +%else + mova [P7], m0 + mova [P6], m1 + mova [P5], m2 + mova [P4], m3 + mova [P2], m5 + mova [P1], m6 + mova [P0], m7 +%endif +%endif +%endif + + RET +%endmacro + +%macro LPF_16_VH 5 +INIT_XMM %5 +LOOPFILTER v, %1, %2, 0, %4 +LOOPFILTER h, %1, %2, %3, %4 +%endmacro + +%macro LPF_16_VH_ALL_OPTS 4 +LPF_16_VH %1, %2, %3, %4, sse2 +LPF_16_VH %1, %2, %3, %4, ssse3 +LPF_16_VH %1, %2, %3, %4, avx +%endmacro + +LPF_16_VH_ALL_OPTS 16, 512, 256, 32 +LPF_16_VH_ALL_OPTS 44, 0, 128, 0 +LPF_16_VH_ALL_OPTS 48, 256, 128, 16 +LPF_16_VH_ALL_OPTS 84, 256, 128, 16 +LPF_16_VH_ALL_OPTS 88, 256, 128, 16 + +INIT_MMX mmxext +LOOPFILTER v, 4, 0, 0, 0 +LOOPFILTER h, 4, 0, 64, 0 +LOOPFILTER v, 8, 128, 0, 8 +LOOPFILTER h, 8, 128, 64, 8 diff --git a/media/ffvpx/libavcodec/x86/vp9lpf_16bpp.asm b/media/ffvpx/libavcodec/x86/vp9lpf_16bpp.asm new file mode 100644 index 0000000000..c0888170c9 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9lpf_16bpp.asm @@ -0,0 +1,823 @@ +;****************************************************************************** +;* VP9 loop filter SIMD optimizations +;* +;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_511: times 16 dw 511 +pw_2047: times 16 dw 2047 +pw_16384: times 16 dw 16384 +pw_m512: times 16 dw -512 +pw_m2048: times 16 dw -2048 + +cextern pw_1 +cextern pw_3 +cextern pw_4 +cextern pw_8 +cextern pw_16 +cextern pw_256 +cextern pw_1023 +cextern pw_4095 +cextern pw_m1 + +SECTION .text + +%macro SCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%if %0 == 4 +%define reg_%4 m%2 +%endif +%else + mova [%3], m%1 +%if %0 == 4 +%define reg_%4 [%3] +%endif +%endif +%endmacro + +%macro UNSCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%if %0 == 4 +%undef reg_%4 +%endif +%endmacro + +%macro PRELOAD 2-3 +%if ARCH_X86_64 + mova m%1, [%2] +%if %0 == 3 +%define reg_%3 m%1 +%endif +%elif %0 == 3 +%define reg_%3 [%2] +%endif +%endmacro + +; calculate p or q portion of flat8out +%macro FLAT8OUT_HALF 0 + psubw m4, m0 ; q4-q0 + psubw m5, m0 ; q5-q0 + psubw m6, m0 ; q6-q0 + psubw m7, m0 ; q7-q0 + ABS2 m4, m5, m2, m3 ; abs(q4-q0) | abs(q5-q0) + ABS2 m6, m7, m2, m3 ; abs(q6-q0) | abs(q7-q0) + pcmpgtw m4, reg_F ; abs(q4-q0) > F + pcmpgtw m5, reg_F ; abs(q5-q0) > F + pcmpgtw m6, reg_F ; abs(q6-q0) > F + pcmpgtw m7, reg_F ; abs(q7-q0) > F + por m5, m4 + por m7, m6 + por m7, m5 ; !flat8out, q portion +%endmacro + +; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition) +%macro FLAT8IN_HALF 1 +%if %1 > 4 + psubw m4, m3, m0 ; q3-q0 + psubw m5, m2, m0 ; q2-q0 + ABS2 m4, m5, m6, m7 ; abs(q3-q0) | abs(q2-q0) + pcmpgtw m4, reg_F ; abs(q3-q0) > F + pcmpgtw m5, reg_F ; abs(q2-q0) > F +%endif + psubw m3, m2 ; q3-q2 + psubw m2, m1 ; q2-q1 + ABS2 m3, m2, m6, m7 ; abs(q3-q2) | abs(q2-q1) + pcmpgtw m3, reg_I ; abs(q3-q2) > I + pcmpgtw m2, reg_I ; abs(q2-q1) > I +%if %1 > 4 + por m4, m5 +%endif + por m2, m3 + psubw m3, m1, m0 ; q1-q0 + ABS1 m3, m5 ; abs(q1-q0) +%if %1 > 4 + pcmpgtw m6, m3, reg_F ; abs(q1-q0) > F +%endif + pcmpgtw m7, m3, reg_H ; abs(q1-q0) > H + pcmpgtw m3, reg_I ; abs(q1-q0) > I +%if %1 > 4 + por m4, m6 +%endif + por m2, m3 +%endmacro + +; one step in filter_14/filter_6 +; +; take sum $reg, downshift, apply mask and write into dst +; +; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next +; step's sum $reg. This is omitted for the last row in each filter. +; +; if dont_store is set, don't write the result into memory, instead keep the +; values in register so we can write it out later +%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \ + ; src/sub1, sub2, add1, add2, dont_store + psrlw %1, %2, %4 + psubw %1, %6 ; abs->delta +%ifnidn %7, "" + psubw %2, %6 + psubw %2, %7 + paddw %2, %8 + paddw %2, %9 +%endif + pand %1, reg_%3 ; apply mask +%if %10 == 1 + paddw %6, %1 ; delta->abs +%else + paddw %1, %6 ; delta->abs + mova [%5], %1 +%endif +%endmacro + +; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8} + +%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12] + +%if ARCH_X86_64 +%if %2 == 16 +%assign %%num_xmm_regs 16 +%elif %2 == 8 +%assign %%num_xmm_regs 15 +%else ; %2 == 4 +%assign %%num_xmm_regs 14 +%endif ; %2 +%assign %%bak_mem 0 +%else ; ARCH_X86_32 +%assign %%num_xmm_regs 8 +%if %2 == 16 +%assign %%bak_mem 7 +%elif %2 == 8 +%assign %%bak_mem 6 +%else ; %2 == 4 +%assign %%bak_mem 5 +%endif ; %2 +%endif ; ARCH_X86_64/32 + +%if %2 == 16 +%ifidn %1, v +%assign %%num_gpr_regs 6 +%else ; %1 == h +%assign %%num_gpr_regs 5 +%endif ; %1 +%assign %%wd_mem 6 +%else ; %2 == 8/4 +%assign %%num_gpr_regs 5 +%if ARCH_X86_32 && %2 == 8 +%assign %%wd_mem 2 +%else ; ARCH_X86_64 || %2 == 4 +%assign %%wd_mem 0 +%endif ; ARCH_X86_64/32 etc. +%endif ; %2 + +%ifidn %1, v +%assign %%tsp_mem 0 +%elif %2 == 16 ; && %1 == h +%assign %%tsp_mem 16 +%else ; %1 == h && %1 == 8/4 +%assign %%tsp_mem 8 +%endif ; %1/%2 + +%assign %%off %%wd_mem +%assign %%tspoff %%bak_mem+%%wd_mem +%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize) + +%if %3 == 10 +%define %%maxsgn 511 +%define %%minsgn m512 +%define %%maxusgn 1023 +%define %%maxf 4 +%else ; %3 == 12 +%define %%maxsgn 2047 +%define %%minsgn m2048 +%define %%maxusgn 4095 +%define %%maxf 16 +%endif ; %3 + +cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H + ; prepare E, I and H masks + shl Ed, %3-8 + shl Id, %3-8 + shl Hd, %3-8 +%if cpuflag(ssse3) + mova m0, [pw_256] +%endif + movd m1, Ed + movd m2, Id + movd m3, Hd +%if cpuflag(ssse3) + pshufb m1, m0 ; E << (bit_depth - 8) + pshufb m2, m0 ; I << (bit_depth - 8) + pshufb m3, m0 ; H << (bit_depth - 8) +%else + punpcklwd m1, m1 + punpcklwd m2, m2 + punpcklwd m3, m3 + pshufd m1, m1, q0000 + pshufd m2, m2, q0000 + pshufd m3, m3, q0000 +%endif + SCRATCH 1, 8, rsp+(%%off+0)*mmsize, E + SCRATCH 2, 9, rsp+(%%off+1)*mmsize, I + SCRATCH 3, 10, rsp+(%%off+2)*mmsize, H +%if %2 > 4 + PRELOAD 11, pw_ %+ %%maxf, F +%endif + + ; set up variables to load data +%ifidn %1, v + DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12 + lea stride3q, [strideq*3] + neg strideq +%if %2 == 16 + lea dst0q, [dst8q+strideq*8] +%else + lea dst4q, [dst8q+strideq*4] +%endif + neg strideq +%if %2 == 16 + lea dst12q, [dst8q+strideq*4] + lea dst4q, [dst0q+strideq*4] +%endif + +%if %2 == 16 +%define %%p7 dst0q +%define %%p6 dst0q+strideq +%define %%p5 dst0q+strideq*2 +%define %%p4 dst0q+stride3q +%endif +%define %%p3 dst4q +%define %%p2 dst4q+strideq +%define %%p1 dst4q+strideq*2 +%define %%p0 dst4q+stride3q +%define %%q0 dst8q +%define %%q1 dst8q+strideq +%define %%q2 dst8q+strideq*2 +%define %%q3 dst8q+stride3q +%if %2 == 16 +%define %%q4 dst12q +%define %%q5 dst12q+strideq +%define %%q6 dst12q+strideq*2 +%define %%q7 dst12q+stride3q +%endif +%else ; %1 == h + DEFINE_ARGS dst0, stride, stride3, dst4 + lea stride3q, [strideq*3] + lea dst4q, [dst0q+strideq*4] + +%define %%p3 rsp+(%%tspoff+0)*mmsize +%define %%p2 rsp+(%%tspoff+1)*mmsize +%define %%p1 rsp+(%%tspoff+2)*mmsize +%define %%p0 rsp+(%%tspoff+3)*mmsize +%define %%q0 rsp+(%%tspoff+4)*mmsize +%define %%q1 rsp+(%%tspoff+5)*mmsize +%define %%q2 rsp+(%%tspoff+6)*mmsize +%define %%q3 rsp+(%%tspoff+7)*mmsize + +%if %2 < 16 + movu m0, [dst0q+strideq*0-8] + movu m1, [dst0q+strideq*1-8] + movu m2, [dst0q+strideq*2-8] + movu m3, [dst0q+stride3q -8] + movu m4, [dst4q+strideq*0-8] + movu m5, [dst4q+strideq*1-8] + movu m6, [dst4q+strideq*2-8] + movu m7, [dst4q+stride3q -8] + +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0] +%endif + + mova [%%p3], m0 + mova [%%p2], m1 + mova [%%p1], m2 + mova [%%p0], m3 +%if ARCH_X86_64 + mova [%%q0], m4 +%endif + mova [%%q1], m5 + mova [%%q2], m6 + mova [%%q3], m7 + + ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register + ; order here accordingly +%else ; %2 == 16 + +%define %%p7 rsp+(%%tspoff+ 8)*mmsize +%define %%p6 rsp+(%%tspoff+ 9)*mmsize +%define %%p5 rsp+(%%tspoff+10)*mmsize +%define %%p4 rsp+(%%tspoff+11)*mmsize +%define %%q4 rsp+(%%tspoff+12)*mmsize +%define %%q5 rsp+(%%tspoff+13)*mmsize +%define %%q6 rsp+(%%tspoff+14)*mmsize +%define %%q7 rsp+(%%tspoff+15)*mmsize + + mova m0, [dst0q+strideq*0-16] + mova m1, [dst0q+strideq*1-16] + mova m2, [dst0q+strideq*2-16] + mova m3, [dst0q+stride3q -16] + mova m4, [dst4q+strideq*0-16] + mova m5, [dst4q+strideq*1-16] +%if ARCH_X86_64 + mova m6, [dst4q+strideq*2-16] +%endif + mova m7, [dst4q+stride3q -16] + +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1 +%endif + + mova [%%p7], m0 + mova [%%p6], m1 + mova [%%p5], m2 + mova [%%p4], m3 +%if ARCH_X86_64 + mova [%%p3], m4 +%endif + mova [%%p2], m5 + mova [%%p1], m6 + mova [%%p0], m7 + + mova m0, [dst0q+strideq*0] + mova m1, [dst0q+strideq*1] + mova m2, [dst0q+strideq*2] + mova m3, [dst0q+stride3q ] + mova m4, [dst4q+strideq*0] + mova m5, [dst4q+strideq*1] +%if ARCH_X86_64 + mova m6, [dst4q+strideq*2] +%endif + mova m7, [dst4q+stride3q ] + +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1 +%endif + + mova [%%q0], m0 + mova [%%q1], m1 + mova [%%q2], m2 + mova [%%q3], m3 +%if ARCH_X86_64 + mova [%%q4], m4 +%endif + mova [%%q5], m5 + mova [%%q6], m6 + mova [%%q7], m7 + + ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register + ; order here accordingly +%endif ; %2 +%endif ; %1 + + ; load q0|q4-7 data + mova m0, [%%q0] +%if %2 == 16 + mova m4, [%%q4] + mova m5, [%%q5] + mova m6, [%%q6] + mova m7, [%%q7] + + ; flat8out q portion + FLAT8OUT_HALF + SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O +%endif + + ; load q1-3 data + mova m1, [%%q1] + mova m2, [%%q2] + mova m3, [%%q3] + + ; r6-8|pw_4[m8-11]=reg_E/I/H/F + ; r9[m15]=!flatout[q] + ; m12-14=free + ; m0-3=q0-q3 + ; m4-7=free + + ; flat8in|fm|hev q portion + FLAT8IN_HALF %2 + SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV +%if %2 > 4 + SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8I +%endif + + ; r6-8|pw_4[m8-11]=reg_E/I/H/F + ; r9[m15]=!flat8out[q] + ; r10[m13]=hev[q] + ; r11[m14]=!flat8in[q] + ; m2=!fm[q] + ; m0,1=q0-q1 + ; m2-7=free + ; m12=free + + ; load p0-1 + mova m3, [%%p0] + mova m4, [%%p1] + + ; fm mb_edge portion + psubw m5, m3, m0 ; q0-p0 + psubw m6, m4, m1 ; q1-p1 +%if ARCH_X86_64 + ABS2 m5, m6, m7, m12 ; abs(q0-p0) | abs(q1-p1) +%else + ABS1 m5, m7 ; abs(q0-p0) + ABS1 m6, m7 ; abs(q1-p1) +%endif + paddw m5, m5 + psraw m6, 1 + paddw m6, m5 ; abs(q0-p0)*2+(abs(q1-p1)>>1) + pcmpgtw m6, reg_E + por m2, m6 + SCRATCH 2, 12, rsp+(%%off+3)*mmsize, FM + + ; r6-8|pw_4[m8-11]=reg_E/I/H/F + ; r9[m15]=!flat8out[q] + ; r10[m13]=hev[q] + ; r11[m14]=!flat8in[q] + ; r12[m12]=!fm[q] + ; m3-4=q0-1 + ; m0-2/5-7=free + + ; load p4-7 data + SWAP 3, 0 ; p0 + SWAP 4, 1 ; p1 +%if %2 == 16 + mova m7, [%%p7] + mova m6, [%%p6] + mova m5, [%%p5] + mova m4, [%%p4] + + ; flat8out p portion + FLAT8OUT_HALF + por m7, reg_F8O + SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O +%endif + + ; r6-8|pw_4[m8-11]=reg_E/I/H/F + ; r9[m15]=!flat8out + ; r10[m13]=hev[q] + ; r11[m14]=!flat8in[q] + ; r12[m12]=!fm[q] + ; m0=p0 + ; m1-7=free + + ; load p2-3 data + mova m2, [%%p2] + mova m3, [%%p3] + + ; flat8in|fm|hev p portion + FLAT8IN_HALF %2 + por m7, reg_HEV +%if %2 > 4 + por m4, reg_F8I +%endif + por m2, reg_FM +%if %2 > 4 + por m4, m2 ; !flat8|!fm +%if %2 == 16 + por m5, m4, reg_F8O ; !flat16|!fm + pandn m2, m4 ; filter4_mask + pandn m4, m5 ; filter8_mask + pxor m5, [pw_m1] ; filter16_mask + SCRATCH 5, 15, rsp+(%%off+6)*mmsize, F16M +%else + pandn m2, m4 ; filter4_mask + pxor m4, [pw_m1] ; filter8_mask +%endif + SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8M +%else + pxor m2, [pw_m1] ; filter4_mask +%endif + SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV + SCRATCH 2, 12, rsp+(%%off+3)*mmsize, F4M + + ; r9[m15]=filter16_mask + ; r10[m13]=hev + ; r11[m14]=filter8_mask + ; r12[m12]=filter4_mask + ; m0,1=p0-p1 + ; m2-7=free + ; m8-11=free + +%if %2 > 4 +%if %2 == 16 + ; filter_14 + mova m2, [%%p7] + mova m3, [%%p6] + mova m6, [%%p5] + mova m7, [%%p4] + PRELOAD 8, %%p3, P3 + PRELOAD 9, %%p2, P2 +%endif + PRELOAD 10, %%q0, Q0 + PRELOAD 11, %%q1, Q1 +%if %2 == 16 + psllw m4, m2, 3 + paddw m5, m3, m3 + paddw m4, m6 + paddw m5, m7 + paddw m4, reg_P3 + paddw m5, reg_P2 + paddw m4, m1 + paddw m5, m0 + paddw m4, reg_Q0 ; q0+p1+p3+p5+p7*8 + psubw m5, m2 ; p0+p2+p4+p6*2-p7 + paddw m4, [pw_8] + paddw m5, m4 ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8 + + ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction + ; at the end of the filter + + mova [rsp+0*mmsize], m3 + FILTER_STEP m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1 +%endif + mova m3, [%%q2] +%if %2 == 16 + mova [rsp+1*mmsize], m6 + FILTER_STEP m4, m5, F16M, 4, %%p5, m6, m2, m7, m3 +%endif + mova m6, [%%q3] +%if %2 == 16 + mova [rsp+2*mmsize], m7 + FILTER_STEP m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6 + mova m7, [%%q4] +%if ARCH_X86_64 + mova [rsp+3*mmsize], reg_P3 +%else + mova m4, reg_P3 + mova [rsp+3*mmsize], m4 +%endif + FILTER_STEP m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7 + PRELOAD 8, %%q5, Q5 +%if ARCH_X86_64 + mova [rsp+4*mmsize], reg_P2 +%else + mova m4, reg_P2 + mova [rsp+4*mmsize], m4 +%endif + FILTER_STEP m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5 + PRELOAD 9, %%q6, Q6 + mova [rsp+5*mmsize], m1 + FILTER_STEP m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6 + mova m1, [%%q7] + FILTER_STEP m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1 + FILTER_STEP m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64 + FILTER_STEP m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64 + FILTER_STEP m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1 + FILTER_STEP m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1 + FILTER_STEP m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1 + FILTER_STEP m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1 + FILTER_STEP m4, m5, F16M, 4, %%q6, reg_Q6 + + mova m7, [%%p1] +%else + SWAP 1, 7 +%endif + + mova m2, [%%p3] + mova m1, [%%p2] + + ; reg_Q0-1 (m10-m11) + ; m0=p0 + ; m1=p2 + ; m2=p3 + ; m3=q2 + ; m4-5=free + ; m6=q3 + ; m7=p1 + ; m8-9 unused + + ; filter_6 + psllw m4, m2, 2 + paddw m5, m1, m1 + paddw m4, m7 + psubw m5, m2 + paddw m4, m0 + paddw m5, reg_Q0 + paddw m4, [pw_4] + paddw m5, m4 + +%if ARCH_X86_64 + mova m8, m1 + mova m9, m7 +%else + mova [rsp+0*mmsize], m1 + mova [rsp+1*mmsize], m7 +%endif +%ifidn %1, v + FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1 +%else + FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1 +%endif + FILTER_STEP m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1 + FILTER_STEP m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1 +%if ARCH_X86_64 + FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64 + FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64 +%else + FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64 + FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64 +%endif + FILTER_STEP m4, m5, F8M, 3, %%q2, m3 + + UNSCRATCH 2, 10, %%q0 + UNSCRATCH 6, 11, %%q1 +%else + SWAP 1, 7 + mova m2, [%%q0] + mova m6, [%%q1] +%endif + UNSCRATCH 3, 13, rsp+(%%off+4)*mmsize, HEV + + ; m0=p0 + ; m1=p2 + ; m2=q0 + ; m3=hev_mask + ; m4-5=free + ; m6=q1 + ; m7=p1 + + ; filter_4 + psubw m4, m7, m6 ; p1-q1 + psubw m5, m2, m0 ; q0-p0 + pand m4, m3 + pminsw m4, [pw_ %+ %%maxsgn] + pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(p1-q1, 9) -> f + paddw m4, m5 + paddw m5, m5 + paddw m4, m5 ; 3*(q0-p0)+f + pminsw m4, [pw_ %+ %%maxsgn] + pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(3*(q0-p0)+f, 9) -> f + pand m4, reg_F4M + paddw m5, m4, [pw_4] + paddw m4, [pw_3] + pminsw m5, [pw_ %+ %%maxsgn] + pminsw m4, [pw_ %+ %%maxsgn] + psraw m5, 3 ; min_intp2(f+4, 9)>>3 -> f1 + psraw m4, 3 ; min_intp2(f+3, 9)>>3 -> f2 + psubw m2, m5 ; q0-f1 + paddw m0, m4 ; p0+f2 + pandn m3, m5 ; f1 & !hev (for p1/q1 adj) + pxor m4, m4 + mova m5, [pw_ %+ %%maxusgn] + pmaxsw m2, m4 + pmaxsw m0, m4 + pminsw m2, m5 + pminsw m0, m5 +%if cpuflag(ssse3) + pmulhrsw m3, [pw_16384] ; (f1+1)>>1 +%else + paddw m3, [pw_1] + psraw m3, 1 +%endif + paddw m7, m3 ; p1+f + psubw m6, m3 ; q1-f + pmaxsw m7, m4 + pmaxsw m6, m4 + pminsw m7, m5 + pminsw m6, m5 + + ; store +%ifidn %1, v + mova [%%p1], m7 + mova [%%p0], m0 + mova [%%q0], m2 + mova [%%q1], m6 +%else ; %1 == h +%if %2 == 4 + TRANSPOSE4x4W 7, 0, 2, 6, 1 + movh [dst0q+strideq*0-4], m7 + movhps [dst0q+strideq*1-4], m7 + movh [dst0q+strideq*2-4], m0 + movhps [dst0q+stride3q -4], m0 + movh [dst4q+strideq*0-4], m2 + movhps [dst4q+strideq*1-4], m2 + movh [dst4q+strideq*2-4], m6 + movhps [dst4q+stride3q -4], m6 +%elif %2 == 8 + mova m3, [%%p3] + mova m4, [%%q2] + mova m5, [%%q3] + +%if ARCH_X86_64 + TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, 8 +%else + TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1 + mova m2, [%%q0] +%endif + + movu [dst0q+strideq*0-8], m3 + movu [dst0q+strideq*1-8], m1 + movu [dst0q+strideq*2-8], m7 + movu [dst0q+stride3q -8], m0 + movu [dst4q+strideq*0-8], m2 + movu [dst4q+strideq*1-8], m6 + movu [dst4q+strideq*2-8], m4 + movu [dst4q+stride3q -8], m5 +%else ; %2 == 16 + SCRATCH 2, 8, %%q0 + SCRATCH 6, 9, %%q1 + mova m2, [%%p7] + mova m3, [%%p6] + mova m4, [%%p5] + mova m5, [%%p4] + mova m6, [%%p3] + +%if ARCH_X86_64 + TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, 10 +%else + mova [%%p1], m7 + TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1 +%endif + + mova [dst0q+strideq*0-16], m2 + mova [dst0q+strideq*1-16], m3 + mova [dst0q+strideq*2-16], m4 + mova [dst0q+stride3q -16], m5 +%if ARCH_X86_64 + mova [dst4q+strideq*0-16], m6 +%endif + mova [dst4q+strideq*1-16], m1 + mova [dst4q+strideq*2-16], m7 + mova [dst4q+stride3q -16], m0 + + UNSCRATCH 2, 8, %%q0 + UNSCRATCH 6, 9, %%q1 + mova m0, [%%q2] + mova m1, [%%q3] + mova m3, [%%q4] + mova m4, [%%q5] +%if ARCH_X86_64 + mova m5, [%%q6] +%endif + mova m7, [%%q7] + +%if ARCH_X86_64 + TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, 8 +%else + TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1 +%endif + + mova [dst0q+strideq*0], m2 + mova [dst0q+strideq*1], m6 + mova [dst0q+strideq*2], m0 + mova [dst0q+stride3q ], m1 +%if ARCH_X86_64 + mova [dst4q+strideq*0], m3 +%endif + mova [dst4q+strideq*1], m4 + mova [dst4q+strideq*2], m5 + mova [dst4q+stride3q ], m7 +%endif ; %2 +%endif ; %1 + RET +%endmacro + +%macro LOOP_FILTER_CPUSETS 3 +INIT_XMM sse2 +LOOP_FILTER %1, %2, %3 +INIT_XMM ssse3 +LOOP_FILTER %1, %2, %3 +INIT_XMM avx +LOOP_FILTER %1, %2, %3 +%endmacro + +%macro LOOP_FILTER_WDSETS 2 +LOOP_FILTER_CPUSETS %1, 4, %2 +LOOP_FILTER_CPUSETS %1, 8, %2 +LOOP_FILTER_CPUSETS %1, 16, %2 +%endmacro + +LOOP_FILTER_WDSETS h, 10 +LOOP_FILTER_WDSETS v, 10 +LOOP_FILTER_WDSETS h, 12 +LOOP_FILTER_WDSETS v, 12 diff --git a/media/ffvpx/libavcodec/x86/vp9mc.asm b/media/ffvpx/libavcodec/x86/vp9mc.asm new file mode 100644 index 0000000000..efc4cfbef1 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9mc.asm @@ -0,0 +1,680 @@ +;****************************************************************************** +;* VP9 motion compensation SIMD optimizations +;* +;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +cextern pw_256 +cextern pw_64 + +%macro F8_SSSE3_TAPS 8 +times 16 db %1, %2 +times 16 db %3, %4 +times 16 db %5, %6 +times 16 db %7, %8 +%endmacro + +%macro F8_SSE2_TAPS 8 +times 8 dw %1 +times 8 dw %2 +times 8 dw %3 +times 8 dw %4 +times 8 dw %5 +times 8 dw %6 +times 8 dw %7 +times 8 dw %8 +%endmacro + +%macro F8_16BPP_TAPS 8 +times 8 dw %1, %2 +times 8 dw %3, %4 +times 8 dw %5, %6 +times 8 dw %7, %8 +%endmacro + +%macro FILTER 1 +const filters_%1 ; smooth + F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0 + F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0 + F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0 + F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0 + F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0 + F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0 + F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1 + F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1 + F8_TAPS -1, -4, 12, 53, 57, 16, -4, -1 + F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1 + F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2 + F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2 + F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2 + F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2 + F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3 + ; regular + F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0 + F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0 + F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1 + F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1 + F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1 + F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1 + F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1 + F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1 + F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1 + F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1 + F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1 + F8_TAPS -1, 4, -11, 37, 112, -16, 4, -1 + F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1 + F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1 + F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0 + ; sharp + F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0 + F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1 + F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2 + F8_TAPS -4, 9, -20, 115, 37, -13, 6, -2 + F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3 + F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3 + F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4 + F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4 + F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4 + F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4 + F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4 + F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4 + F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3 + F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2 + F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1 +%endmacro + +%define F8_TAPS F8_SSSE3_TAPS +; int8_t ff_filters_ssse3[3][15][4][32] +FILTER ssse3 +%define F8_TAPS F8_SSE2_TAPS +; int16_t ff_filters_sse2[3][15][8][8] +FILTER sse2 +%define F8_TAPS F8_16BPP_TAPS +; int16_t ff_filters_16bpp[3][15][4][16] +FILTER 16bpp + +SECTION .text + +%macro filter_sse2_h_fn 1 +%assign %%px mmsize/2 +cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery + pxor m5, m5 + mova m6, [pw_64] + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+ 16] + mova m9, [filteryq+ 32] + mova m10, [filteryq+ 48] + mova m11, [filteryq+ 64] + mova m12, [filteryq+ 80] + mova m13, [filteryq+ 96] + mova m14, [filteryq+112] +%endif +.loop: + movh m0, [srcq-3] + movh m1, [srcq-2] + movh m2, [srcq-1] + movh m3, [srcq+0] + movh m4, [srcq+1] + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmullw m1, m8 + pmullw m2, m9 + pmullw m3, m10 + pmullw m4, m11 +%else + pmullw m1, [filteryq+ 16] + pmullw m2, [filteryq+ 32] + pmullw m3, [filteryq+ 48] + pmullw m4, [filteryq+ 64] +%endif + paddw m0, m1 + paddw m2, m3 + paddw m0, m4 + movh m1, [srcq+2] + movh m3, [srcq+3] + movh m4, [srcq+4] + add srcq, sstrideq + punpcklbw m1, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 +%if ARCH_X86_64 && mmsize > 8 + pmullw m1, m12 + pmullw m3, m13 + pmullw m4, m14 +%else + pmullw m1, [filteryq+ 80] + pmullw m3, [filteryq+ 96] + pmullw m4, [filteryq+112] +%endif + paddw m0, m1 + paddw m3, m4 + paddw m0, m6 + paddw m2, m3 + paddsw m0, m2 + psraw m0, 7 +%ifidn %1, avg + movh m1, [dstq] +%endif + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_MMX mmxext +filter_sse2_h_fn put +filter_sse2_h_fn avg + +INIT_XMM sse2 +filter_sse2_h_fn put +filter_sse2_h_fn avg + +%macro filter_h_fn 1 +%assign %%px mmsize/2 +cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery + mova m6, [pw_256] + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+32] + mova m9, [filteryq+64] + mova m10, [filteryq+96] +%endif +.loop: + movh m0, [srcq-3] + movh m1, [srcq-2] + movh m2, [srcq-1] + movh m3, [srcq+0] + movh m4, [srcq+1] + movh m5, [srcq+2] + punpcklbw m0, m1 + punpcklbw m2, m3 + movh m1, [srcq+3] + movh m3, [srcq+4] + add srcq, sstrideq + punpcklbw m4, m5 + punpcklbw m1, m3 + pmaddubsw m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddubsw m2, m8 + pmaddubsw m4, m9 + pmaddubsw m1, m10 +%else + pmaddubsw m2, [filteryq+32] + pmaddubsw m4, [filteryq+64] + pmaddubsw m1, [filteryq+96] +%endif + paddw m0, m4 + paddw m2, m1 + paddsw m0, m2 + pmulhrsw m0, m6 +%ifidn %1, avg + movh m1, [dstq] +%endif + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_MMX ssse3 +filter_h_fn put +filter_h_fn avg + +INIT_XMM ssse3 +filter_h_fn put +filter_h_fn avg + +%if ARCH_X86_64 +%macro filter_hx2_fn 1 +%assign %%px mmsize +cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery + mova m13, [pw_256] + mova m8, [filteryq+ 0] + mova m9, [filteryq+32] + mova m10, [filteryq+64] + mova m11, [filteryq+96] +.loop: + movu m0, [srcq-3] + movu m1, [srcq-2] + movu m2, [srcq-1] + movu m3, [srcq+0] + movu m4, [srcq+1] + movu m5, [srcq+2] + movu m6, [srcq+3] + movu m7, [srcq+4] + add srcq, sstrideq + SBUTTERFLY bw, 0, 1, 12 + SBUTTERFLY bw, 2, 3, 12 + SBUTTERFLY bw, 4, 5, 12 + SBUTTERFLY bw, 6, 7, 12 + pmaddubsw m0, m8 + pmaddubsw m1, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + paddsw m0, m2 + paddsw m1, m3 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 +%ifidn %1, avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +filter_hx2_fn put +filter_hx2_fn avg + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +filter_hx2_fn put +filter_hx2_fn avg +%endif + +%endif ; ARCH_X86_64 + +%macro filter_sse2_v_fn 1 +%assign %%px mmsize/2 +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%define hd r4mp +%endif + pxor m5, m5 + mova m6, [pw_64] + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+ 16] + mova m9, [filteryq+ 32] + mova m10, [filteryq+ 48] + mova m11, [filteryq+ 64] + mova m12, [filteryq+ 80] + mova m13, [filteryq+ 96] + mova m14, [filteryq+112] +%endif +.loop: + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? + movh m0, [srcq] + movh m1, [srcq+sstrideq] + movh m2, [srcq+sstrideq*2] + movh m3, [srcq+sstride3q] + add srcq, sstrideq + movh m4, [src4q] + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmullw m1, m8 + pmullw m2, m9 + pmullw m3, m10 + pmullw m4, m11 +%else + pmullw m1, [filteryq+ 16] + pmullw m2, [filteryq+ 32] + pmullw m3, [filteryq+ 48] + pmullw m4, [filteryq+ 64] +%endif + paddw m0, m1 + paddw m2, m3 + paddw m0, m4 + movh m1, [src4q+sstrideq] + movh m3, [src4q+sstrideq*2] + movh m4, [src4q+sstride3q] + add src4q, sstrideq + punpcklbw m1, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 +%if ARCH_X86_64 && mmsize > 8 + pmullw m1, m12 + pmullw m3, m13 + pmullw m4, m14 +%else + pmullw m1, [filteryq+ 80] + pmullw m3, [filteryq+ 96] + pmullw m4, [filteryq+112] +%endif + paddw m0, m1 + paddw m3, m4 + paddw m0, m6 + paddw m2, m3 + paddsw m0, m2 + psraw m0, 7 +%ifidn %1, avg + movh m1, [dstq] +%endif + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_MMX mmxext +filter_sse2_v_fn put +filter_sse2_v_fn avg + +INIT_XMM sse2 +filter_sse2_v_fn put +filter_sse2_v_fn avg + +%macro filter_v_fn 1 +%assign %%px mmsize/2 +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%define hd r4mp +%endif + mova m6, [pw_256] + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+32] + mova m9, [filteryq+64] + mova m10, [filteryq+96] +%endif +.loop: + ; FIXME maybe reuse loads from previous rows, or just more generally + ; unroll this to prevent multiple loads of the same data? + movh m0, [srcq] + movh m1, [srcq+sstrideq] + movh m2, [srcq+sstrideq*2] + movh m3, [srcq+sstride3q] + movh m4, [src4q] + movh m5, [src4q+sstrideq] + punpcklbw m0, m1 + punpcklbw m2, m3 + movh m1, [src4q+sstrideq*2] + movh m3, [src4q+sstride3q] + add srcq, sstrideq + add src4q, sstrideq + punpcklbw m4, m5 + punpcklbw m1, m3 + pmaddubsw m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddubsw m2, m8 + pmaddubsw m4, m9 + pmaddubsw m1, m10 +%else + pmaddubsw m2, [filteryq+32] + pmaddubsw m4, [filteryq+64] + pmaddubsw m1, [filteryq+96] +%endif + paddw m0, m4 + paddw m2, m1 + paddsw m0, m2 + pmulhrsw m0, m6 +%ifidn %1, avg + movh m1, [dstq] +%endif + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_MMX ssse3 +filter_v_fn put +filter_v_fn avg + +INIT_XMM ssse3 +filter_v_fn put +filter_v_fn avg + +%if ARCH_X86_64 + +%macro filter_vx2_fn 1 +%assign %%px mmsize +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3 + mova m13, [pw_256] + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m8, [filteryq+ 0] + mova m9, [filteryq+32] + mova m10, [filteryq+64] + mova m11, [filteryq+96] +.loop: + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? + movu m0, [srcq] + movu m1, [srcq+sstrideq] + movu m2, [srcq+sstrideq*2] + movu m3, [srcq+sstride3q] + movu m4, [src4q] + movu m5, [src4q+sstrideq] + movu m6, [src4q+sstrideq*2] + movu m7, [src4q+sstride3q] + add srcq, sstrideq + add src4q, sstrideq + SBUTTERFLY bw, 0, 1, 12 + SBUTTERFLY bw, 2, 3, 12 + SBUTTERFLY bw, 4, 5, 12 + SBUTTERFLY bw, 6, 7, 12 + pmaddubsw m0, m8 + pmaddubsw m1, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + paddsw m0, m2 + paddsw m1, m3 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 +%ifidn %1, avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +filter_vx2_fn put +filter_vx2_fn avg + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +filter_vx2_fn put +filter_vx2_fn avg +%endif + +%endif ; ARCH_X86_64 + +%macro fpel_fn 6-8 0, 4 +%if %2 == 4 +%define %%srcfn movh +%define %%dstfn movh +%else +%define %%srcfn movu +%define %%dstfn mova +%endif + +%if %7 == 8 +%define %%pavg pavgb +%define %%szsuf _8 +%elif %7 == 16 +%define %%pavg pavgw +%define %%szsuf _16 +%else +%define %%szsuf +%endif + +%if %2 <= mmsize +cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3 + lea sstride3q, [sstrideq*3] + lea dstride3q, [dstrideq*3] +%else +cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h +%endif +.loop: + %%srcfn m0, [srcq] + %%srcfn m1, [srcq+s%3] + %%srcfn m2, [srcq+s%4] + %%srcfn m3, [srcq+s%5] +%if %2/mmsize == 8 + %%srcfn m4, [srcq+mmsize*4] + %%srcfn m5, [srcq+mmsize*5] + %%srcfn m6, [srcq+mmsize*6] + %%srcfn m7, [srcq+mmsize*7] +%endif + lea srcq, [srcq+sstrideq*%6] +%ifidn %1, avg + %%pavg m0, [dstq] + %%pavg m1, [dstq+d%3] + %%pavg m2, [dstq+d%4] +%if %2 == 4 + %%srcfn m4, [dstq+d%5] + %%pavg m3, m4 +%else + %%pavg m3, [dstq+d%5] +%endif +%if %2/mmsize == 8 + %%pavg m4, [dstq+mmsize*4] + %%pavg m5, [dstq+mmsize*5] + %%pavg m6, [dstq+mmsize*6] + %%pavg m7, [dstq+mmsize*7] +%endif +%endif + %%dstfn [dstq], m0 + %%dstfn [dstq+d%3], m1 + %%dstfn [dstq+d%4], m2 + %%dstfn [dstq+d%5], m3 +%if %2/mmsize == 8 + %%dstfn [dstq+mmsize*4], m4 + %%dstfn [dstq+mmsize*5], m5 + %%dstfn [dstq+mmsize*6], m6 + %%dstfn [dstq+mmsize*7], m7 +%endif + lea dstq, [dstq+dstrideq*%6] + sub hd, %6 + jnz .loop + RET +%endmacro + +%define d16 16 +%define s16 16 +%define d32 32 +%define s32 32 +INIT_MMX mmx +fpel_fn put, 4, strideq, strideq*2, stride3q, 4 +fpel_fn put, 8, strideq, strideq*2, stride3q, 4 +INIT_MMX mmxext +fpel_fn avg, 4, strideq, strideq*2, stride3q, 4, 8 +fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 8 +INIT_XMM sse +fpel_fn put, 16, strideq, strideq*2, stride3q, 4 +fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2 +fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1 +fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 0, 8 +INIT_XMM sse2 +fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8 +fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 8 +fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 8 +INIT_YMM avx +fpel_fn put, 32, strideq, strideq*2, stride3q, 4 +fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2 +fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8 +fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 8 +%endif +INIT_MMX mmxext +fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 16 +INIT_XMM sse2 +fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 16 +fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 16 +fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 16 +fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16, 8 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 16 +fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 16 +fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16 +%endif +%undef s16 +%undef d16 +%undef s32 +%undef d32 diff --git a/media/ffvpx/libavcodec/x86/vp9mc_16bpp.asm b/media/ffvpx/libavcodec/x86/vp9mc_16bpp.asm new file mode 100644 index 0000000000..9a462eaf80 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9mc_16bpp.asm @@ -0,0 +1,431 @@ +;****************************************************************************** +;* VP9 MC SIMD optimizations +;* +;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pd_64: times 8 dd 64 + +cextern pw_1023 +cextern pw_4095 + +SECTION .text + +%macro filter_h4_fn 1-2 12 +cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery + mova m5, [pw_1023] +.body: +%if notcpuflag(sse4) && ARCH_X86_64 + pxor m11, m11 +%endif + mova m6, [pd_64] + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+32] + mova m9, [filteryq+64] + mova m10, [filteryq+96] +%endif +.loop: + movh m0, [srcq-6] + movh m1, [srcq-4] + movh m2, [srcq-2] + movh m3, [srcq+0] + movh m4, [srcq+2] + punpcklwd m0, m1 + punpcklwd m2, m3 + pmaddwd m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m2, m8 +%else + pmaddwd m2, [filteryq+32] +%endif + movu m1, [srcq+4] + movu m3, [srcq+6] + paddd m0, m2 + movu m2, [srcq+8] + add srcq, sstrideq + punpcklwd m4, m1 + punpcklwd m3, m2 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m4, m9 + pmaddwd m3, m10 +%else + pmaddwd m4, [filteryq+64] + pmaddwd m3, [filteryq+96] +%endif + paddd m0, m4 + paddd m0, m3 + paddd m0, m6 + psrad m0, 7 +%if cpuflag(sse4) + packusdw m0, m0 +%else + packssdw m0, m0 +%endif +%ifidn %1, avg + movh m1, [dstq] +%endif + pminsw m0, m5 +%if notcpuflag(sse4) +%if ARCH_X86_64 + pmaxsw m0, m11 +%else + pxor m2, m2 + pmaxsw m0, m2 +%endif +%endif +%ifidn %1, avg + pavgw m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET + +cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery + mova m5, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body +%endmacro + +INIT_XMM sse2 +filter_h4_fn put +filter_h4_fn avg + +%macro filter_h_fn 1-2 12 +%assign %%px mmsize/2 +cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery + mova m5, [pw_1023] +.body: +%if notcpuflag(sse4) && ARCH_X86_64 + pxor m11, m11 +%endif + mova m6, [pd_64] + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+32] + mova m9, [filteryq+64] + mova m10, [filteryq+96] +%endif +.loop: + movu m0, [srcq-6] + movu m1, [srcq-4] + movu m2, [srcq-2] + movu m3, [srcq+0] + movu m4, [srcq+2] + pmaddwd m0, m7 + pmaddwd m1, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m2, m8 + pmaddwd m3, m8 + pmaddwd m4, m9 +%else + pmaddwd m2, [filteryq+32] + pmaddwd m3, [filteryq+32] + pmaddwd m4, [filteryq+64] +%endif + paddd m0, m2 + paddd m1, m3 + paddd m0, m4 + movu m2, [srcq+4] + movu m3, [srcq+6] + movu m4, [srcq+8] + add srcq, sstrideq +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m2, m9 + pmaddwd m3, m10 + pmaddwd m4, m10 +%else + pmaddwd m2, [filteryq+64] + pmaddwd m3, [filteryq+96] + pmaddwd m4, [filteryq+96] +%endif + paddd m1, m2 + paddd m0, m3 + paddd m1, m4 + paddd m0, m6 + paddd m1, m6 + psrad m0, 7 + psrad m1, 7 +%if cpuflag(sse4) + packusdw m0, m0 + packusdw m1, m1 +%else + packssdw m0, m0 + packssdw m1, m1 +%endif + punpcklwd m0, m1 + pminsw m0, m5 +%if notcpuflag(sse4) +%if ARCH_X86_64 + pmaxsw m0, m11 +%else + pxor m2, m2 + pmaxsw m0, m2 +%endif +%endif +%ifidn %1, avg + pavgw m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET + +cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery + mova m5, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body +%endmacro + +INIT_XMM sse2 +filter_h_fn put +filter_h_fn avg +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +filter_h_fn put +filter_h_fn avg +%endif + +%macro filter_v4_fn 1-2 12 +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%define hd r4mp +%endif + mova m5, [pw_1023] +.body: +%if notcpuflag(sse4) && ARCH_X86_64 + pxor m11, m11 +%endif + mova m6, [pd_64] + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+ 32] + mova m9, [filteryq+ 64] + mova m10, [filteryq+ 96] +%endif +.loop: + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? + movh m0, [srcq] + movh m1, [srcq+sstrideq] + movh m2, [srcq+sstrideq*2] + movh m3, [srcq+sstride3q] + add srcq, sstrideq + movh m4, [src4q] + punpcklwd m0, m1 + punpcklwd m2, m3 + pmaddwd m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m2, m8 +%else + pmaddwd m2, [filteryq+ 32] +%endif + movh m1, [src4q+sstrideq] + movh m3, [src4q+sstrideq*2] + paddd m0, m2 + movh m2, [src4q+sstride3q] + add src4q, sstrideq + punpcklwd m4, m1 + punpcklwd m3, m2 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m4, m9 + pmaddwd m3, m10 +%else + pmaddwd m4, [filteryq+ 64] + pmaddwd m3, [filteryq+ 96] +%endif + paddd m0, m4 + paddd m0, m3 + paddd m0, m6 + psrad m0, 7 +%if cpuflag(sse4) + packusdw m0, m0 +%else + packssdw m0, m0 +%endif +%ifidn %1, avg + movh m1, [dstq] +%endif + pminsw m0, m5 +%if notcpuflag(sse4) +%if ARCH_X86_64 + pmaxsw m0, m11 +%else + pxor m2, m2 + pmaxsw m0, m2 +%endif +%endif +%ifidn %1, avg + pavgw m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET + +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%endif + mova m5, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body +%endmacro + +INIT_XMM sse2 +filter_v4_fn put +filter_v4_fn avg + +%macro filter_v_fn 1-2 13 +%assign %%px mmsize/2 +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%define hd r4mp +%endif + mova m5, [pw_1023] +.body: +%if notcpuflag(sse4) && ARCH_X86_64 + pxor m12, m12 +%endif +%if ARCH_X86_64 + mova m11, [pd_64] +%endif + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+ 32] + mova m9, [filteryq+ 64] + mova m10, [filteryq+ 96] +%endif +.loop: + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? + movu m0, [srcq] + movu m1, [srcq+sstrideq] + movu m2, [srcq+sstrideq*2] + movu m3, [srcq+sstride3q] + add srcq, sstrideq + movu m4, [src4q] + SBUTTERFLY wd, 0, 1, 6 + SBUTTERFLY wd, 2, 3, 6 + pmaddwd m0, m7 + pmaddwd m1, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m2, m8 + pmaddwd m3, m8 +%else + pmaddwd m2, [filteryq+ 32] + pmaddwd m3, [filteryq+ 32] +%endif + paddd m0, m2 + paddd m1, m3 + movu m2, [src4q+sstrideq] + movu m3, [src4q+sstrideq*2] + SBUTTERFLY wd, 4, 2, 6 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m4, m9 + pmaddwd m2, m9 +%else + pmaddwd m4, [filteryq+ 64] + pmaddwd m2, [filteryq+ 64] +%endif + paddd m0, m4 + paddd m1, m2 + movu m4, [src4q+sstride3q] + add src4q, sstrideq + SBUTTERFLY wd, 3, 4, 6 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m3, m10 + pmaddwd m4, m10 +%else + pmaddwd m3, [filteryq+ 96] + pmaddwd m4, [filteryq+ 96] +%endif + paddd m0, m3 + paddd m1, m4 +%if ARCH_X86_64 + paddd m0, m11 + paddd m1, m11 +%else + paddd m0, [pd_64] + paddd m1, [pd_64] +%endif + psrad m0, 7 + psrad m1, 7 +%if cpuflag(sse4) + packusdw m0, m1 +%else + packssdw m0, m1 +%endif + pminsw m0, m5 +%if notcpuflag(sse4) +%if ARCH_X86_64 + pmaxsw m0, m12 +%else + pxor m2, m2 + pmaxsw m0, m2 +%endif +%endif +%ifidn %1, avg + pavgw m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET + +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%endif + mova m5, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body +%endmacro + +INIT_XMM sse2 +filter_v_fn put +filter_v_fn avg +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +filter_v_fn put +filter_v_fn avg +%endif diff --git a/media/ffvpx/libavcodec/x86/vpx_arith.h b/media/ffvpx/libavcodec/x86/vpx_arith.h new file mode 100644 index 0000000000..d9e4c0dec4 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vpx_arith.h @@ -0,0 +1,55 @@ +/** + * VP5 and VP6 compatible video decoder (arith decoder) + * + * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org> + * Copyright (C) 2010 Eli Friedman + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_VPX_ARITH_H +#define AVCODEC_X86_VPX_ARITH_H + +#include "libavutil/x86/asm.h" + +#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS +#include "libavutil/attributes.h" + +#define vpx_rac_get_prob vpx_rac_get_prob +static av_always_inline int vpx_rac_get_prob(VPXRangeCoder *c, uint8_t prob) +{ + unsigned int code_word = vpx_rac_renorm(c); + unsigned int low = 1 + (((c->high - 1) * prob) >> 8); + unsigned int low_shift = low << 16; + int bit = 0; + c->code_word = code_word; + + __asm__( + "subl %4, %1 \n\t" + "subl %3, %2 \n\t" + "setae %b0 \n\t" + "cmovb %4, %1 \n\t" + "cmovb %5, %2 \n\t" + : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word) + : "r"(low_shift), "r"(low), "r"(code_word) + ); + + return bit; +} +#endif + +#endif /* AVCODEC_X86_VPX_ARITH_H */ |