From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Fri, 19 Apr 2024 02:47:55 +0200
Subject: Adding upstream version 124.0.1.

Signed-off-by: Daniel Baumann
---
 media/ffvpx/libavutil/x86/asm.h             | 154 +++
 media/ffvpx/libavutil/x86/bswap.h           | 88 ++
 media/ffvpx/libavutil/x86/cpu.c             | 284 ++++
 media/ffvpx/libavutil/x86/cpu.h             | 114 ++
 media/ffvpx/libavutil/x86/cpuid.asm         | 91 ++
 media/ffvpx/libavutil/x86/emms.asm          | 30 +
 media/ffvpx/libavutil/x86/emms.h            | 58 +
 media/ffvpx/libavutil/x86/fixed_dsp.asm     | 48 +
 media/ffvpx/libavutil/x86/fixed_dsp_init.c  | 35 +
 media/ffvpx/libavutil/x86/float_dsp.asm     | 588 ++++++++
 media/ffvpx/libavutil/x86/float_dsp_init.c  | 118 ++
 media/ffvpx/libavutil/x86/imgutils.asm      | 53 +
 media/ffvpx/libavutil/x86/imgutils_init.c   | 48 +
 media/ffvpx/libavutil/x86/intmath.h         | 169 +++
 media/ffvpx/libavutil/x86/intreadwrite.h    | 99 ++
 media/ffvpx/libavutil/x86/lls.asm           | 290 ++++
 media/ffvpx/libavutil/x86/lls_init.c        | 46 +
 media/ffvpx/libavutil/x86/moz.build         | 25 +
 media/ffvpx/libavutil/x86/pixelutils.asm    | 329 +++++
 media/ffvpx/libavutil/x86/pixelutils.h      | 26 +
 media/ffvpx/libavutil/x86/pixelutils_init.c | 85 ++
 media/ffvpx/libavutil/x86/timer.h           | 50 +
 media/ffvpx/libavutil/x86/tx_float.asm      | 1936 +++++++++++++++++++++++++++
 media/ffvpx/libavutil/x86/tx_float_init.c   | 310 +++++
 media/ffvpx/libavutil/x86/x86inc.asm        | 1726 ++++++++++++++++++++++++
 media/ffvpx/libavutil/x86/x86util.asm       | 1028 ++++++++++++++
 26 files changed, 7828 insertions(+)
 create mode 100644 media/ffvpx/libavutil/x86/asm.h
 create mode 100644 media/ffvpx/libavutil/x86/bswap.h
 create mode 100644 media/ffvpx/libavutil/x86/cpu.c
 create mode 100644 media/ffvpx/libavutil/x86/cpu.h
 create mode 100644 media/ffvpx/libavutil/x86/cpuid.asm
 create mode 100644 media/ffvpx/libavutil/x86/emms.asm
 create mode 100644 media/ffvpx/libavutil/x86/emms.h
 create mode 100644 media/ffvpx/libavutil/x86/fixed_dsp.asm
 create mode 100644 media/ffvpx/libavutil/x86/fixed_dsp_init.c
 create mode 100644 media/ffvpx/libavutil/x86/float_dsp.asm
 create mode 100644 media/ffvpx/libavutil/x86/float_dsp_init.c
 create mode 100644 media/ffvpx/libavutil/x86/imgutils.asm
 create mode 100644 media/ffvpx/libavutil/x86/imgutils_init.c
 create mode 100644 media/ffvpx/libavutil/x86/intmath.h
 create mode 100644 media/ffvpx/libavutil/x86/intreadwrite.h
 create mode 100644 media/ffvpx/libavutil/x86/lls.asm
 create mode 100644 media/ffvpx/libavutil/x86/lls_init.c
 create mode 100644 media/ffvpx/libavutil/x86/moz.build
 create mode 100644 media/ffvpx/libavutil/x86/pixelutils.asm
 create mode 100644 media/ffvpx/libavutil/x86/pixelutils.h
 create mode 100644 media/ffvpx/libavutil/x86/pixelutils_init.c
 create mode 100644 media/ffvpx/libavutil/x86/timer.h
 create mode 100644 media/ffvpx/libavutil/x86/tx_float.asm
 create mode 100644 media/ffvpx/libavutil/x86/tx_float_init.c
 create mode 100644 media/ffvpx/libavutil/x86/x86inc.asm
 create mode 100644 media/ffvpx/libavutil/x86/x86util.asm

diff --git a/media/ffvpx/libavutil/x86/asm.h b/media/ffvpx/libavutil/x86/asm.h
new file mode 100644
index 0000000000..9bff42d628
--- /dev/null
+++ b/media/ffvpx/libavutil/x86/asm.h
@@ -0,0 +1,154 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_ASM_H +#define AVUTIL_X86_ASM_H + +#include +#include "config.h" + +typedef struct xmm_reg { uint64_t a, b; } xmm_reg; +typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg; + +#if ARCH_X86_64 +# define FF_OPSIZE "q" +# define FF_REG_a "rax" +# define FF_REG_b "rbx" +# define FF_REG_c "rcx" +# define FF_REG_d "rdx" +# define FF_REG_D "rdi" +# define FF_REG_S "rsi" +# define FF_PTR_SIZE "8" +typedef int64_t x86_reg; + +/* FF_REG_SP is defined in Solaris sys headers, so use FF_REG_sp */ +# define FF_REG_sp "rsp" +# define FF_REG_BP "rbp" +# define FF_REGBP rbp +# define FF_REGa rax +# define FF_REGb rbx +# define FF_REGc rcx +# define FF_REGd rdx +# define FF_REGSP rsp + +#elif ARCH_X86_32 + +# define FF_OPSIZE "l" +# define FF_REG_a "eax" +# define FF_REG_b "ebx" +# define FF_REG_c "ecx" +# define FF_REG_d "edx" +# define FF_REG_D "edi" +# define FF_REG_S "esi" +# define FF_PTR_SIZE "4" +typedef int32_t x86_reg; + +# define FF_REG_sp "esp" +# define FF_REG_BP "ebp" +# define FF_REGBP ebp +# define FF_REGa eax +# define FF_REGb ebx +# define FF_REGc ecx +# define FF_REGd edx +# define FF_REGSP esp +#else +typedef int x86_reg; +#endif + +#define HAVE_7REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE && HAVE_EBP_AVAILABLE)) +#define HAVE_6REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE || HAVE_EBP_AVAILABLE)) + +#if ARCH_X86_64 && defined(PIC) +# define BROKEN_RELOCATIONS 1 +#endif + +/* + * If gcc is not set to support sse (-msse) it will not accept xmm registers + * in the clobber list for inline asm. XMM_CLOBBERS takes a list of xmm + * registers to be marked as clobbered and evaluates to nothing if they are + * not supported, or to the list itself if they are supported. Since a clobber + * list may not be empty, XMM_CLOBBERS_ONLY should be used if the xmm + * registers are the only in the clobber list. + * For example a list with "eax" and "xmm0" as clobbers should become: + * : XMM_CLOBBERS("xmm0",) "eax" + * and a list with only "xmm0" should become: + * XMM_CLOBBERS_ONLY("xmm0") + */ +#if HAVE_XMM_CLOBBERS +# define XMM_CLOBBERS(...) __VA_ARGS__ +# define XMM_CLOBBERS_ONLY(...) : __VA_ARGS__ +#else +# define XMM_CLOBBERS(...) +# define XMM_CLOBBERS_ONLY(...) +#endif + +/* Use to export labels from asm. */ +#define LABEL_MANGLE(a) EXTERN_PREFIX #a + +// Use rip-relative addressing if compiling PIC code on x86-64. +#if ARCH_X86_64 && defined(PIC) +# define LOCAL_MANGLE(a) #a "(%%rip)" +#else +# define LOCAL_MANGLE(a) #a +#endif + +#if HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS +# define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a) +# define NAMED_CONSTRAINTS_ADD(...) +# define NAMED_CONSTRAINTS(...) +# define NAMED_CONSTRAINTS_ARRAY_ADD(...) +# define NAMED_CONSTRAINTS_ARRAY(...) 
+#else + /* When direct symbol references are used in code passed to a compiler that does not support them + * then these references need to be converted to named asm constraints instead. + * Instead of returning a direct symbol MANGLE now returns a named constraint for that specific symbol. + * In order for this to work there must also be a corresponding entry in the asm-interface. To add this + * entry use the macro NAMED_CONSTRAINTS() and pass in a list of each symbol reference used in the + * corresponding block of code. (e.g. NAMED_CONSTRAINTS(var1,var2,var3) where var1 is the first symbol etc. ). + * If there are already existing constraints then use NAMED_CONSTRAINTS_ADD to add to the existing constraint list. + */ +# define MANGLE(a) "%["#a"]" + // Intel/MSVC does not correctly expand va-args so we need a rather ugly hack in order to get it to work +# define FE_0(P,X) P(X) +# define FE_1(P,X,X1) P(X), FE_0(P,X1) +# define FE_2(P,X,X1,X2) P(X), FE_1(P,X1,X2) +# define FE_3(P,X,X1,X2,X3) P(X), FE_2(P,X1,X2,X3) +# define FE_4(P,X,X1,X2,X3,X4) P(X), FE_3(P,X1,X2,X3,X4) +# define FE_5(P,X,X1,X2,X3,X4,X5) P(X), FE_4(P,X1,X2,X3,X4,X5) +# define FE_6(P,X,X1,X2,X3,X4,X5,X6) P(X), FE_5(P,X1,X2,X3,X4,X5,X6) +# define FE_7(P,X,X1,X2,X3,X4,X5,X6,X7) P(X), FE_6(P,X1,X2,X3,X4,X5,X6,X7) +# define FE_8(P,X,X1,X2,X3,X4,X5,X6,X7,X8) P(X), FE_7(P,X1,X2,X3,X4,X5,X6,X7,X8) +# define FE_9(P,X,X1,X2,X3,X4,X5,X6,X7,X8,X9) P(X), FE_8(P,X1,X2,X3,X4,X5,X6,X7,X8,X9) +# define GET_FE_IMPL(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME +# define GET_FE(A) GET_FE_IMPL A +# define GET_FE_GLUE(x, y) x y +# define FOR_EACH_VA(P,...) GET_FE_GLUE(GET_FE((__VA_ARGS__,FE_9,FE_8,FE_7,FE_6,FE_5,FE_4,FE_3,FE_2,FE_1,FE_0)), (P,__VA_ARGS__)) +# define NAME_CONSTRAINT(x) [x] "m"(x) + // Parameters are a list of each symbol reference required +# define NAMED_CONSTRAINTS_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__) + // Same but without comma for when there are no previously defined constraints +# define NAMED_CONSTRAINTS(...) FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__) + // Same as above NAMED_CONSTRAINTS except used for passing arrays/pointers instead of normal variables +# define NAME_CONSTRAINT_ARRAY(x) [x] "m"(*x) +# define NAMED_CONSTRAINTS_ARRAY_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__) +# define NAMED_CONSTRAINTS_ARRAY(...) FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__) +#endif + +#endif /* AVUTIL_X86_ASM_H */ diff --git a/media/ffvpx/libavutil/x86/bswap.h b/media/ffvpx/libavutil/x86/bswap.h new file mode 100644 index 0000000000..b2f18b6c93 --- /dev/null +++ b/media/ffvpx/libavutil/x86/bswap.h @@ -0,0 +1,88 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * byte swapping routines + */ + +#ifndef AVUTIL_X86_BSWAP_H +#define AVUTIL_X86_BSWAP_H + +#include +#if defined(_MSC_VER) +#include +#include +#endif +#include "config.h" +#include "libavutil/attributes.h" + +#if defined(_MSC_VER) + +#define av_bswap16 av_bswap16 +static av_always_inline av_const uint16_t av_bswap16(uint16_t x) +{ + return _rotr16(x, 8); +} + +#define av_bswap32 av_bswap32 +static av_always_inline av_const uint32_t av_bswap32(uint32_t x) +{ + return _byteswap_ulong(x); +} + +#if ARCH_X86_64 +#define av_bswap64 av_bswap64 +static inline uint64_t av_const av_bswap64(uint64_t x) +{ + return _byteswap_uint64(x); +} +#endif + + +#elif HAVE_INLINE_ASM + +#if AV_GCC_VERSION_AT_MOST(4,0) +#define av_bswap16 av_bswap16 +static av_always_inline av_const unsigned av_bswap16(unsigned x) +{ + __asm__("rorw $8, %w0" : "+r"(x)); + return x; +} +#endif /* AV_GCC_VERSION_AT_MOST(4,0) */ + +#if AV_GCC_VERSION_AT_MOST(4,4) || defined(__INTEL_COMPILER) +#define av_bswap32 av_bswap32 +static av_always_inline av_const uint32_t av_bswap32(uint32_t x) +{ + __asm__("bswap %0" : "+r" (x)); + return x; +} + +#if ARCH_X86_64 +#define av_bswap64 av_bswap64 +static inline uint64_t av_const av_bswap64(uint64_t x) +{ + __asm__("bswap %0": "=r" (x) : "0" (x)); + return x; +} +#endif +#endif /* AV_GCC_VERSION_AT_MOST(4,4) */ + +#endif /* HAVE_INLINE_ASM */ +#endif /* AVUTIL_X86_BSWAP_H */ diff --git a/media/ffvpx/libavutil/x86/cpu.c b/media/ffvpx/libavutil/x86/cpu.c new file mode 100644 index 0000000000..d6cd4fab9c --- /dev/null +++ b/media/ffvpx/libavutil/x86/cpu.c @@ -0,0 +1,284 @@ +/* + * CPU detection code, extracted from mmx.h + * (c)1997-99 by H. Dietz and R. Fisher + * Converted to C and improved by Fabrice Bellard. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include + +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavutil/cpu.h" +#include "libavutil/cpu_internal.h" + +#if HAVE_X86ASM + +#define cpuid(index, eax, ebx, ecx, edx) \ + ff_cpu_cpuid(index, &eax, &ebx, &ecx, &edx) + +#define xgetbv(index, eax, edx) \ + ff_cpu_xgetbv(index, &eax, &edx) + +#elif HAVE_INLINE_ASM + +/* ebx saving is necessary for PIC. 
gcc seems unable to see it alone */ +#define cpuid(index, eax, ebx, ecx, edx) \ + __asm__ volatile ( \ + "mov %%"FF_REG_b", %%"FF_REG_S" \n\t" \ + "cpuid \n\t" \ + "xchg %%"FF_REG_b", %%"FF_REG_S \ + : "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx) \ + : "0" (index), "2"(0)) + +#define xgetbv(index, eax, edx) \ + __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (index)) + +#define get_eflags(x) \ + __asm__ volatile ("pushfl \n" \ + "pop %0 \n" \ + : "=r"(x)) + +#define set_eflags(x) \ + __asm__ volatile ("push %0 \n" \ + "popfl \n" \ + :: "r"(x)) + +#endif /* HAVE_INLINE_ASM */ + +#if ARCH_X86_64 + +#define cpuid_test() 1 + +#elif HAVE_X86ASM + +#define cpuid_test ff_cpu_cpuid_test + +#elif HAVE_INLINE_ASM + +static int cpuid_test(void) +{ + x86_reg a, c; + + /* Check if CPUID is supported by attempting to toggle the ID bit in + * the EFLAGS register. */ + get_eflags(a); + set_eflags(a ^ 0x200000); + get_eflags(c); + + return a != c; +} +#endif + +/* Function to test if multimedia instructions are supported... */ +int ff_get_cpu_flags_x86(void) +{ + int rval = 0; + +#ifdef cpuid + + int eax, ebx, ecx, edx; + int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0; + int family = 0, model = 0; + union { int i[3]; char c[12]; } vendor; + int xcr0_lo = 0, xcr0_hi = 0; + + if (!cpuid_test()) + return 0; /* CPUID not supported */ + + cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]); + + if (max_std_level >= 1) { + cpuid(1, eax, ebx, ecx, std_caps); + family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); + if (std_caps & (1 << 15)) + rval |= AV_CPU_FLAG_CMOV; + if (std_caps & (1 << 23)) + rval |= AV_CPU_FLAG_MMX; + if (std_caps & (1 << 25)) + rval |= AV_CPU_FLAG_MMXEXT; +#if HAVE_SSE + if (std_caps & (1 << 25)) + rval |= AV_CPU_FLAG_SSE; + if (std_caps & (1 << 26)) + rval |= AV_CPU_FLAG_SSE2; + if (ecx & 1) + rval |= AV_CPU_FLAG_SSE3; + if (ecx & 0x00000200 ) + rval |= AV_CPU_FLAG_SSSE3; + if (ecx & 0x00080000 ) + rval |= AV_CPU_FLAG_SSE4; + if (ecx & 0x00100000 ) + rval |= AV_CPU_FLAG_SSE42; + if (ecx & 0x02000000 ) + rval |= AV_CPU_FLAG_AESNI; +#if HAVE_AVX + /* Check OXSAVE and AVX bits */ + if ((ecx & 0x18000000) == 0x18000000) { + /* Check for OS support */ + xgetbv(0, xcr0_lo, xcr0_hi); + if ((xcr0_lo & 0x6) == 0x6) { + rval |= AV_CPU_FLAG_AVX; + if (ecx & 0x00001000) + rval |= AV_CPU_FLAG_FMA3; + } + } +#endif /* HAVE_AVX */ +#endif /* HAVE_SSE */ + } + if (max_std_level >= 7) { + cpuid(7, eax, ebx, ecx, edx); +#if HAVE_AVX2 + if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) + rval |= AV_CPU_FLAG_AVX2; +#if HAVE_AVX512 /* F, CD, BW, DQ, VL */ + if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */ + if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000) { + rval |= AV_CPU_FLAG_AVX512; +#if HAVE_AVX512ICL + if ((ebx & 0xd0200000) == 0xd0200000 && (ecx & 0x5f42) == 0x5f42) + rval |= AV_CPU_FLAG_AVX512ICL; +#endif /* HAVE_AVX512ICL */ + } + } +#endif /* HAVE_AVX512 */ +#endif /* HAVE_AVX2 */ + /* BMI1/2 don't need OS support */ + if (ebx & 0x00000008) { + rval |= AV_CPU_FLAG_BMI1; + if (ebx & 0x00000100) + rval |= AV_CPU_FLAG_BMI2; + } + } + + cpuid(0x80000000, max_ext_level, ebx, ecx, edx); + + if (max_ext_level >= 0x80000001) { + cpuid(0x80000001, eax, ebx, ecx, ext_caps); + if (ext_caps & (1U << 31)) + rval |= AV_CPU_FLAG_3DNOW; + if (ext_caps & (1 << 30)) + rval |= AV_CPU_FLAG_3DNOWEXT; + if (ext_caps & (1 << 23)) + rval |= AV_CPU_FLAG_MMX; + if (ext_caps & (1 << 22)) + rval |= 
AV_CPU_FLAG_MMXEXT; + + if (!strncmp(vendor.c, "AuthenticAMD", 12)) { + /* Allow for selectively disabling SSE2 functions on AMD processors + with SSE2 support but not SSE4a. This includes Athlon64, some + Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster + than SSE2 often enough to utilize this special-case flag. + AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case + so that SSE2 is used unless explicitly disabled by checking + AV_CPU_FLAG_SSE2SLOW. */ + if (rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x00000040)) + rval |= AV_CPU_FLAG_SSE2SLOW; + + /* Similar to the above but for AVX functions on AMD processors. + This is necessary only for functions using YMM registers on Bulldozer + and Jaguar based CPUs as they lack 256-bit execution units. SSE/AVX + functions using XMM registers are always faster on them. + AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is + used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */ + if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX)) + rval |= AV_CPU_FLAG_AVXSLOW; + + /* Zen 3 and earlier have slow gather */ + if ((family <= 0x19) && (rval & AV_CPU_FLAG_AVX2)) + rval |= AV_CPU_FLAG_SLOW_GATHER; + } + + /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be + * used unless the OS has AVX support. */ + if (rval & AV_CPU_FLAG_AVX) { + if (ecx & 0x00000800) + rval |= AV_CPU_FLAG_XOP; + if (ecx & 0x00010000) + rval |= AV_CPU_FLAG_FMA4; + } + } + + if (!strncmp(vendor.c, "GenuineIntel", 12)) { + if (family == 6 && (model == 9 || model == 13 || model == 14)) { + /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and + * 6/14 (core1 "yonah") theoretically support sse2, but it's + * usually slower than mmx, so let's just pretend they don't. + * AV_CPU_FLAG_SSE2 is disabled and AV_CPU_FLAG_SSE2SLOW is + * enabled so that SSE2 is not used unless explicitly enabled + * by checking AV_CPU_FLAG_SSE2SLOW. The same situation + * applies for AV_CPU_FLAG_SSE3 and AV_CPU_FLAG_SSE3SLOW. */ + if (rval & AV_CPU_FLAG_SSE2) + rval ^= AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE2; + if (rval & AV_CPU_FLAG_SSE3) + rval ^= AV_CPU_FLAG_SSE3SLOW | AV_CPU_FLAG_SSE3; + } + /* The Atom processor has SSSE3 support, which is useful in many cases, + * but sometimes the SSSE3 version is slower than the SSE2 equivalent + * on the Atom, but is generally faster on other processors supporting + * SSSE3. This flag allows for selectively disabling certain SSSE3 + * functions on the Atom. */ + if (family == 6 && model == 28) + rval |= AV_CPU_FLAG_ATOM; + + /* Conroe has a slow shuffle unit. Check the model number to ensure not + * to include crippled low-end Penryns and Nehalems that lack SSE4. 
*/ + if ((rval & AV_CPU_FLAG_SSSE3) && !(rval & AV_CPU_FLAG_SSE4) && + family == 6 && model < 23) + rval |= AV_CPU_FLAG_SSSE3SLOW; + + /* Haswell has slow gather */ + if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 70) + rval |= AV_CPU_FLAG_SLOW_GATHER; + } + +#endif /* cpuid */ + + return rval; +} + +size_t ff_get_cpu_max_align_x86(void) +{ + int flags = av_get_cpu_flags(); + + if (flags & AV_CPU_FLAG_AVX512) + return 64; + if (flags & (AV_CPU_FLAG_AVX2 | + AV_CPU_FLAG_AVX | + AV_CPU_FLAG_XOP | + AV_CPU_FLAG_FMA4 | + AV_CPU_FLAG_FMA3 | + AV_CPU_FLAG_AVXSLOW)) + return 32; + if (flags & (AV_CPU_FLAG_AESNI | + AV_CPU_FLAG_SSE42 | + AV_CPU_FLAG_SSE4 | + AV_CPU_FLAG_SSSE3 | + AV_CPU_FLAG_SSE3 | + AV_CPU_FLAG_SSE2 | + AV_CPU_FLAG_SSE | + AV_CPU_FLAG_ATOM | + AV_CPU_FLAG_SSSE3SLOW | + AV_CPU_FLAG_SSE3SLOW | + AV_CPU_FLAG_SSE2SLOW)) + return 16; + + return 8; +} diff --git a/media/ffvpx/libavutil/x86/cpu.h b/media/ffvpx/libavutil/x86/cpu.h new file mode 100644 index 0000000000..40a1eef0ab --- /dev/null +++ b/media/ffvpx/libavutil/x86/cpu.h @@ -0,0 +1,114 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_CPU_H +#define AVUTIL_X86_CPU_H + +#include "libavutil/cpu.h" +#include "libavutil/cpu_internal.h" + +#define AV_CPU_FLAG_AMD3DNOW AV_CPU_FLAG_3DNOW +#define AV_CPU_FLAG_AMD3DNOWEXT AV_CPU_FLAG_3DNOWEXT + +#define X86_AMD3DNOW(flags) CPUEXT(flags, AMD3DNOW) +#define X86_AMD3DNOWEXT(flags) CPUEXT(flags, AMD3DNOWEXT) +#define X86_MMX(flags) CPUEXT(flags, MMX) +#define X86_MMXEXT(flags) CPUEXT(flags, MMXEXT) +#define X86_SSE(flags) CPUEXT(flags, SSE) +#define X86_SSE2(flags) CPUEXT(flags, SSE2) +#define X86_SSE2_FAST(flags) CPUEXT_FAST(flags, SSE2) +#define X86_SSE2_SLOW(flags) CPUEXT_SLOW(flags, SSE2) +#define X86_SSE3(flags) CPUEXT(flags, SSE3) +#define X86_SSE3_FAST(flags) CPUEXT_FAST(flags, SSE3) +#define X86_SSE3_SLOW(flags) CPUEXT_SLOW(flags, SSE3) +#define X86_SSSE3(flags) CPUEXT(flags, SSSE3) +#define X86_SSSE3_FAST(flags) CPUEXT_FAST(flags, SSSE3) +#define X86_SSSE3_SLOW(flags) CPUEXT_SLOW(flags, SSSE3) +#define X86_SSE4(flags) CPUEXT(flags, SSE4) +#define X86_SSE42(flags) CPUEXT(flags, SSE42) +#define X86_AVX(flags) CPUEXT(flags, AVX) +#define X86_AVX_FAST(flags) CPUEXT_FAST(flags, AVX) +#define X86_AVX_SLOW(flags) CPUEXT_SLOW(flags, AVX) +#define X86_XOP(flags) CPUEXT(flags, XOP) +#define X86_FMA3(flags) CPUEXT(flags, FMA3) +#define X86_FMA4(flags) CPUEXT(flags, FMA4) +#define X86_AVX2(flags) CPUEXT(flags, AVX2) +#define X86_AESNI(flags) CPUEXT(flags, AESNI) +#define X86_AVX512(flags) CPUEXT(flags, AVX512) + +#define EXTERNAL_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOW) +#define EXTERNAL_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOWEXT) +#define EXTERNAL_MMX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, MMX) +#define 
EXTERNAL_MMXEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, MMXEXT) +#define EXTERNAL_SSE(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE) +#define EXTERNAL_SSE2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE2) +#define EXTERNAL_SSE2_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSE2) +#define EXTERNAL_SSE2_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSE2) +#define EXTERNAL_SSE3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE3) +#define EXTERNAL_SSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSE3) +#define EXTERNAL_SSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSE3) +#define EXTERNAL_SSSE3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSSE3) +#define EXTERNAL_SSSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSSE3) +#define EXTERNAL_SSSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSSE3) +#define EXTERNAL_SSE4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE4) +#define EXTERNAL_SSE42(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE42) +#define EXTERNAL_AVX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX) +#define EXTERNAL_AVX_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, AVX) +#define EXTERNAL_AVX_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, AVX) +#define EXTERNAL_XOP(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, XOP) +#define EXTERNAL_FMA3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA3) +#define EXTERNAL_FMA3_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, FMA3, AVX) +#define EXTERNAL_FMA3_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, FMA3, AVX) +#define EXTERNAL_FMA4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA4) +#define EXTERNAL_AVX2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2) +#define EXTERNAL_AVX2_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, AVX2, AVX) +#define EXTERNAL_AVX2_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, AVX2, AVX) +#define EXTERNAL_AESNI(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI) +#define EXTERNAL_AVX512(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512) +#define EXTERNAL_AVX512ICL(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512ICL) + +#define INLINE_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOW) +#define INLINE_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOWEXT) +#define INLINE_MMX(flags) CPUEXT_SUFFIX(flags, _INLINE, MMX) +#define INLINE_MMXEXT(flags) CPUEXT_SUFFIX(flags, _INLINE, MMXEXT) +#define INLINE_SSE(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE) +#define INLINE_SSE2(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE2) +#define INLINE_SSE2_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSE2) +#define INLINE_SSE2_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSE2) +#define INLINE_SSE3(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE3) +#define INLINE_SSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSE3) +#define INLINE_SSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSE3) +#define INLINE_SSSE3(flags) CPUEXT_SUFFIX(flags, _INLINE, SSSE3) +#define INLINE_SSSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSSE3) +#define INLINE_SSSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSSE3) +#define INLINE_SSE4(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE4) +#define INLINE_SSE42(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE42) +#define INLINE_AVX(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX) +#define INLINE_AVX_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, AVX) +#define INLINE_AVX_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, AVX) +#define INLINE_XOP(flags) CPUEXT_SUFFIX(flags, _INLINE, XOP) +#define INLINE_FMA3(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA3) +#define INLINE_FMA4(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA4) +#define INLINE_AVX2(flags) CPUEXT_SUFFIX(flags, 
_INLINE, AVX2) +#define INLINE_AESNI(flags) CPUEXT_SUFFIX(flags, _INLINE, AESNI) + +void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx); +void ff_cpu_xgetbv(int op, int *eax, int *edx); +int ff_cpu_cpuid_test(void); + +#endif /* AVUTIL_X86_CPU_H */ diff --git a/media/ffvpx/libavutil/x86/cpuid.asm b/media/ffvpx/libavutil/x86/cpuid.asm new file mode 100644 index 0000000000..766f77fcdf --- /dev/null +++ b/media/ffvpx/libavutil/x86/cpuid.asm @@ -0,0 +1,91 @@ +;***************************************************************************** +;* Copyright (C) 2005-2010 x264 project +;* +;* Authors: Loren Merritt +;* Fiona Glaser +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +;----------------------------------------------------------------------------- +; void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx) +;----------------------------------------------------------------------------- +cglobal cpu_cpuid, 5,7 + push rbx + push r4 + push r3 + push r2 + push r1 + mov eax, r0d + xor ecx, ecx + cpuid + pop r4 + mov [r4], eax + pop r4 + mov [r4], ebx + pop r4 + mov [r4], ecx + pop r4 + mov [r4], edx + pop rbx + RET + +;----------------------------------------------------------------------------- +; void ff_cpu_xgetbv(int op, int *eax, int *edx) +;----------------------------------------------------------------------------- +cglobal cpu_xgetbv, 3,7 + push r2 + push r1 + mov ecx, r0d + xgetbv + pop r4 + mov [r4], eax + pop r4 + mov [r4], edx + RET + +%if ARCH_X86_64 == 0 +;----------------------------------------------------------------------------- +; int ff_cpu_cpuid_test(void) +; return 0 if unsupported +;----------------------------------------------------------------------------- +cglobal cpu_cpuid_test + pushfd + push ebx + push ebp + push esi + push edi + pushfd + pop eax + mov ebx, eax + xor eax, 0x200000 + push eax + popfd + pushfd + pop eax + xor eax, ebx + pop edi + pop esi + pop ebp + pop ebx + popfd + ret +%endif diff --git a/media/ffvpx/libavutil/x86/emms.asm b/media/ffvpx/libavutil/x86/emms.asm new file mode 100644 index 0000000000..df84f2221b --- /dev/null +++ b/media/ffvpx/libavutil/x86/emms.asm @@ -0,0 +1,30 @@ +;***************************************************************************** +;* Copyright (C) 2013 Martin Storsjo +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. 
+;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +;----------------------------------------------------------------------------- +; void avpriv_emms_asm(void) +;----------------------------------------------------------------------------- +cvisible emms_asm, 0, 0 + emms + RET diff --git a/media/ffvpx/libavutil/x86/emms.h b/media/ffvpx/libavutil/x86/emms.h new file mode 100644 index 0000000000..8ceec110cf --- /dev/null +++ b/media/ffvpx/libavutil/x86/emms.h @@ -0,0 +1,58 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_EMMS_H +#define AVUTIL_X86_EMMS_H + +#include "config.h" +#include "libavutil/attributes.h" + +void avpriv_emms_asm(void); + +#if HAVE_MMX_INLINE +#ifndef __MMX__ +#include "libavutil/cpu.h" +#endif + +# define emms_c emms_c +/** + * Empty mmx state. + * this must be called between any dsp function and float/double code. + * for example sin(); dsp->idct_put(); emms_c(); cos() + * Note, *alloc() and *free() also use float code in some libc implementations + * thus this also applies to them or any function using them. + */ +static av_always_inline void emms_c(void) +{ +/* Some inlined functions may also use mmx instructions regardless of + * runtime cpuflags. With that in mind, we unconditionally empty the + * mmx state if the target cpu chosen at configure time supports it. + */ +#if !defined(__MMX__) + if(av_get_cpu_flags() & AV_CPU_FLAG_MMX) +#endif + __asm__ volatile ("emms" ::: "memory"); +} +#elif HAVE_MMX && HAVE_MM_EMPTY +# include +# define emms_c _mm_empty +#elif HAVE_MMX_EXTERNAL +# define emms_c avpriv_emms_asm +#endif /* HAVE_MMX_INLINE */ + +#endif /* AVUTIL_X86_EMMS_H */ diff --git a/media/ffvpx/libavutil/x86/fixed_dsp.asm b/media/ffvpx/libavutil/x86/fixed_dsp.asm new file mode 100644 index 0000000000..2f411850f4 --- /dev/null +++ b/media/ffvpx/libavutil/x86/fixed_dsp.asm @@ -0,0 +1,48 @@ +;***************************************************************************** +;* x86-optimized Float DSP functions +;* +;* Copyright 2016 James Almer +;* +;* This file is part of FFmpeg. 
+;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +;----------------------------------------------------------------------------- +; void ff_butterflies_fixed(float *src0, float *src1, int len); +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal butterflies_fixed, 3,3,3, src0, src1, len + shl lend, 2 + add src0q, lenq + add src1q, lenq + neg lenq + +align 16 +.loop: + mova m0, [src0q + lenq] + mova m1, [src1q + lenq] + mova m2, m0 + paddd m0, m1 + psubd m2, m1 + mova [src0q + lenq], m0 + mova [src1q + lenq], m2 + add lenq, mmsize + jl .loop + RET diff --git a/media/ffvpx/libavutil/x86/fixed_dsp_init.c b/media/ffvpx/libavutil/x86/fixed_dsp_init.c new file mode 100644 index 0000000000..d3f4b2e325 --- /dev/null +++ b/media/ffvpx/libavutil/x86/fixed_dsp_init.c @@ -0,0 +1,35 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/fixed_dsp.h" +#include "cpu.h" + +void ff_butterflies_fixed_sse2(int *av_restrict src0, int *av_restrict src1, int len); + +av_cold void ff_fixed_dsp_init_x86(AVFixedDSPContext *fdsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) { + fdsp->butterflies_fixed = ff_butterflies_fixed_sse2; + } +} diff --git a/media/ffvpx/libavutil/x86/float_dsp.asm b/media/ffvpx/libavutil/x86/float_dsp.asm new file mode 100644 index 0000000000..e84ba52566 --- /dev/null +++ b/media/ffvpx/libavutil/x86/float_dsp.asm @@ -0,0 +1,588 @@ +;***************************************************************************** +;* x86-optimized Float DSP functions +;* +;* Copyright 2006 Loren Merritt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. 
+;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 +pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0 + +SECTION .text + +;----------------------------------------------------------------------------- +; void vector_fmul(float *dst, const float *src0, const float *src1, int len) +;----------------------------------------------------------------------------- +%macro VECTOR_FMUL 0 +cglobal vector_fmul, 4,4,2, dst, src0, src1, len + lea lenq, [lend*4 - 64] +ALIGN 16 +.loop: +%assign a 0 +%rep 32/mmsize + mova m0, [src0q + lenq + (a+0)*mmsize] + mova m1, [src0q + lenq + (a+1)*mmsize] + mulps m0, m0, [src1q + lenq + (a+0)*mmsize] + mulps m1, m1, [src1q + lenq + (a+1)*mmsize] + mova [dstq + lenq + (a+0)*mmsize], m0 + mova [dstq + lenq + (a+1)*mmsize], m1 +%assign a a+2 +%endrep + + sub lenq, 64 + jge .loop + RET +%endmacro + +INIT_XMM sse +VECTOR_FMUL +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_FMUL +%endif + +;----------------------------------------------------------------------------- +; void vector_dmul(double *dst, const double *src0, const double *src1, int len) +;----------------------------------------------------------------------------- +%macro VECTOR_DMUL 0 +cglobal vector_dmul, 4,4,4, dst, src0, src1, len + lea lend, [lenq*8 - mmsize*4] +ALIGN 16 +.loop: + movaps m0, [src0q + lenq + 0*mmsize] + movaps m1, [src0q + lenq + 1*mmsize] + movaps m2, [src0q + lenq + 2*mmsize] + movaps m3, [src0q + lenq + 3*mmsize] + mulpd m0, m0, [src1q + lenq + 0*mmsize] + mulpd m1, m1, [src1q + lenq + 1*mmsize] + mulpd m2, m2, [src1q + lenq + 2*mmsize] + mulpd m3, m3, [src1q + lenq + 3*mmsize] + movaps [dstq + lenq + 0*mmsize], m0 + movaps [dstq + lenq + 1*mmsize], m1 + movaps [dstq + lenq + 2*mmsize], m2 + movaps [dstq + lenq + 3*mmsize], m3 + + sub lenq, mmsize*4 + jge .loop + RET +%endmacro + +INIT_XMM sse2 +VECTOR_DMUL +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_DMUL +%endif + +;------------------------------------------------------------------------------ +; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len) +;------------------------------------------------------------------------------ + +%macro VECTOR_FMAC_SCALAR 0 +%if UNIX64 +cglobal vector_fmac_scalar, 3,3,5, dst, src, len +%else +cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len +%endif +%if ARCH_X86_32 + VBROADCASTSS m0, mulm +%else +%if WIN64 + SWAP 0, 2 +%endif + shufps xm0, xm0, 0 +%if cpuflag(avx) + vinsertf128 m0, m0, xm0, 1 +%endif +%endif + lea lenq, [lend*4-64] +.loop: +%if cpuflag(fma3) + mova m1, [dstq+lenq] + mova m2, [dstq+lenq+1*mmsize] + fmaddps m1, m0, [srcq+lenq], m1 + fmaddps m2, m0, [srcq+lenq+1*mmsize], m2 +%else ; cpuflag + mulps m1, m0, [srcq+lenq] + mulps m2, m0, [srcq+lenq+1*mmsize] +%if mmsize < 32 + mulps m3, m0, [srcq+lenq+2*mmsize] + mulps m4, m0, [srcq+lenq+3*mmsize] +%endif ; mmsize + addps m1, m1, [dstq+lenq] + addps m2, m2, [dstq+lenq+1*mmsize] +%if mmsize < 32 + addps m3, m3, [dstq+lenq+2*mmsize] + addps m4, m4, [dstq+lenq+3*mmsize] 
+%endif ; mmsize +%endif ; cpuflag + mova [dstq+lenq], m1 + mova [dstq+lenq+1*mmsize], m2 +%if mmsize < 32 + mova [dstq+lenq+2*mmsize], m3 + mova [dstq+lenq+3*mmsize], m4 +%endif ; mmsize + sub lenq, 64 + jge .loop + RET +%endmacro + +INIT_XMM sse +VECTOR_FMAC_SCALAR +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_FMAC_SCALAR +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_FMAC_SCALAR +%endif + +;------------------------------------------------------------------------------ +; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len) +;------------------------------------------------------------------------------ + +%macro VECTOR_FMUL_SCALAR 0 +%if UNIX64 +cglobal vector_fmul_scalar, 3,3,2, dst, src, len +%else +cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len +%endif +%if ARCH_X86_32 + movss m0, mulm +%elif WIN64 + SWAP 0, 2 +%endif + shufps m0, m0, 0 + lea lenq, [lend*4-mmsize] +.loop: + mova m1, [srcq+lenq] + mulps m1, m0 + mova [dstq+lenq], m1 + sub lenq, mmsize + jge .loop + RET +%endmacro + +INIT_XMM sse +VECTOR_FMUL_SCALAR + +;------------------------------------------------------------------------------ +; void ff_vector_dmac_scalar(double *dst, const double *src, double mul, +; int len) +;------------------------------------------------------------------------------ + +%macro VECTOR_DMAC_SCALAR 0 +%if ARCH_X86_32 +cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr + mov lenq, lenaddrm + VBROADCASTSD m0, mulm +%else +%if UNIX64 +cglobal vector_dmac_scalar, 3,3,5, dst, src, len +%else +cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len + SWAP 0, 2 +%endif + movlhps xm0, xm0 +%if cpuflag(avx) + vinsertf128 m0, m0, xm0, 1 +%endif +%endif + lea lenq, [lend*8-mmsize*4] +.loop: +%if cpuflag(fma3) + movaps m1, [dstq+lenq] + movaps m2, [dstq+lenq+1*mmsize] + movaps m3, [dstq+lenq+2*mmsize] + movaps m4, [dstq+lenq+3*mmsize] + fmaddpd m1, m0, [srcq+lenq], m1 + fmaddpd m2, m0, [srcq+lenq+1*mmsize], m2 + fmaddpd m3, m0, [srcq+lenq+2*mmsize], m3 + fmaddpd m4, m0, [srcq+lenq+3*mmsize], m4 +%else ; cpuflag + mulpd m1, m0, [srcq+lenq] + mulpd m2, m0, [srcq+lenq+1*mmsize] + mulpd m3, m0, [srcq+lenq+2*mmsize] + mulpd m4, m0, [srcq+lenq+3*mmsize] + addpd m1, m1, [dstq+lenq] + addpd m2, m2, [dstq+lenq+1*mmsize] + addpd m3, m3, [dstq+lenq+2*mmsize] + addpd m4, m4, [dstq+lenq+3*mmsize] +%endif ; cpuflag + movaps [dstq+lenq], m1 + movaps [dstq+lenq+1*mmsize], m2 + movaps [dstq+lenq+2*mmsize], m3 + movaps [dstq+lenq+3*mmsize], m4 + sub lenq, mmsize*4 + jge .loop + RET +%endmacro + +INIT_XMM sse2 +VECTOR_DMAC_SCALAR +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_DMAC_SCALAR +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_DMAC_SCALAR +%endif + +;------------------------------------------------------------------------------ +; void ff_vector_dmul_scalar(double *dst, const double *src, double mul, +; int len) +;------------------------------------------------------------------------------ + +%macro VECTOR_DMUL_SCALAR 0 +%if ARCH_X86_32 +cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr + mov lenq, lenaddrm +%elif UNIX64 +cglobal vector_dmul_scalar, 3,3,3, dst, src, len +%else +cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len +%endif +%if ARCH_X86_32 + VBROADCASTSD m0, mulm +%else +%if WIN64 + SWAP 0, 2 +%endif + movlhps xm0, xm0 +%if cpuflag(avx) + vinsertf128 ym0, ym0, xm0, 1 +%endif +%endif + lea lenq, [lend*8-2*mmsize] +.loop: + mulpd m1, m0, [srcq+lenq ] + mulpd m2, m0, [srcq+lenq+mmsize] + movaps [dstq+lenq ], m1 + movaps 
[dstq+lenq+mmsize], m2 + sub lenq, 2*mmsize + jge .loop + RET +%endmacro + +INIT_XMM sse2 +VECTOR_DMUL_SCALAR +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_DMUL_SCALAR +%endif + +;----------------------------------------------------------------------------- +; vector_fmul_window(float *dst, const float *src0, +; const float *src1, const float *win, int len); +;----------------------------------------------------------------------------- +INIT_XMM sse +cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1 + shl lend, 2 + lea len1q, [lenq - mmsize] + add src0q, lenq + add dstq, lenq + add winq, lenq + neg lenq +.loop: + mova m0, [winq + lenq] + mova m4, [src0q + lenq] + mova m1, [winq + len1q] + mova m5, [src1q + len1q] + shufps m1, m1, 0x1b + shufps m5, m5, 0x1b + mova m2, m0 + mova m3, m1 + mulps m2, m4 + mulps m3, m5 + mulps m1, m4 + mulps m0, m5 + addps m2, m3 + subps m1, m0 + shufps m2, m2, 0x1b + mova [dstq + lenq], m1 + mova [dstq + len1q], m2 + sub len1q, mmsize + add lenq, mmsize + jl .loop + RET + +;----------------------------------------------------------------------------- +; vector_fmul_add(float *dst, const float *src0, const float *src1, +; const float *src2, int len) +;----------------------------------------------------------------------------- +%macro VECTOR_FMUL_ADD 0 +cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len + lea lenq, [lend*4 - 2*mmsize] +ALIGN 16 +.loop: + mova m0, [src0q + lenq] + mova m1, [src0q + lenq + mmsize] +%if cpuflag(fma3) + mova m2, [src2q + lenq] + mova m3, [src2q + lenq + mmsize] + fmaddps m0, m0, [src1q + lenq], m2 + fmaddps m1, m1, [src1q + lenq + mmsize], m3 +%else + mulps m0, m0, [src1q + lenq] + mulps m1, m1, [src1q + lenq + mmsize] + addps m0, m0, [src2q + lenq] + addps m1, m1, [src2q + lenq + mmsize] +%endif + mova [dstq + lenq], m0 + mova [dstq + lenq + mmsize], m1 + + sub lenq, 2*mmsize + jge .loop + RET +%endmacro + +INIT_XMM sse +VECTOR_FMUL_ADD +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_FMUL_ADD +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_FMUL_ADD +%endif + +;----------------------------------------------------------------------------- +; void vector_fmul_reverse(float *dst, const float *src0, const float *src1, +; int len) +;----------------------------------------------------------------------------- +%macro VECTOR_FMUL_REVERSE 0 +cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len +%if cpuflag(avx2) + movaps m2, [pd_reverse] +%endif + lea lenq, [lend*4 - 2*mmsize] +ALIGN 16 +.loop: +%if cpuflag(avx2) + vpermps m0, m2, [src1q] + vpermps m1, m2, [src1q+mmsize] +%elif cpuflag(avx) + vmovaps xmm0, [src1q + 16] + vinsertf128 m0, m0, [src1q], 1 + vshufps m0, m0, m0, q0123 + vmovaps xmm1, [src1q + mmsize + 16] + vinsertf128 m1, m1, [src1q + mmsize], 1 + vshufps m1, m1, m1, q0123 +%else + mova m0, [src1q] + mova m1, [src1q + mmsize] + shufps m0, m0, q0123 + shufps m1, m1, q0123 +%endif + mulps m0, m0, [src0q + lenq + mmsize] + mulps m1, m1, [src0q + lenq] + movaps [dstq + lenq + mmsize], m0 + movaps [dstq + lenq], m1 + add src1q, 2*mmsize + sub lenq, 2*mmsize + jge .loop + RET +%endmacro + +INIT_XMM sse +VECTOR_FMUL_REVERSE +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_FMUL_REVERSE +%endif +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +VECTOR_FMUL_REVERSE +%endif + +; float scalarproduct_float_sse(const float *v1, const float *v2, int len) +INIT_XMM sse +cglobal scalarproduct_float, 3,3,2, v1, v2, offset + shl offsetd, 2 + add v1q, offsetq + add v2q, offsetq + neg offsetq + xorps xmm0, xmm0 +.loop: + movaps 
xmm1, [v1q+offsetq] + mulps xmm1, [v2q+offsetq] + addps xmm0, xmm1 + add offsetq, 16 + js .loop + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movss xmm1, xmm0 + shufps xmm0, xmm0, 1 + addss xmm0, xmm1 +%if ARCH_X86_64 == 0 + movss r0m, xmm0 + fld dword r0m +%endif + RET + +INIT_YMM fma3 +cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset + xor offsetq, offsetq + xorps m0, m0, m0 + shl sized, 2 + mov lenq, sizeq + cmp lenq, 32 + jl .l16 + cmp lenq, 64 + jl .l32 + xorps m1, m1, m1 + cmp lenq, 128 + jl .l64 + and lenq, ~127 + xorps m2, m2, m2 + xorps m3, m3, m3 +.loop128: + movups m4, [v1q+offsetq] + movups m5, [v1q+offsetq + 32] + movups m6, [v1q+offsetq + 64] + movups m7, [v1q+offsetq + 96] + fmaddps m0, m4, [v2q+offsetq ], m0 + fmaddps m1, m5, [v2q+offsetq + 32], m1 + fmaddps m2, m6, [v2q+offsetq + 64], m2 + fmaddps m3, m7, [v2q+offsetq + 96], m3 + add offsetq, 128 + cmp offsetq, lenq + jl .loop128 + addps m0, m0, m2 + addps m1, m1, m3 + mov lenq, sizeq + and lenq, 127 + cmp lenq, 64 + jge .l64 + addps m0, m0, m1 + cmp lenq, 32 + jge .l32 + vextractf128 xmm2, m0, 1 + addps xmm0, xmm2 + cmp lenq, 16 + jge .l16 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movss xmm1, xmm0 + shufps xmm0, xmm0, 1 + addss xmm0, xmm1 +%if ARCH_X86_64 == 0 + movss r0m, xm0 + fld dword r0m +%endif + RET +.l64: + and lenq, ~63 + add lenq, offsetq +.loop64: + movups m4, [v1q+offsetq] + movups m5, [v1q+offsetq + 32] + fmaddps m0, m4, [v2q+offsetq], m0 + fmaddps m1, m5, [v2q+offsetq + 32], m1 + add offsetq, 64 + cmp offsetq, lenq + jl .loop64 + addps m0, m0, m1 + mov lenq, sizeq + and lenq, 63 + cmp lenq, 32 + jge .l32 + vextractf128 xmm2, m0, 1 + addps xmm0, xmm2 + cmp lenq, 16 + jge .l16 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movss xmm1, xmm0 + shufps xmm0, xmm0, 1 + addss xmm0, xmm1 +%if ARCH_X86_64 == 0 + movss r0m, xm0 + fld dword r0m +%endif + RET +.l32: + and lenq, ~31 + add lenq, offsetq +.loop32: + movups m4, [v1q+offsetq] + fmaddps m0, m4, [v2q+offsetq], m0 + add offsetq, 32 + cmp offsetq, lenq + jl .loop32 + vextractf128 xmm2, m0, 1 + addps xmm0, xmm2 + mov lenq, sizeq + and lenq, 31 + cmp lenq, 16 + jge .l16 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movss xmm1, xmm0 + shufps xmm0, xmm0, 1 + addss xmm0, xmm1 +%if ARCH_X86_64 == 0 + movss r0m, xm0 + fld dword r0m +%endif + RET +.l16: + and lenq, ~15 + add lenq, offsetq +.loop16: + movaps xmm1, [v1q+offsetq] + mulps xmm1, [v2q+offsetq] + addps xmm0, xmm1 + add offsetq, 16 + cmp offsetq, lenq + jl .loop16 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movss xmm1, xmm0 + shufps xmm0, xmm0, 1 + addss xmm0, xmm1 +%if ARCH_X86_64 == 0 + movss r0m, xm0 + fld dword r0m +%endif + RET + +;----------------------------------------------------------------------------- +; void ff_butterflies_float(float *src0, float *src1, int len); +;----------------------------------------------------------------------------- +INIT_XMM sse +cglobal butterflies_float, 3,3,3, src0, src1, len + shl lend, 2 + add src0q, lenq + add src1q, lenq + neg lenq +.loop: + mova m0, [src0q + lenq] + mova m1, [src1q + lenq] + subps m2, m0, m1 + addps m0, m0, m1 + mova [src1q + lenq], m2 + mova [src0q + lenq], m0 + add lenq, mmsize + jl .loop + RET diff --git a/media/ffvpx/libavutil/x86/float_dsp_init.c b/media/ffvpx/libavutil/x86/float_dsp_init.c new file mode 100644 index 0000000000..ad6b506259 --- /dev/null +++ b/media/ffvpx/libavutil/x86/float_dsp_init.c @@ -0,0 +1,118 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/float_dsp.h" +#include "cpu.h" +#include "asm.h" + +void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1, + int len); +void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1, + int len); + +void ff_vector_dmul_sse2(double *dst, const double *src0, const double *src1, + int len); +void ff_vector_dmul_avx(double *dst, const double *src0, const double *src1, + int len); + +void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul, + int len); +void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul, + int len); +void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul, + int len); + +void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul, + int len); + +void ff_vector_dmac_scalar_sse2(double *dst, const double *src, double mul, + int len); +void ff_vector_dmac_scalar_avx(double *dst, const double *src, double mul, + int len); +void ff_vector_dmac_scalar_fma3(double *dst, const double *src, double mul, + int len); + +void ff_vector_dmul_scalar_sse2(double *dst, const double *src, + double mul, int len); +void ff_vector_dmul_scalar_avx(double *dst, const double *src, + double mul, int len); + +void ff_vector_fmul_window_sse(float *dst, const float *src0, + const float *src1, const float *win, int len); + +void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1, + const float *src2, int len); +void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1, + const float *src2, int len); +void ff_vector_fmul_add_fma3(float *dst, const float *src0, const float *src1, + const float *src2, int len); + +void ff_vector_fmul_reverse_sse(float *dst, const float *src0, + const float *src1, int len); +void ff_vector_fmul_reverse_avx(float *dst, const float *src0, + const float *src1, int len); +void ff_vector_fmul_reverse_avx2(float *dst, const float *src0, + const float *src1, int len); + +float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); +float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order); + +void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len); + +av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE(cpu_flags)) { + fdsp->vector_fmul = ff_vector_fmul_sse; + fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse; + fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse; + fdsp->vector_fmul_window = ff_vector_fmul_window_sse; + fdsp->vector_fmul_add = ff_vector_fmul_add_sse; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse; + fdsp->scalarproduct_float = 
ff_scalarproduct_float_sse; + fdsp->butterflies_float = ff_butterflies_float_sse; + } + if (EXTERNAL_SSE2(cpu_flags)) { + fdsp->vector_dmul = ff_vector_dmul_sse2; + fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2; + fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2; + } + if (EXTERNAL_AVX_FAST(cpu_flags)) { + fdsp->vector_fmul = ff_vector_fmul_avx; + fdsp->vector_dmul = ff_vector_dmul_avx; + fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx; + fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx; + fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx; + fdsp->vector_fmul_add = ff_vector_fmul_add_avx; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx; + } + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2; + } + if (EXTERNAL_FMA3_FAST(cpu_flags)) { + fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3; + fdsp->vector_fmul_add = ff_vector_fmul_add_fma3; + fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3; + fdsp->scalarproduct_float = ff_scalarproduct_float_fma3; + } +} diff --git a/media/ffvpx/libavutil/x86/imgutils.asm b/media/ffvpx/libavutil/x86/imgutils.asm new file mode 100644 index 0000000000..3cca56cdca --- /dev/null +++ b/media/ffvpx/libavutil/x86/imgutils.asm @@ -0,0 +1,53 @@ +;***************************************************************************** +;* Copyright 2016 Anton Khirnov +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +INIT_XMM sse4 +cglobal image_copy_plane_uc_from, 6, 7, 4, dst, dst_linesize, src, src_linesize, bw, height, rowpos + add dstq, bwq + add srcq, bwq + neg bwq + +.row_start: + mov rowposq, bwq + +.loop: + movntdqa m0, [srcq + rowposq + 0 * mmsize] + movntdqa m1, [srcq + rowposq + 1 * mmsize] + movntdqa m2, [srcq + rowposq + 2 * mmsize] + movntdqa m3, [srcq + rowposq + 3 * mmsize] + + mova [dstq + rowposq + 0 * mmsize], m0 + mova [dstq + rowposq + 1 * mmsize], m1 + mova [dstq + rowposq + 2 * mmsize], m2 + mova [dstq + rowposq + 3 * mmsize], m3 + + add rowposq, 4 * mmsize + jnz .loop + + add srcq, src_linesizeq + add dstq, dst_linesizeq + dec heightd + jnz .row_start + + RET diff --git a/media/ffvpx/libavutil/x86/imgutils_init.c b/media/ffvpx/libavutil/x86/imgutils_init.c new file mode 100644 index 0000000000..91a16cf594 --- /dev/null +++ b/media/ffvpx/libavutil/x86/imgutils_init.c @@ -0,0 +1,48 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include + +#include "libavutil/cpu.h" +#include "libavutil/error.h" +#include "libavutil/imgutils_internal.h" +#include "libavutil/macros.h" + +#include "cpu.h" + +void ff_image_copy_plane_uc_from_sse4(uint8_t *dst, ptrdiff_t dst_linesize, + const uint8_t *src, ptrdiff_t src_linesize, + ptrdiff_t bytewidth, int height); + +int ff_image_copy_plane_uc_from_x86(uint8_t *dst, ptrdiff_t dst_linesize, + const uint8_t *src, ptrdiff_t src_linesize, + ptrdiff_t bytewidth, int height) +{ + int cpu_flags = av_get_cpu_flags(); + ptrdiff_t bw_aligned = FFALIGN(bytewidth, 64); + + if (EXTERNAL_SSE4(cpu_flags) && + bw_aligned <= dst_linesize && bw_aligned <= src_linesize) + ff_image_copy_plane_uc_from_sse4(dst, dst_linesize, src, src_linesize, + bw_aligned, height); + else + return AVERROR(ENOSYS); + + return 0; +} diff --git a/media/ffvpx/libavutil/x86/intmath.h b/media/ffvpx/libavutil/x86/intmath.h new file mode 100644 index 0000000000..8a6b5ae261 --- /dev/null +++ b/media/ffvpx/libavutil/x86/intmath.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2015 James Almer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_INTMATH_H +#define AVUTIL_X86_INTMATH_H + +#include +#include +#if HAVE_FAST_CLZ +#if defined(_MSC_VER) +#include +#elif defined(__INTEL_COMPILER) +#include +#endif +#endif +#include "config.h" + +#if HAVE_FAST_CLZ +#if (defined(__INTEL_COMPILER) && (__INTEL_COMPILER>=1216)) || defined(_MSC_VER) +# if defined(__INTEL_COMPILER) +# define ff_log2(x) (_bit_scan_reverse((x)|1)) +# else +# define ff_log2 ff_log2_x86 +static av_always_inline av_const int ff_log2_x86(unsigned int v) +{ + unsigned long n; + _BitScanReverse(&n, v|1); + return n; +} +# endif +# define ff_log2_16bit av_log2 + +#if defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER >= 1700) && \ + (defined(__BMI__) || !defined(__clang__))) +# define ff_ctz(v) _tzcnt_u32(v) + +# if ARCH_X86_64 +# define ff_ctzll(v) _tzcnt_u64(v) +# else +# define ff_ctzll ff_ctzll_x86 +static av_always_inline av_const int ff_ctzll_x86(long long v) +{ + return ((uint32_t)v == 0) ? 
_tzcnt_u32((uint32_t)(v >> 32)) + 32 : _tzcnt_u32((uint32_t)v); +} +# endif +#endif /* _MSC_VER */ + +#endif /* __INTEL_COMPILER */ + +#endif /* HAVE_FAST_CLZ */ + +#if defined(__GNUC__) + +/* Our generic version of av_popcount is faster than GCC's built-in on + * CPUs that don't support the popcnt instruction. + */ +#if defined(__POPCNT__) + #define av_popcount __builtin_popcount +#if ARCH_X86_64 + #define av_popcount64 __builtin_popcountll +#endif + +#endif /* __POPCNT__ */ + +#if defined(__BMI2__) + +#if AV_GCC_VERSION_AT_LEAST(5,1) +#define av_mod_uintp2 __builtin_ia32_bzhi_si +#elif HAVE_INLINE_ASM +/* GCC releases before 5.1.0 have a broken bzhi builtin, so for those we + * implement it using inline assembly + */ +#define av_mod_uintp2 av_mod_uintp2_bmi2 +static av_always_inline av_const unsigned av_mod_uintp2_bmi2(unsigned a, unsigned p) +{ + if (av_builtin_constant_p(p)) + return a & ((1 << p) - 1); + else { + unsigned x; + __asm__ ("bzhi %2, %1, %0 \n\t" : "=r"(x) : "rm"(a), "r"(p)); + return x; + } +} +#endif /* AV_GCC_VERSION_AT_LEAST */ + +#endif /* __BMI2__ */ + +#if defined(__SSE2__) && !defined(__INTEL_COMPILER) + +#define av_clipd av_clipd_sse2 +static av_always_inline av_const double av_clipd_sse2(double a, double amin, double amax) +{ +#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2 + if (amin > amax) abort(); +#endif + __asm__ ("maxsd %1, %0 \n\t" + "minsd %2, %0 \n\t" + : "+&x"(a) : "xm"(amin), "xm"(amax)); + return a; +} + +#endif /* __SSE2__ */ + +#if defined(__SSE__) && !defined(__INTEL_COMPILER) + +#define av_clipf av_clipf_sse +static av_always_inline av_const float av_clipf_sse(float a, float amin, float amax) +{ +#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2 + if (amin > amax) abort(); +#endif + __asm__ ("maxss %1, %0 \n\t" + "minss %2, %0 \n\t" + : "+&x"(a) : "xm"(amin), "xm"(amax)); + return a; +} + +#endif /* __SSE__ */ + +#if defined(__AVX__) && !defined(__INTEL_COMPILER) + +#undef av_clipd +#define av_clipd av_clipd_avx +static av_always_inline av_const double av_clipd_avx(double a, double amin, double amax) +{ +#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2 + if (amin > amax) abort(); +#endif + __asm__ ("vmaxsd %1, %0, %0 \n\t" + "vminsd %2, %0, %0 \n\t" + : "+&x"(a) : "xm"(amin), "xm"(amax)); + return a; +} + +#undef av_clipf +#define av_clipf av_clipf_avx +static av_always_inline av_const float av_clipf_avx(float a, float amin, float amax) +{ +#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2 + if (amin > amax) abort(); +#endif + __asm__ ("vmaxss %1, %0, %0 \n\t" + "vminss %2, %0, %0 \n\t" + : "+&x"(a) : "xm"(amin), "xm"(amax)); + return a; +} + +#endif /* __AVX__ */ + +#endif /* __GNUC__ */ + +#endif /* AVUTIL_X86_INTMATH_H */ diff --git a/media/ffvpx/libavutil/x86/intreadwrite.h b/media/ffvpx/libavutil/x86/intreadwrite.h new file mode 100644 index 0000000000..40f375b013 --- /dev/null +++ b/media/ffvpx/libavutil/x86/intreadwrite.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2010 Alexander Strange + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_INTREADWRITE_H +#define AVUTIL_X86_INTREADWRITE_H + +#include +#include "config.h" +#include "libavutil/attributes.h" + +#if HAVE_MMX + +#if !HAVE_FAST_64BIT && defined(__MMX__) + +#define FF_COPY_SWAP_ZERO_USES_MMX + +#define AV_COPY64 AV_COPY64 +static av_always_inline void AV_COPY64(void *d, const void *s) +{ + __asm__("movq %1, %%mm0 \n\t" + "movq %%mm0, %0 \n\t" + : "=m"(*(uint64_t*)d) + : "m" (*(const uint64_t*)s) + : "mm0"); +} + +#define AV_SWAP64 AV_SWAP64 +static av_always_inline void AV_SWAP64(void *a, void *b) +{ + __asm__("movq %1, %%mm0 \n\t" + "movq %0, %%mm1 \n\t" + "movq %%mm0, %0 \n\t" + "movq %%mm1, %1 \n\t" + : "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b) + ::"mm0", "mm1"); +} + +#define AV_ZERO64 AV_ZERO64 +static av_always_inline void AV_ZERO64(void *d) +{ + __asm__("pxor %%mm0, %%mm0 \n\t" + "movq %%mm0, %0 \n\t" + : "=m"(*(uint64_t*)d) + :: "mm0"); +} + +#endif /* !HAVE_FAST_64BIT && defined(__MMX__) */ + +#ifdef __SSE__ + +#define AV_COPY128 AV_COPY128 +static av_always_inline void AV_COPY128(void *d, const void *s) +{ + struct v {uint64_t v[2];}; + + __asm__("movaps %1, %%xmm0 \n\t" + "movaps %%xmm0, %0 \n\t" + : "=m"(*(struct v*)d) + : "m" (*(const struct v*)s) + : "xmm0"); +} + +#endif /* __SSE__ */ + +#ifdef __SSE2__ + +#define AV_ZERO128 AV_ZERO128 +static av_always_inline void AV_ZERO128(void *d) +{ + struct v {uint64_t v[2];}; + + __asm__("pxor %%xmm0, %%xmm0 \n\t" + "movdqa %%xmm0, %0 \n\t" + : "=m"(*(struct v*)d) + :: "xmm0"); +} + +#endif /* __SSE2__ */ + +#endif /* HAVE_MMX */ + +#endif /* AVUTIL_X86_INTREADWRITE_H */ diff --git a/media/ffvpx/libavutil/x86/lls.asm b/media/ffvpx/libavutil/x86/lls.asm new file mode 100644 index 0000000000..e8141e6c4f --- /dev/null +++ b/media/ffvpx/libavutil/x86/lls.asm @@ -0,0 +1,290 @@ +;****************************************************************************** +;* linear least squares model +;* +;* Copyright (c) 2013 Loren Merritt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +%define MAX_VARS 32 +%define MAX_VARS_ALIGN (MAX_VARS+4) +%define COVAR_STRIDE MAX_VARS_ALIGN*8 +%define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE] + +struc LLSModel + .covariance: resq MAX_VARS_ALIGN*MAX_VARS_ALIGN + .coeff: resq MAX_VARS*MAX_VARS + .variance: resq MAX_VARS + .indep_count: resd 1 +endstruc + +%macro ADDPD_MEM 2 +%if cpuflag(avx) + vaddpd %2, %2, %1 +%else + addpd %2, %1 +%endif + mova %1, %2 +%endmacro + +INIT_XMM sse2 +%define movdqa movaps +cglobal update_lls, 2,5,8, ctx, var, i, j, covar2 + %define covarq ctxq + mov id, [ctxq + LLSModel.indep_count] + lea varq, [varq + iq*8] + neg iq + mov covar2q, covarq +.loopi: + ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal + mova m1, [varq + iq*8] + mova m3, [varq + iq*8 + 16] + pshufd m4, m1, q1010 + pshufd m5, m1, q3232 + pshufd m6, m3, q1010 + pshufd m7, m3, q3232 + mulpd m0, m1, m4 + mulpd m1, m1, m5 + lea covarq, [covar2q + 16] + ADDPD_MEM COVAR(-2,0), m0 + ADDPD_MEM COVAR(-2,1), m1 + lea jq, [iq + 2] + cmp jd, -2 + jg .skip4x4 +.loop4x4: + ; Compute all 16 pairwise products of a 4x4 block + mulpd m0, m4, m3 + mulpd m1, m5, m3 + mulpd m2, m6, m3 + mulpd m3, m3, m7 + ADDPD_MEM COVAR(0,0), m0 + ADDPD_MEM COVAR(0,1), m1 + ADDPD_MEM COVAR(0,2), m2 + ADDPD_MEM COVAR(0,3), m3 + mova m3, [varq + jq*8 + 16] + mulpd m0, m4, m3 + mulpd m1, m5, m3 + mulpd m2, m6, m3 + mulpd m3, m3, m7 + ADDPD_MEM COVAR(2,0), m0 + ADDPD_MEM COVAR(2,1), m1 + ADDPD_MEM COVAR(2,2), m2 + ADDPD_MEM COVAR(2,3), m3 + mova m3, [varq + jq*8 + 32] + add covarq, 32 + add jq, 4 + cmp jd, -2 + jle .loop4x4 +.skip4x4: + test jd, jd + jg .skip2x4 + mulpd m4, m3 + mulpd m5, m3 + mulpd m6, m3 + mulpd m7, m3 + ADDPD_MEM COVAR(0,0), m4 + ADDPD_MEM COVAR(0,1), m5 + ADDPD_MEM COVAR(0,2), m6 + ADDPD_MEM COVAR(0,3), m7 +.skip2x4: + add iq, 4 + add covar2q, 4*COVAR_STRIDE+32 + cmp id, -2 + jle .loopi + test id, id + jg .ret + mov jq, iq + %define covarq covar2q +.loop2x1: + movsd m0, [varq + iq*8] + movlhps m0, m0 + mulpd m0, [varq + jq*8] + ADDPD_MEM COVAR(0,0), m0 + inc iq + add covarq, COVAR_STRIDE + test id, id + jle .loop2x1 +.ret: + RET + +%macro UPDATE_LLS 0 +cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 + %define covarq ctxq + mov countd, [ctxq + LLSModel.indep_count] + lea count2d, [countq-2] + xor id, id +.loopi: + ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal + mova ymm1, [varq + iq*8] + vbroadcastsd ymm4, [varq + iq*8] + vbroadcastsd ymm5, [varq + iq*8 + 8] + vbroadcastsd ymm6, [varq + iq*8 + 16] + vbroadcastsd ymm7, [varq + iq*8 + 24] + vextractf128 xmm3, ymm1, 1 +%if cpuflag(fma3) + mova ymm0, COVAR(iq ,0) + mova xmm2, COVAR(iq+2,2) + fmaddpd ymm0, ymm1, ymm4, ymm0 + fmaddpd xmm2, xmm3, xmm6, xmm2 + fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1) + fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3) + mova COVAR(iq ,0), ymm0 + mova COVAR(iq ,1), ymm1 + mova COVAR(iq+2,2), xmm2 + mova COVAR(iq+2,3), xmm3 +%else + vmulpd ymm0, ymm1, ymm4 + vmulpd ymm1, ymm1, ymm5 + vmulpd xmm2, xmm3, xmm6 + vmulpd xmm3, xmm3, xmm7 + ADDPD_MEM COVAR(iq ,0), ymm0 + ADDPD_MEM COVAR(iq ,1), ymm1 + ADDPD_MEM COVAR(iq+2,2), xmm2 + ADDPD_MEM COVAR(iq+2,3), xmm3 +%endif ; 
cpuflag(fma3) + lea jd, [iq + 4] + cmp jd, count2d + jg .skip4x4 +.loop4x4: + ; Compute all 16 pairwise products of a 4x4 block + mova ymm3, [varq + jq*8] +%if cpuflag(fma3) + mova ymm0, COVAR(jq, 0) + mova ymm1, COVAR(jq, 1) + mova ymm2, COVAR(jq, 2) + fmaddpd ymm0, ymm3, ymm4, ymm0 + fmaddpd ymm1, ymm3, ymm5, ymm1 + fmaddpd ymm2, ymm3, ymm6, ymm2 + fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3) + mova COVAR(jq, 0), ymm0 + mova COVAR(jq, 1), ymm1 + mova COVAR(jq, 2), ymm2 + mova COVAR(jq, 3), ymm3 +%else + vmulpd ymm0, ymm3, ymm4 + vmulpd ymm1, ymm3, ymm5 + vmulpd ymm2, ymm3, ymm6 + vmulpd ymm3, ymm3, ymm7 + ADDPD_MEM COVAR(jq,0), ymm0 + ADDPD_MEM COVAR(jq,1), ymm1 + ADDPD_MEM COVAR(jq,2), ymm2 + ADDPD_MEM COVAR(jq,3), ymm3 +%endif ; cpuflag(fma3) + add jd, 4 + cmp jd, count2d + jle .loop4x4 +.skip4x4: + cmp jd, countd + jg .skip2x4 + mova xmm3, [varq + jq*8] +%if cpuflag(fma3) + mova xmm0, COVAR(jq, 0) + mova xmm1, COVAR(jq, 1) + mova xmm2, COVAR(jq, 2) + fmaddpd xmm0, xmm3, xmm4, xmm0 + fmaddpd xmm1, xmm3, xmm5, xmm1 + fmaddpd xmm2, xmm3, xmm6, xmm2 + fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3) + mova COVAR(jq, 0), xmm0 + mova COVAR(jq, 1), xmm1 + mova COVAR(jq, 2), xmm2 + mova COVAR(jq, 3), xmm3 +%else + vmulpd xmm0, xmm3, xmm4 + vmulpd xmm1, xmm3, xmm5 + vmulpd xmm2, xmm3, xmm6 + vmulpd xmm3, xmm3, xmm7 + ADDPD_MEM COVAR(jq,0), xmm0 + ADDPD_MEM COVAR(jq,1), xmm1 + ADDPD_MEM COVAR(jq,2), xmm2 + ADDPD_MEM COVAR(jq,3), xmm3 +%endif ; cpuflag(fma3) +.skip2x4: + add id, 4 + add covarq, 4*COVAR_STRIDE + cmp id, count2d + jle .loopi + cmp id, countd + jg .ret + mov jd, id +.loop2x1: + vmovddup xmm0, [varq + iq*8] +%if cpuflag(fma3) + mova xmm1, [varq + jq*8] + fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0) + mova COVAR(jq,0), xmm0 +%else + vmulpd xmm0, [varq + jq*8] + ADDPD_MEM COVAR(jq,0), xmm0 +%endif ; cpuflag(fma3) + inc id + add covarq, COVAR_STRIDE + cmp id, countd + jle .loop2x1 +.ret: + RET +%endmacro ; UPDATE_LLS + +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +UPDATE_LLS +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +UPDATE_LLS +%endif + +INIT_XMM sse2 +cglobal evaluate_lls, 3,4,2, ctx, var, order, i + ; This function is often called on the same buffer as update_lls, but with + ; an offset. They can't both be aligned. + ; Load halves rather than movu to avoid store-forwarding stalls, since the + ; input was initialized immediately prior to this function using scalar math. + %define coefsq ctxq + mov id, orderd + imul orderd, MAX_VARS + lea coefsq, [ctxq + LLSModel.coeff + orderq*8] + movsd m0, [varq] + movhpd m0, [varq + 8] + mulpd m0, [coefsq] + lea coefsq, [coefsq + iq*8] + lea varq, [varq + iq*8] + neg iq + add iq, 2 +.loop: + movsd m1, [varq + iq*8] + movhpd m1, [varq + iq*8 + 8] + mulpd m1, [coefsq + iq*8] + addpd m0, m1 + add iq, 2 + jl .loop + jg .skip1 + movsd m1, [varq + iq*8] + mulsd m1, [coefsq + iq*8] + addpd m0, m1 +.skip1: + movhlps m1, m0 + addsd m0, m1 +%if ARCH_X86_32 + movsd r0m, m0 + fld qword r0m +%endif + RET diff --git a/media/ffvpx/libavutil/x86/lls_init.c b/media/ffvpx/libavutil/x86/lls_init.c new file mode 100644 index 0000000000..c786376915 --- /dev/null +++ b/media/ffvpx/libavutil/x86/lls_init.c @@ -0,0 +1,46 @@ +/* + * linear least squares model + * + * Copyright (c) 2013 Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/lls.h" +#include "libavutil/x86/cpu.h" + +void ff_update_lls_sse2(LLSModel *m, const double *var); +void ff_update_lls_avx(LLSModel *m, const double *var); +void ff_update_lls_fma3(LLSModel *m, const double *var); +double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order); + +av_cold void ff_init_lls_x86(LLSModel *m) +{ + int cpu_flags = av_get_cpu_flags(); + if (EXTERNAL_SSE2(cpu_flags)) { + m->update_lls = ff_update_lls_sse2; + if (m->indep_count >= 4) + m->evaluate_lls = ff_evaluate_lls_sse2; + } + if (EXTERNAL_AVX_FAST(cpu_flags)) { + m->update_lls = ff_update_lls_avx; + } + if (EXTERNAL_FMA3_FAST(cpu_flags)) { + m->update_lls = ff_update_lls_fma3; + } +} diff --git a/media/ffvpx/libavutil/x86/moz.build b/media/ffvpx/libavutil/x86/moz.build new file mode 100644 index 0000000000..df33768f66 --- /dev/null +++ b/media/ffvpx/libavutil/x86/moz.build @@ -0,0 +1,25 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +SOURCES += [ + 'cpu.c', + 'cpuid.asm', + 'emms.asm', + 'fixed_dsp.asm', + 'fixed_dsp_init.c', + 'float_dsp.asm', + 'float_dsp_init.c', + 'imgutils.asm', + 'imgutils_init.c', + 'lls.asm', + 'lls_init.c', + 'tx_float.asm', + 'tx_float_init.c', +] + +FINAL_LIBRARY = 'mozavutil' + +include('/media/ffvpx/ffvpxcommon.mozbuild') diff --git a/media/ffvpx/libavutil/x86/pixelutils.asm b/media/ffvpx/libavutil/x86/pixelutils.asm new file mode 100644 index 0000000000..0bcccb51f5 --- /dev/null +++ b/media/ffvpx/libavutil/x86/pixelutils.asm @@ -0,0 +1,329 @@ +;****************************************************************************** +;* Pixel utilities SIMD +;* +;* Copyright (C) 2002-2004 Michael Niedermayer +;* Copyright (C) 2014 Clément Bœsch +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +INIT_MMX mmxext +cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2 + pxor m2, m2 +%rep 4 + mova m0, [src1q] + mova m1, [src1q + stride1q] + psadbw m0, [src2q] + psadbw m1, [src2q + stride2q] + paddw m2, m0 + paddw m2, m1 + lea src1q, [src1q + 2*stride1q] + lea src2q, [src2q + 2*stride2q] +%endrep + movd eax, m2 + emms + RET + +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2 + movu m4, [src1q] + movu m2, [src2q] + movu m1, [src1q + stride1q] + movu m3, [src2q + stride2q] + psadbw m4, m2 + psadbw m1, m3 + paddw m4, m1 +%rep 7 + lea src1q, [src1q + 2*stride1q] + lea src2q, [src2q + 2*stride2q] + movu m0, [src1q] + movu m2, [src2q] + movu m1, [src1q + stride1q] + movu m3, [src2q + stride2q] + psadbw m0, m2 + psadbw m1, m3 + paddw m4, m0 + paddw m4, m1 +%endrep + movhlps m0, m4 + paddw m4, m0 + movd eax, m4 + RET + +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_[au]_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +%macro SAD_XMM_16x16 1 +INIT_XMM sse2 +cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2 + mov%1 m2, [src2q] + psadbw m2, [src1q] + mov%1 m1, [src2q + stride2q] + psadbw m1, [src1q + stride1q] + paddw m2, m1 +%rep 7 + lea src1q, [src1q + 2*stride1q] + lea src2q, [src2q + 2*stride2q] + mov%1 m0, [src2q] + psadbw m0, [src1q] + mov%1 m1, [src2q + stride2q] + psadbw m1, [src1q + stride1q] + paddw m2, m0 + paddw m2, m1 +%endrep + movhlps m0, m2 + paddw m2, m0 + movd eax, m2 + RET +%endmacro + +SAD_XMM_16x16 a +SAD_XMM_16x16 u + + +%macro PROCESS_SAD_32x4_U 0 + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r0] + movu m4, [r0 + 16] + psadbw m1, m3 + psadbw m2, m4 + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r0] + movu m4, [r0 + 16] + psadbw m1, m3 + psadbw m2, m4 + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r0] + movu m4, [r0 + 16] + psadbw m1, m3 + psadbw m2, m4 + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r0] + movu m4, [r0 + 16] + psadbw m1, m3 + psadbw m2, m4 + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] +%endmacro + +%macro PROCESS_SAD_32x4 1 + mov%1 m1, [r2] + mov%1 m2, [r2 + 16] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea 
r0, [r0 + r1] + + mov%1 m1, [r2] + mov%1 m2, [r2 + 16] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + mov%1 m1, [r2] + mov%1 m2, [r2 + 16] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + mov%1 m1, [r2] + mov%1 m2, [r2 + 16] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] +%endmacro + +;----------------------------------------------------------------------------- +; int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pixelutils_sad_32x32, 4,5,5, src1, stride1, src2, stride2 + pxor m0, m0 + mov r4d, 4 +.loop: + PROCESS_SAD_32x4_U + PROCESS_SAD_32x4_U + dec r4d + jnz .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_[au]_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +%macro SAD_XMM_32x32 1 +INIT_XMM sse2 +cglobal pixelutils_sad_%1_32x32, 4,5,3, src1, stride1, src2, stride2 + pxor m0, m0 + mov r4d, 4 +.loop: + PROCESS_SAD_32x4 %1 + PROCESS_SAD_32x4 %1 + dec r4d + jnz .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +SAD_XMM_32x32 a +SAD_XMM_32x32 u + +%if HAVE_AVX2_EXTERNAL +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2 + pxor m0, m0 + mov r4d, 32/4 + lea r5, [stride1q * 3] + lea r6, [stride2q * 3] + +.loop: + movu m1, [src1q] ; row 0 of pix0 + movu m2, [src2q] ; row 0 of pix1 + movu m3, [src1q + stride1q] ; row 1 of pix0 + movu m4, [src2q + stride2q] ; row 1 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m0, m3 + + movu m1, [src1q + 2 * stride1q] ; row 2 of pix0 + movu m2, [src2q + 2 * stride2q] ; row 2 of pix1 + movu m3, [src1q + r5] ; row 3 of pix0 + movu m4, [src2q + r6] ; row 3 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m0, m3 + + lea src2q, [src2q + 4 * stride2q] + lea src1q, [src1q + 4 * stride1q] + + dec r4d + jnz .loop + + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0, xm1 + movd eax, xm0 + RET + +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +%macro SAD_AVX2_32x32 1 +INIT_YMM avx2 +cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2 + pxor m0, m0 + mov r4d, 32/4 + lea r5, [stride1q * 3] + lea r6, [stride2q * 3] + +.loop: + mov%1 m1, [src2q] ; row 0 of pix1 + psadbw m1, [src1q] + mov%1 m2, [src2q + stride2q] ; row 1 of pix1 + psadbw m2, [src1q + stride1q] + + paddd m0, m1 + paddd m0, m2 + + mov%1 m1, [src2q + 2 * stride2q] ; row 2 of pix1 + psadbw m1, [src1q + 2 * stride1q] + mov%1 m2, [src2q + r6] ; row 3 of pix1 + 
psadbw m2, [src1q + r5] + + paddd m0, m1 + paddd m0, m2 + + lea src2q, [src2q + 4 * stride2q] + lea src1q, [src1q + 4 * stride1q] + + dec r4d + jnz .loop + + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0, xm1 + movd eax, xm0 + RET +%endmacro + +SAD_AVX2_32x32 a +SAD_AVX2_32x32 u +%endif diff --git a/media/ffvpx/libavutil/x86/pixelutils.h b/media/ffvpx/libavutil/x86/pixelutils.h new file mode 100644 index 0000000000..876cf46053 --- /dev/null +++ b/media/ffvpx/libavutil/x86/pixelutils.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_PIXELUTILS_H +#define AVUTIL_X86_PIXELUTILS_H + +#include "libavutil/pixelutils.h" + +void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned); + +#endif /* AVUTIL_X86_PIXELUTILS_H */ diff --git a/media/ffvpx/libavutil/x86/pixelutils_init.c b/media/ffvpx/libavutil/x86/pixelutils_init.c new file mode 100644 index 0000000000..c3c0662414 --- /dev/null +++ b/media/ffvpx/libavutil/x86/pixelutils_init.c @@ -0,0 +1,85 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "pixelutils.h" +#include "cpu.h" + +int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_a_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_a_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_u_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned) +{ + int cpu_flags = av_get_cpu_flags(); + + // The best way to use SSE2 would be to do 2 SADs in parallel, + // but we'd have to modify the pixelutils API to return SIMD functions. + + // It's probably not faster to shuffle data around + // to get two lines of 8 pixels into a single 16byte register, + // so just use the MMX 8x8 version even when SSE2 is available. + if (EXTERNAL_MMXEXT(cpu_flags)) { + sad[2] = ff_pixelutils_sad_8x8_mmxext; + } + + if (EXTERNAL_SSE2(cpu_flags)) { + switch (aligned) { + case 0: sad[3] = ff_pixelutils_sad_16x16_sse2; break; // src1 unaligned, src2 unaligned + case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1 aligned, src2 unaligned + case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1 aligned, src2 aligned + } + } + + if (EXTERNAL_SSE2(cpu_flags)) { + switch (aligned) { + case 0: sad[4] = ff_pixelutils_sad_32x32_sse2; break; // src1 unaligned, src2 unaligned + case 1: sad[4] = ff_pixelutils_sad_u_32x32_sse2; break; // src1 aligned, src2 unaligned + case 2: sad[4] = ff_pixelutils_sad_a_32x32_sse2; break; // src1 aligned, src2 aligned + } + } + + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + switch (aligned) { + case 0: sad[4] = ff_pixelutils_sad_32x32_avx2; break; // src1 unaligned, src2 unaligned + case 1: sad[4] = ff_pixelutils_sad_u_32x32_avx2; break; // src1 aligned, src2 unaligned + case 2: sad[4] = ff_pixelutils_sad_a_32x32_avx2; break; // src1 aligned, src2 aligned + } + } +} diff --git a/media/ffvpx/libavutil/x86/timer.h b/media/ffvpx/libavutil/x86/timer.h new file mode 100644 index 0000000000..4d1e88def0 --- /dev/null +++ b/media/ffvpx/libavutil/x86/timer.h @@ -0,0 +1,50 @@ +/* + * copyright (c) 2006 Michael Niedermayer + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_TIMER_H +#define AVUTIL_X86_TIMER_H + +#include + +#if HAVE_INLINE_ASM + +#define FF_TIMER_UNITS "decicycles" +#define AV_READ_TIME read_time + +static inline uint64_t read_time(void) +{ + uint32_t a, d; + __asm__ volatile( +#if ARCH_X86_64 || defined(__SSE2__) + "lfence \n\t" +#endif + "rdtsc \n\t" + : "=a" (a), "=d" (d)); + return ((uint64_t)d << 32) + a; +} + +#elif HAVE_RDTSC + +#include +#define AV_READ_TIME __rdtsc + +#endif /* HAVE_INLINE_ASM */ + +#endif /* AVUTIL_X86_TIMER_H */ diff --git a/media/ffvpx/libavutil/x86/tx_float.asm b/media/ffvpx/libavutil/x86/tx_float.asm new file mode 100644 index 0000000000..e1533a8595 --- /dev/null +++ b/media/ffvpx/libavutil/x86/tx_float.asm @@ -0,0 +1,1936 @@ +;****************************************************************************** +;* Copyright (c) Lynne +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +; Open `doc/transforms.md` to see the code upon which the transforms here were +; based upon and compare. + +; Intra-asm call convention: +; 320 bytes of stack available +; 14 GPRs available (last 4 must not be clobbered) +; Additionally, don't clobber ctx, in, out, stride, len, lut +; All vector regs available + +; TODO: +; carry over registers from smaller transforms to save on ~8 loads/stores +; check if vinsertf could be faster than verpm2f128 for duplication +; even faster FFT8 (current one is very #instructions optimized) +; replace some xors with blends + addsubs? +; replace some shuffles with vblends? +; avx512 split-radix + +%include "libavutil/x86/x86util.asm" + +%define private_prefix ff_tx + +%if ARCH_X86_64 +%define ptr resq +%else +%define ptr resd +%endif + +%assign i 16 +%rep 14 +cextern tab_ %+ i %+ _float ; ff_tab_i_float... 
+%assign i (i << 1) +%endrep + +cextern tab_53_float + +struc AVTXContext + .len: resd 1 ; Length + .inv resd 1 ; Inverse flag + .map: ptr 1 ; Lookup table(s) + .exp: ptr 1 ; Exponentiation factors + .tmp: ptr 1 ; Temporary data + + .sub: ptr 1 ; Subcontexts + .fn: ptr 4 ; Subcontext functions + .nb_sub: resd 1 ; Subcontext count + + ; Everything else is inaccessible +endstruc + +SECTION_RODATA 32 + +%define POS 0x00000000 +%define NEG 0x80000000 + +%define M_SQRT1_2 0.707106781186547524401 +%define COS16_1 0.92387950420379638671875 +%define COS16_3 0.3826834261417388916015625 + +d8_mult_odd: dd M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, \ + M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2 + +s8_mult_odd: dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 +s8_perm_even: dd 1, 3, 0, 2, 1, 3, 2, 0 +s8_perm_odd1: dd 3, 3, 1, 1, 1, 1, 3, 3 +s8_perm_odd2: dd 1, 2, 0, 3, 1, 0, 0, 1 + +s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2 +s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, COS16_3, -COS16_3 +s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1 +s16_perm: dd 0, 1, 2, 3, 1, 0, 3, 2 + +s15_perm: dd 0, 6, 5, 3, 2, 4, 7, 1 + +mask_mmppmmmm: dd NEG, NEG, POS, POS, NEG, NEG, NEG, NEG +mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG +mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG +mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS +mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG +mask_pmmppmmp: dd POS, NEG, NEG, POS, POS, NEG, NEG, POS +mask_pmpmpmpm: times 4 dd POS, NEG + +SECTION .text + +; Load complex values (64 bits) via a lookup table +; %1 - output register +; %2 - GRP of base input memory address +; %3 - GPR of LUT (int32_t indices) address +; %4 - LUT offset +; %5 - temporary GPR (only used if vgather is not used) +; %6 - temporary register (for avx only) +; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set) +%macro LOAD64_LUT 5-7 +%if %0 > 6 && cpuflag(avx2) + pcmpeqd %7, %7 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25 + movupd xmm%6, [%3 + %4] ; float mov since vgatherdpd is a float instruction + vgatherdpd %1, [%2 + xmm%6*8], %7 ; must use separate registers for args +%else + mov %5d, [%3 + %4 + 0] + movsd xmm%1, [%2 + %5q*8] +%if sizeof%1 > 16 && %0 > 5 + mov %5d, [%3 + %4 + 8] + movsd xmm%6, [%2 + %5q*8] +%endif + mov %5d, [%3 + %4 + 4] + movhps xmm%1, [%2 + %5q*8] +%if sizeof%1 > 16 && %0 > 5 + mov %5d, [%3 + %4 + 12] + movhps xmm%6, [%2 + %5q*8] + vinsertf128 %1, %1, xmm%6, 1 +%endif +%endif +%endmacro + +; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode) +; %1 - coefficients (r0.reim, r1.reim) +; %2 - temporary +%macro FFT2 2 + shufps %2, %1, %1, q3322 + shufps %1, %1, %1, q1100 + + addsubps %1, %1, %2 + + shufps %1, %1, %1, q2031 +%endmacro + +; Single 4-point in-place complex FFT (will do 2 transforms at once in [AVX] mode) +; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim) +; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim) +; %3 - temporary +%macro FFT4 3 + subps %3, %1, %2 ; r1234, [r5678] + addps %1, %1, %2 ; t1234, [t5678] + + shufps %2, %1, %3, q1010 ; t12, r12 + shufps %1, %1, %3, q2332 ; t34, r43 + + subps %3, %2, %1 ; a34, b32 + addps %2, %2, %1 ; a12, b14 + + shufps %1, %2, %3, q1010 ; a1234 even + + shufps %2, %2, %3, q2332 ; b1423 + shufps %2, %2, %2, q1320 ; b1234 odd +%endmacro + +; Single/Dual 8-point in-place complex FFT 
(will do 2 transforms in [AVX] mode) +; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim]) +; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim]) +; %3 - odd coefficients (a1.reim, a3.reim, [b1.reim, b3.reim]) +; %4 - odd coefficients (a5.reim, a7.reim, [b5.reim, b7.reim]) +; %5 - temporary +; %6 - temporary +%macro FFT8 6 + addps %5, %1, %3 ; q1-8 + addps %6, %2, %4 ; k1-8 + + subps %1, %1, %3 ; r1-8 + subps %2, %2, %4 ; j1-8 + + shufps %4, %1, %1, q2323 ; r4343 + shufps %3, %5, %6, q3032 ; q34, k14 + + shufps %1, %1, %1, q1010 ; r1212 + shufps %5, %5, %6, q1210 ; q12, k32 + + xorps %4, %4, [mask_pmmppmmp] ; r4343 * pmmp + addps %6, %5, %3 ; s12, g12 + + mulps %2, %2, [d8_mult_odd] ; r8 * d8_mult_odd + subps %5, %5, %3 ; s34, g43 + + addps %3, %1, %4 ; z1234 + unpcklpd %1, %6, %5 ; s1234 + + shufps %4, %2, %2, q2301 ; j2143 + shufps %6, %6, %5, q2332 ; g1234 + + addsubps %2, %2, %4 ; l2143 + shufps %5, %2, %2, q0123 ; l3412 + addsubps %5, %5, %2 ; t1234 + + subps %2, %1, %6 ; h1234 even + subps %4, %3, %5 ; u1234 odd + + addps %1, %1, %6 ; w1234 even + addps %3, %3, %5 ; o1234 odd +%endmacro + +; Single 8-point in-place complex FFT in 20 instructions +; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim) +; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim) +; %3 - temporary +; %4 - temporary +%macro FFT8_AVX 4 + subps %3, %1, %2 ; r1234, r5678 + addps %1, %1, %2 ; q1234, q5678 + + vpermilps %2, %3, [s8_perm_odd1] ; r4422, r6688 + shufps %4, %1, %1, q3322 ; q1122, q5566 + + movsldup %3, %3 ; r1133, r5577 + shufps %1, %1, %1, q1100 ; q3344, q7788 + + addsubps %3, %3, %2 ; z1234, z5678 + addsubps %1, %1, %4 ; s3142, s7586 + + mulps %3, %3, [s8_mult_odd] ; z * s8_mult_odd + vpermilps %1, %1, [s8_perm_even] ; s1234, s5687 ! + + shufps %2, %3, %3, q2332 ; junk, z7887 + xorps %4, %1, [mask_mmmmpppm] ; e1234, e5687 ! + + vpermilps %3, %3, [s8_perm_odd2] ; z2314, z6556 + vperm2f128 %1, %1, %4, 0x03 ; e5687, s1234 + + addsubps %2, %2, %3 ; junk, t5678 + subps %1, %1, %4 ; w1234, w5678 even + + vperm2f128 %2, %2, %2, 0x11 ; t5678, t5678 + vperm2f128 %3, %3, %3, 0x00 ; z2314, z2314 + + xorps %2, %2, [mask_ppmpmmpm] ; t * ppmpmmpm + addps %2, %3, %2 ; u1234, u5678 odd +%endmacro + +; Single 16-point in-place complex FFT +; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim) +; %2 - even coefficients (r8.reim, r10.reim, r12.reim, r14.reim) +; %3 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim) +; %4 - odd coefficients (r9.reim, r11.reim, r13.reim, r15.reim) +; %5, %6 - temporary +; %7, %8 - temporary (optional) +%macro FFT16 6-8 + FFT4 %3, %4, %5 +%if %0 > 7 + FFT8_AVX %1, %2, %6, %7 + movaps %8, [mask_mpmppmpm] + movaps %7, [s16_perm] +%define mask %8 +%define perm %7 +%elif %0 > 6 + FFT8_AVX %1, %2, %6, %7 + movaps %7, [s16_perm] +%define mask [mask_mpmppmpm] +%define perm %7 +%else + FFT8_AVX %1, %2, %6, %5 +%define mask [mask_mpmppmpm] +%define perm [s16_perm] +%endif + xorps %5, %5, %5 ; 0 + + shufps %6, %4, %4, q2301 ; z12.imre, z13.imre... + shufps %5, %5, %3, q2301 ; 0, 0, z8.imre... 
+ + mulps %4, %4, [s16_mult_odd1] ; z.reim * costab + xorps %5, %5, [mask_mppmmpmp] +%if cpuflag(fma3) + fmaddps %6, %6, [s16_mult_odd2], %4 ; s[8..15] + addps %5, %3, %5 ; s[0...7] +%else + mulps %6, %6, [s16_mult_odd2] ; z.imre * costab + + addps %5, %3, %5 ; s[0...7] + addps %6, %4, %6 ; s[8..15] +%endif + mulps %5, %5, [s16_mult_even] ; s[0...7]*costab + + xorps %4, %6, mask ; s[8..15]*mpmppmpm + xorps %3, %5, mask ; s[0...7]*mpmppmpm + + vperm2f128 %4, %4, %4, 0x01 ; s[12..15, 8..11] + vperm2f128 %3, %3, %3, 0x01 ; s[4..7, 0..3] + + addps %6, %6, %4 ; y56, u56, y34, u34 + addps %5, %5, %3 ; w56, x56, w34, x34 + + vpermilps %6, %6, perm ; y56, u56, y43, u43 + vpermilps %5, %5, perm ; w56, x56, w43, x43 + + subps %4, %2, %6 ; odd part 2 + addps %3, %2, %6 ; odd part 1 + + subps %2, %1, %5 ; even part 2 + addps %1, %1, %5 ; even part 1 +%undef mask +%undef perm +%endmacro + +; Single 15-point complex FFT +; Input: +; xm0 must contain in[0,1].reim +; m2 - in[3-6].reim +; m3 - in[7-11].reim +; m4 - in[12-15].reim +; xm5 must contain in[2].reimreim +; +; Output: +; m0, m1, m2 - ACs +; xm14 - out[0] +; xm15 - out[10, 5] +%macro FFT15 0 + shufps xm1, xm0, xm0, q3223 ; in[1].imrereim + shufps xm0, xm0, xm0, q1001 ; in[0].imrereim + + xorps xm1, xm11 + addps xm1, xm0 ; pc[0,1].imre + + shufps xm0, xm1, xm1, q3232 ; pc[1].reimreim + addps xm0, xm5 ; dc[0].reimreim + + mulps xm1, xm9 ; tab[0123]*pc[01] + + shufpd xm6, xm1, xm1, 01b ; pc[1,0].reim + xorps xm1, xm11 + addps xm1, xm1, xm6 + addsubps xm1, xm5, xm1 ; dc[1,2].reim + + subps m7, m2, m3 ; q[0-3].imre + addps m6, m2, m3 ; q[4-7] + shufps m7, m7, m7, q2301 ; q[0-3].reim + + addps m5, m4, m6 ; y[0-3] + + vperm2f128 m14, m9, m9, 0x11 ; tab[23232323] + vbroadcastsd m15, xm9 ; tab[01010101] + + mulps m6, m14 + mulps m7, m15 + + subps m2, m6, m7 ; k[0-3] + addps m3, m6, m7 ; k[4-7] + + shufps m12, m11, m11, q3232 ; ppppmmmm + + addsubps m6, m4, m2 ; k[0-3] + addsubps m7, m4, m3 ; k[4-7] + + ; 15pt from here on + vpermpd m2, m5, q0123 ; y[3-0] + vpermpd m3, m6, q0123 ; k[3-0] + vpermpd m4, m7, q0123 ; k[7-4] + + xorps m5, m12 + xorps m6, m12 + xorps m7, m12 + + addps m2, m5 ; t[0-3] + addps m3, m6 ; t[4-7] + addps m4, m7 ; t[8-11] + + movlhps xm14, xm2 ; out[0] + unpcklpd xm15, xm3, xm4 ; out[10,5] + unpckhpd xm5, xm3, xm4 ; out[10,5] + + addps xm14, xm2 ; out[0] + addps xm15, xm5 ; out[10,5] + addps xm14, xm0 ; out[0] + addps xm15, xm1 ; out[10,5] + + shufps m12, m10, m10, q3232 ; tab5 4 5 4 5 8 9 8 9 + shufps m13, m10, m10, q1010 ; tab5 6 7 6 7 10 11 10 11 + + mulps m5, m2, m12 ; t[0-3] + mulps m6, m3, m12 ; t[4-7] + mulps m7, m4, m12 ; t[8-11] + + mulps m2, m13 ; r[0-3] + mulps m3, m13 ; r[4-7] + mulps m4, m13 ; r[8-11] + + shufps m5, m5, m5, q1032 ; t[1,0,3,2].reim + shufps m6, m6, m6, q1032 ; t[5,4,7,6].reim + shufps m7, m7, m7, q1032 ; t[9,8,11,10].reim + + vperm2f128 m13, m11, m11, 0x01 ; mmmmmmpp + shufps m12, m11, m11, q3232 ; ppppmmmm + + xorps m5, m13 + xorps m6, m13 + xorps m7, m13 + + addps m2, m5 ; r[0,1,2,3] + addps m3, m6 ; r[4,5,6,7] + addps m4, m7 ; r[8,9,10,11] + + shufps m5, m2, m2, q2301 + shufps m6, m3, m3, q2301 + shufps m7, m4, m4, q2301 + + xorps m2, m12 + xorps m3, m12 + xorps m4, m12 + + vpermpd m5, m5, q0123 + vpermpd m6, m6, q0123 + vpermpd m7, m7, q0123 + + addps m5, m2 + addps m6, m3 + addps m7, m4 + + vpermps m5, m8, m5 + vpermps m6, m8, m6 + vpermps m7, m8, m7 + + vbroadcastsd m0, xm0 ; dc[0] + vpermpd m2, m1, q1111 ; dc[2] + vbroadcastsd m1, xm1 ; dc[1] + + addps m0, m5 + addps m1, m6 + addps m2, m7 +%endmacro + 
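+; A note for readers of FFT15 above: it builds the 15-point transform around the
+; 3-point/5-point constants in tab_53_float, following the reference code in
+; `doc/transforms.md`. When modifying small fixed-size kernels like this one,
+; the simplest sanity check is to compare their output against a naive O(N^2)
+; DFT. The C below is only a minimal illustrative sketch of such a reference
+; (the function name, the C99 complex types and the harness around it are our
+; own choices, not part of this file or of the AVTXContext API); it uses the
+; usual forward-DFT sign convention, exp(-2*pi*i*m*k/N):
+;
+;   #include <complex.h>
+;   #include <stddef.h>
+;
+;   /* unnormalized forward DFT, O(N^2); good enough to validate N = 15 */
+;   static void naive_dft(complex float *out, const complex float *in, size_t n)
+;   {
+;       const float pi = 3.14159265358979323846f;
+;       for (size_t k = 0; k < n; k++) {
+;           complex float acc = 0;
+;           for (size_t m = 0; m < n; m++)
+;               acc += in[m] * cexpf(-2.0f * pi * I * (float)(m * k) / (float)n);
+;           out[k] = acc;
+;       }
+;   }
+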
+; Cobmines m0...m8 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs +; Uses all 16 of registers. +; Output is slightly permuted such that tx2,3's coefficients are interleaved +; on a 2-point basis (look at `doc/transforms.md`) +%macro SPLIT_RADIX_COMBINE 17 +%if %1 && mmsize == 32 + vperm2f128 %14, %6, %7, 0x20 ; m2[0], m2[1], m3[0], m3[1] even + vperm2f128 %16, %9, %8, 0x20 ; m2[0], m2[1], m3[0], m3[1] odd + vperm2f128 %15, %6, %7, 0x31 ; m2[2], m2[3], m3[2], m3[3] even + vperm2f128 %17, %9, %8, 0x31 ; m2[2], m2[3], m3[2], m3[3] odd +%endif + + shufps %12, %10, %10, q2200 ; cos00224466 + shufps %13, %11, %11, q1133 ; wim77553311 + movshdup %10, %10 ; cos11335577 + shufps %11, %11, %11, q0022 ; wim66442200 + +%if %1 && mmsize == 32 + shufps %6, %14, %14, q2301 ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even + shufps %8, %16, %16, q2301 ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd + shufps %7, %15, %15, q2301 ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even + shufps %9, %17, %17, q2301 ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd + + mulps %14, %14, %13 ; m2[0123]reim * wim7531 even + mulps %16, %16, %11 ; m2[0123]reim * wim7531 odd + mulps %15, %15, %13 ; m3[0123]reim * wim7531 even + mulps %17, %17, %11 ; m3[0123]reim * wim7531 odd +%else + mulps %14, %6, %13 ; m2,3[01]reim * wim7531 even + mulps %16, %8, %11 ; m2,3[01]reim * wim7531 odd + mulps %15, %7, %13 ; m2,3[23]reim * wim7531 even + mulps %17, %9, %11 ; m2,3[23]reim * wim7531 odd + ; reorder the multiplies to save movs reg, reg in the %if above + shufps %6, %6, %6, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even + shufps %8, %8, %8, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd + shufps %7, %7, %7, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even + shufps %9, %9, %9, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd +%endif + +%if cpuflag(fma3) ; 11 - 5 = 6 instructions saved through FMA! + fmaddsubps %6, %6, %12, %14 ; w[0..8] even + fmaddsubps %8, %8, %10, %16 ; w[0..8] odd + fmsubaddps %7, %7, %12, %15 ; j[0..8] even + fmsubaddps %9, %9, %10, %17 ; j[0..8] odd + movaps %13, [mask_pmpmpmpm] ; "subaddps? pfft, who needs that!" +%else + mulps %6, %6, %12 ; m2,3[01]imre * cos0246 + mulps %8, %8, %10 ; m2,3[01]imre * cos0246 + movaps %13, [mask_pmpmpmpm] ; "subaddps? pfft, who needs that!" + mulps %7, %7, %12 ; m2,3[23]reim * cos0246 + mulps %9, %9, %10 ; m2,3[23]reim * cos0246 + addsubps %6, %6, %14 ; w[0..8] + addsubps %8, %8, %16 ; w[0..8] + xorps %15, %15, %13 ; +-m2,3[23]imre * wim7531 + xorps %17, %17, %13 ; +-m2,3[23]imre * wim7531 + addps %7, %7, %15 ; j[0..8] + addps %9, %9, %17 ; j[0..8] +%endif + + addps %14, %6, %7 ; t10235476 even + addps %16, %8, %9 ; t10235476 odd + subps %15, %6, %7 ; +-r[0..7] even + subps %17, %8, %9 ; +-r[0..7] odd + + shufps %14, %14, %14, q2301 ; t[0..7] even + shufps %16, %16, %16, q2301 ; t[0..7] odd + xorps %15, %15, %13 ; r[0..7] even + xorps %17, %17, %13 ; r[0..7] odd + + subps %6, %2, %14 ; m2,3[01] even + subps %8, %4, %16 ; m2,3[01] odd + subps %7, %3, %15 ; m2,3[23] even + subps %9, %5, %17 ; m2,3[23] odd + + addps %2, %2, %14 ; m0 even + addps %4, %4, %16 ; m0 odd + addps %3, %3, %15 ; m1 even + addps %5, %5, %17 ; m1 odd +%endmacro + +; Same as above, only does one parity at a time, takes 3 temporary registers, +; however, if the twiddles aren't needed after this, the registers they use +; can be used as any of the temporary registers. 
+%macro SPLIT_RADIX_COMBINE_HALF 10 +%if %1 + shufps %8, %6, %6, q2200 ; cos00224466 + shufps %9, %7, %7, q1133 ; wim77553311 +%else + shufps %8, %6, %6, q3311 ; cos11335577 + shufps %9, %7, %7, q0022 ; wim66442200 +%endif + + mulps %10, %4, %9 ; m2,3[01]reim * wim7531 even + mulps %9, %9, %5 ; m2,3[23]reim * wim7531 even + + shufps %4, %4, %4, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even + shufps %5, %5, %5, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even + +%if cpuflag(fma3) + fmaddsubps %4, %4, %8, %10 ; w[0..8] even + fmsubaddps %5, %5, %8, %9 ; j[0..8] even + movaps %10, [mask_pmpmpmpm] +%else + mulps %4, %4, %8 ; m2,3[01]imre * cos0246 + mulps %5, %5, %8 ; m2,3[23]reim * cos0246 + addsubps %4, %4, %10 ; w[0..8] + movaps %10, [mask_pmpmpmpm] + xorps %9, %9, %10 ; +-m2,3[23]imre * wim7531 + addps %5, %5, %9 ; j[0..8] +%endif + + addps %8, %4, %5 ; t10235476 + subps %9, %4, %5 ; +-r[0..7] + + shufps %8, %8, %8, q2301 ; t[0..7] + xorps %9, %9, %10 ; r[0..7] + + subps %4, %2, %8 ; %3,3[01] + subps %5, %3, %9 ; %3,3[23] + + addps %2, %2, %8 ; m0 + addps %3, %3, %9 ; m1 +%endmacro + +; Same as above, tries REALLY hard to use 2 temporary registers. +%macro SPLIT_RADIX_COMBINE_LITE 9 +%if %1 + shufps %8, %6, %6, q2200 ; cos00224466 + shufps %9, %7, %7, q1133 ; wim77553311 +%else + shufps %8, %6, %6, q3311 ; cos11335577 + shufps %9, %7, %7, q0022 ; wim66442200 +%endif + + mulps %9, %9, %4 ; m2,3[01]reim * wim7531 even + shufps %4, %4, %4, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even + +%if cpuflag(fma3) + fmaddsubps %4, %4, %8, %9 ; w[0..8] even +%else + mulps %4, %4, %8 ; m2,3[01]imre * cos0246 + addsubps %4, %4, %9 ; w[0..8] +%endif + +%if %1 + shufps %9, %7, %7, q1133 ; wim77553311 +%else + shufps %9, %7, %7, q0022 ; wim66442200 +%endif + + mulps %9, %9, %5 ; m2,3[23]reim * wim7531 even + shufps %5, %5, %5, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even +%if cpuflag (fma3) + fmsubaddps %5, %5, %8, %9 ; j[0..8] even +%else + mulps %5, %5, %8 ; m2,3[23]reim * cos0246 + xorps %9, %9, [mask_pmpmpmpm] ; +-m2,3[23]imre * wim7531 + addps %5, %5, %9 ; j[0..8] +%endif + + addps %8, %4, %5 ; t10235476 + subps %9, %4, %5 ; +-r[0..7] + + shufps %8, %8, %8, q2301 ; t[0..7] + xorps %9, %9, [mask_pmpmpmpm] ; r[0..7] + + subps %4, %2, %8 ; %3,3[01] + subps %5, %3, %9 ; %3,3[23] + + addps %2, %2, %8 ; m0 + addps %3, %3, %9 ; m1 +%endmacro + +%macro SPLIT_RADIX_COMBINE_64 0 + SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2 + + movaps [outq + 0*mmsize], m0 + movaps [outq + 4*mmsize], m1 + movaps [outq + 8*mmsize], tx1_e0 + movaps [outq + 12*mmsize], tx2_e0 + + SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, m0 + + movaps [outq + 2*mmsize], m2 + movaps [outq + 6*mmsize], m3 + movaps [outq + 10*mmsize], tx1_o0 + movaps [outq + 14*mmsize], tx2_o0 + + movaps tw_e, [tab_64_float + mmsize] + vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23 + + movaps m0, [outq + 1*mmsize] + movaps m1, [outq + 3*mmsize] + movaps m2, [outq + 5*mmsize] + movaps m3, [outq + 7*mmsize] + + SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \ + tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers + + movaps [outq + 1*mmsize], m0 + movaps [outq + 3*mmsize], m1 + movaps [outq + 5*mmsize], m2 + movaps [outq + 7*mmsize], m3 + + movaps [outq + 9*mmsize], tx1_e1 + movaps [outq + 11*mmsize], tx1_o1 + movaps [outq + 13*mmsize], tx2_e1 + movaps [outq + 15*mmsize], tx2_o1 +%endmacro 
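+
+; The SPLIT_RADIX_COMBINE* macros above are vectorized forms of the textbook
+; split-radix recombination: an N-point output is assembled from the N/2-point
+; transform of the even samples and the two N/4-point transforms of the 4n+1
+; and 4n+3 samples, using the cos/wim twiddles loaded from the ff_tab_*_float
+; tables. As a scalar reference (our own naming and data layout; the macros
+; permute and interleave the outputs differently, see `doc/transforms.md`),
+; the forward-transform recurrence is:
+;
+;   #include <complex.h>
+;   #include <stddef.h>
+;
+;   /* E:  N/2-point DFT of x[2m]
+;    * O1: N/4-point DFT of x[4m+1],  O3: N/4-point DFT of x[4m+3]
+;    * w:  twiddles w[k] = cexpf(-2*pi*I*k/N), at least 3*N/4 of them */
+;   static void split_radix_combine(complex float *X, const complex float *E,
+;                                   const complex float *O1, const complex float *O3,
+;                                   const complex float *w, size_t N)
+;   {
+;       for (size_t k = 0; k < N / 4; k++) {
+;           complex float t1 = w[k]     * O1[k];
+;           complex float t2 = w[3 * k] * O3[k];
+;           X[k]             = E[k]         + (t1 + t2);
+;           X[k + N / 2]     = E[k]         - (t1 + t2);
+;           X[k + N / 4]     = E[k + N / 4] - I * (t1 - t2);
+;           X[k + 3 * N / 4] = E[k + N / 4] + I * (t1 - t2);
+;       }
+;   }
+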
+ +; Perform a single even/odd split radix combination with loads and stores +; The _4 indicates this is a quarter of the iterations required to complete a full +; combine loop +; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6 +%macro SPLIT_RADIX_LOAD_COMBINE_4 8 + movaps m8, [rtabq + (%5)*mmsize + %7] + vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23 + + movaps m0, [outq + (0 + %4)*mmsize + %6] + movaps m2, [outq + (2 + %4)*mmsize + %6] + movaps m1, [outq + %1 + (0 + %4)*mmsize + %6] + movaps m3, [outq + %1 + (2 + %4)*mmsize + %6] + + movaps m4, [outq + %2 + (0 + %4)*mmsize + %6] + movaps m6, [outq + %2 + (2 + %4)*mmsize + %6] + movaps m5, [outq + %3 + (0 + %4)*mmsize + %6] + movaps m7, [outq + %3 + (2 + %4)*mmsize + %6] + + SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \ + m4, m5, m6, m7, \ + m8, m9, \ + m10, m11, m12, m13, m14, m15 + + movaps [outq + (0 + %4)*mmsize + %6], m0 + movaps [outq + (2 + %4)*mmsize + %6], m2 + movaps [outq + %1 + (0 + %4)*mmsize + %6], m1 + movaps [outq + %1 + (2 + %4)*mmsize + %6], m3 + + movaps [outq + %2 + (0 + %4)*mmsize + %6], m4 + movaps [outq + %2 + (2 + %4)*mmsize + %6], m6 + movaps [outq + %3 + (0 + %4)*mmsize + %6], m5 + movaps [outq + %3 + (2 + %4)*mmsize + %6], m7 +%endmacro + +%macro SPLIT_RADIX_LOAD_COMBINE_FULL 2-5 +%if %0 > 2 +%define offset_c %3 +%else +%define offset_c 0 +%endif +%if %0 > 3 +%define offset_r %4 +%else +%define offset_r 0 +%endif +%if %0 > 4 +%define offset_i %5 +%else +%define offset_i 0 +%endif + + SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i + SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i + SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 4, 2, offset_c, offset_r, offset_i + SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 5, 3, offset_c, offset_r, offset_i +%endmacro + +; Perform a single even/odd split radix combination with loads, deinterleaves and +; stores. 
The _2 indicates this is a half of the iterations required to complete +; a full combine+deinterleave loop +; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6 +%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6 + movaps m8, [rtabq + (0 + %2)*mmsize] + vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23 + + movaps m0, [outq + (0 + 0 + %1)*mmsize + %6] + movaps m2, [outq + (2 + 0 + %1)*mmsize + %6] + movaps m1, [outq + %3 + (0 + 0 + %1)*mmsize + %6] + movaps m3, [outq + %3 + (2 + 0 + %1)*mmsize + %6] + + movaps m4, [outq + %4 + (0 + 0 + %1)*mmsize + %6] + movaps m6, [outq + %4 + (2 + 0 + %1)*mmsize + %6] + movaps m5, [outq + %5 + (0 + 0 + %1)*mmsize + %6] + movaps m7, [outq + %5 + (2 + 0 + %1)*mmsize + %6] + + SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \ + m4, m5, m6, m7, \ + m8, m9, \ + m10, m11, m12, m13, m14, m15 + + unpckhpd m10, m0, m2 + unpckhpd m11, m1, m3 + unpckhpd m12, m4, m6 + unpckhpd m13, m5, m7 + unpcklpd m0, m0, m2 + unpcklpd m1, m1, m3 + unpcklpd m4, m4, m6 + unpcklpd m5, m5, m7 + + vextractf128 [outq + (0 + 0 + %1)*mmsize + %6 + 0], m0, 0 + vextractf128 [outq + (0 + 0 + %1)*mmsize + %6 + 16], m10, 0 + vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 0], m1, 0 + vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 16], m11, 0 + + vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 0], m4, 0 + vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 16], m12, 0 + vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 0], m5, 0 + vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 16], m13, 0 + + vperm2f128 m10, m10, m0, 0x13 + vperm2f128 m11, m11, m1, 0x13 + vperm2f128 m12, m12, m4, 0x13 + vperm2f128 m13, m13, m5, 0x13 + + movaps m8, [rtabq + (1 + %2)*mmsize] + vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23 + + movaps m0, [outq + (0 + 1 + %1)*mmsize + %6] + movaps m2, [outq + (2 + 1 + %1)*mmsize + %6] + movaps m1, [outq + %3 + (0 + 1 + %1)*mmsize + %6] + movaps m3, [outq + %3 + (2 + 1 + %1)*mmsize + %6] + + movaps [outq + (0 + 1 + %1)*mmsize + %6], m10 ; m0 conflict + movaps [outq + %3 + (0 + 1 + %1)*mmsize + %6], m11 ; m1 conflict + + movaps m4, [outq + %4 + (0 + 1 + %1)*mmsize + %6] + movaps m6, [outq + %4 + (2 + 1 + %1)*mmsize + %6] + movaps m5, [outq + %5 + (0 + 1 + %1)*mmsize + %6] + movaps m7, [outq + %5 + (2 + 1 + %1)*mmsize + %6] + + movaps [outq + %4 + (0 + 1 + %1)*mmsize + %6], m12 ; m4 conflict + movaps [outq + %5 + (0 + 1 + %1)*mmsize + %6], m13 ; m5 conflict + + SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \ + m4, m5, m6, m7, \ + m8, m9, \ + m10, m11, m12, m13, m14, m15 ; temporary registers + + unpcklpd m8, m0, m2 + unpcklpd m9, m1, m3 + unpcklpd m10, m4, m6 + unpcklpd m11, m5, m7 + unpckhpd m0, m0, m2 + unpckhpd m1, m1, m3 + unpckhpd m4, m4, m6 + unpckhpd m5, m5, m7 + + vextractf128 [outq + (2 + 0 + %1)*mmsize + %6 + 0], m8, 0 + vextractf128 [outq + (2 + 0 + %1)*mmsize + %6 + 16], m0, 0 + vextractf128 [outq + (2 + 1 + %1)*mmsize + %6 + 0], m8, 1 + vextractf128 [outq + (2 + 1 + %1)*mmsize + %6 + 16], m0, 1 + + vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 0], m9, 0 + vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 16], m1, 0 + vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 0], m9, 1 + vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 16], m1, 1 + + vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 0], m10, 0 + vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 16], m4, 0 + vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 0], m10, 1 + vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 16], m4, 1 + + vextractf128 [outq + %5 + (2 
+ 0 + %1)*mmsize + %6 + 0], m11, 0 + vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 16], m5, 0 + vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 0], m11, 1 + vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 16], m5, 1 +%endmacro + +%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL 2-3 +%if %0 > 2 +%define offset %3 +%else +%define offset 0 +%endif + SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset + SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset +%endmacro + +INIT_XMM sse3 +cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride + movaps m0, [inq] + FFT2 m0, m1 + movaps [outq], m0 + ret + +cglobal fft2_float, 4, 4, 2, ctx, out, in, stride + movaps m0, [inq] + FFT2 m0, m1 + movaps [outq], m0 + RET + +%macro FFT4_FN 3 +INIT_XMM sse2 +%if %3 +cglobal fft4_ %+ %1 %+ _asm_float, 0, 0, 0, ctx, out, in, stride +%else +cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride +%endif + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + +%if %2 + shufps m2, m1, m0, q3210 + shufps m0, m0, m1, q3210 + movaps m1, m2 +%endif + + FFT4 m0, m1, m2 + + unpcklpd m2, m0, m1 + unpckhpd m0, m0, m1 + + movaps [outq + 0*mmsize], m2 + movaps [outq + 1*mmsize], m0 + +%if %3 + ret +%else + RET +%endif +%endmacro + +FFT4_FN fwd, 0, 0 +FFT4_FN fwd, 0, 1 +FFT4_FN inv, 1, 0 +FFT4_FN inv, 1, 1 + +%macro FFT8_SSE_FN 1 +INIT_XMM sse3 +%if %1 +cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + movaps m2, [inq + 2*mmsize] + movaps m3, [inq + 3*mmsize] +%else +cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp + mov ctxq, [ctxq + AVTXContext.map] + LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq + LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq + LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq + LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq +%endif + + FFT8 m0, m1, m2, m3, m4, m5 + + unpcklpd m4, m0, m3 + unpcklpd m5, m1, m2 + unpckhpd m0, m0, m3 + unpckhpd m1, m1, m2 + + movups [outq + 0*mmsize], m4 + movups [outq + 1*mmsize], m0 + movups [outq + 2*mmsize], m5 + movups [outq + 3*mmsize], m1 + +%if %1 + ret +%else + RET +%endif + +%if %1 +cglobal fft8_ns_float, 4, 5, 6, ctx, out, in, stride, tmp + call mangle(ff_tx_fft8_asm_float_sse3) + RET +%endif +%endmacro + +FFT8_SSE_FN 0 +FFT8_SSE_FN 1 + +%macro FFT8_AVX_FN 1 +INIT_YMM avx +%if %1 +cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] +%else +cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp + mov ctxq, [ctxq + AVTXContext.map] + LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2 + LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3 +%endif + + FFT8_AVX m0, m1, m2, m3 + + unpcklpd m2, m0, m1 + unpckhpd m0, m0, m1 + + ; Around 2% faster than 2x vperm2f128 + 2x movapd + vextractf128 [outq + 16*0], m2, 0 + vextractf128 [outq + 16*1], m0, 0 + vextractf128 [outq + 16*2], m2, 1 + vextractf128 [outq + 16*3], m0, 1 + +%if %1 + ret +%else + RET +%endif + +%if %1 +cglobal fft8_ns_float, 4, 5, 4, ctx, out, in, stride, tmp + call mangle(ff_tx_fft8_asm_float_avx) + RET +%endif +%endmacro + +FFT8_AVX_FN 0 +FFT8_AVX_FN 1 + +%macro FFT16_FN 2 +INIT_YMM %1 +%if %2 +cglobal fft16_asm_float, 0, 0, 0, ctx, out, in, stride, tmp + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + movaps m2, [inq + 2*mmsize] + movaps m3, [inq + 3*mmsize] +%else +cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp + mov ctxq, [ctxq + AVTXContext.map] + LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4 + LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, 
tmpq, m5 + LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6 + LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7 +%endif + + FFT16 m0, m1, m2, m3, m4, m5, m6, m7 + + unpcklpd m5, m1, m3 + unpcklpd m4, m0, m2 + unpckhpd m1, m1, m3 + unpckhpd m0, m0, m2 + + vextractf128 [outq + 16*0], m4, 0 + vextractf128 [outq + 16*1], m0, 0 + vextractf128 [outq + 16*2], m4, 1 + vextractf128 [outq + 16*3], m0, 1 + vextractf128 [outq + 16*4], m5, 0 + vextractf128 [outq + 16*5], m1, 0 + vextractf128 [outq + 16*6], m5, 1 + vextractf128 [outq + 16*7], m1, 1 + +%if %2 + ret +%else + RET +%endif + +%if %2 +cglobal fft16_ns_float, 4, 5, 8, ctx, out, in, stride, tmp + call mangle(ff_tx_fft16_asm_float_ %+ %1) + RET +%endif +%endmacro + +FFT16_FN avx, 0 +FFT16_FN avx, 1 +FFT16_FN fma3, 0 +FFT16_FN fma3, 1 + +%macro FFT32_FN 2 +INIT_YMM %1 +%if %2 +cglobal fft32_asm_float, 0, 0, 0, ctx, out, in, stride, tmp + movaps m4, [inq + 4*mmsize] + movaps m5, [inq + 5*mmsize] + movaps m6, [inq + 6*mmsize] + movaps m7, [inq + 7*mmsize] +%else +cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp + mov ctxq, [ctxq + AVTXContext.map] + LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m12 + LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m9, m13 + LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m10, m14 + LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m11, m15 +%endif + + FFT8 m4, m5, m6, m7, m8, m9 + +%if %2 + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + movaps m2, [inq + 2*mmsize] + movaps m3, [inq + 3*mmsize] +%else + LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8, m12 + LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m9, m13 + LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m10, m14 + LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m11, m15 +%endif + + movaps m8, [tab_32_float] + vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23 + + FFT16 m0, m1, m2, m3, m10, m11, m12, m13 + + SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \ + m10, m11, m12, m13, m14, m15 ; temporary registers + + unpcklpd m9, m1, m3 + unpcklpd m10, m5, m7 + unpcklpd m8, m0, m2 + unpcklpd m11, m4, m6 + unpckhpd m1, m1, m3 + unpckhpd m5, m5, m7 + unpckhpd m0, m0, m2 + unpckhpd m4, m4, m6 + + vextractf128 [outq + 16* 0], m8, 0 + vextractf128 [outq + 16* 1], m0, 0 + vextractf128 [outq + 16* 2], m8, 1 + vextractf128 [outq + 16* 3], m0, 1 + vextractf128 [outq + 16* 4], m9, 0 + vextractf128 [outq + 16* 5], m1, 0 + vextractf128 [outq + 16* 6], m9, 1 + vextractf128 [outq + 16* 7], m1, 1 + + vextractf128 [outq + 16* 8], m11, 0 + vextractf128 [outq + 16* 9], m4, 0 + vextractf128 [outq + 16*10], m11, 1 + vextractf128 [outq + 16*11], m4, 1 + vextractf128 [outq + 16*12], m10, 0 + vextractf128 [outq + 16*13], m5, 0 + vextractf128 [outq + 16*14], m10, 1 + vextractf128 [outq + 16*15], m5, 1 + +%if %2 + ret +%else + RET +%endif + +%if %2 +cglobal fft32_ns_float, 4, 5, 16, ctx, out, in, stride, tmp + call mangle(ff_tx_fft32_asm_float_ %+ %1) + RET +%endif +%endmacro + +%if ARCH_X86_64 +FFT32_FN avx, 0 +FFT32_FN avx, 1 +FFT32_FN fma3, 0 +FFT32_FN fma3, 1 +%endif + +%macro FFT_SPLIT_RADIX_DEF 1-2 +ALIGN 16 +.%1 %+ pt: + PUSH lenq + mov lenq, (%1/4) + + add outq, (%1*4) - (%1/1) + call .32pt + + add outq, (%1*2) - (%1/2) ; the synth loops also increment outq + call .32pt + + POP lenq + sub outq, (%1*4) + (%1*2) + (%1/2) + + lea rtabq, [tab_ %+ %1 %+ _float] + lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7] + +%if %0 > 1 + cmp tgtq, %1 + je .deinterleave + + mov tmpq, %1 + +.synth_ %+ %1: + SPLIT_RADIX_LOAD_COMBINE_FULL 2*%1, 6*%1, 0, 0, 0 + add outq, 8*mmsize + add 
rtabq, 4*mmsize + sub itabq, 4*mmsize + sub tmpq, 4*mmsize + jg .synth_ %+ %1 + + cmp lenq, %1 + jg %2 ; can't do math here, nasm doesn't get it + ret +%endif +%endmacro + +%macro FFT_SPLIT_RADIX_FN 2 +INIT_YMM %1 +%if %2 +cglobal fft_sr_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, itab, rtab, tgt, tmp +%else +cglobal fft_sr_float, 4, 10, 16, 272, ctx, out, in, stride, len, lut, itab, rtab, tgt, tmp + movsxd lenq, dword [ctxq + AVTXContext.len] + mov lutq, [ctxq + AVTXContext.map] +%endif + mov tgtq, lenq + +; Bottom-most/32-point transform =============================================== +ALIGN 16 +.32pt: +%if %2 + movaps m4, [inq + 4*mmsize] + movaps m5, [inq + 5*mmsize] + movaps m6, [inq + 6*mmsize] + movaps m7, [inq + 7*mmsize] +%else + LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8, m12 + LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m9, m13 + LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m10, m14 + LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m11, m15 +%endif + + FFT8 m4, m5, m6, m7, m8, m9 + +%if %2 + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + movaps m2, [inq + 2*mmsize] + movaps m3, [inq + 3*mmsize] +%else + LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8, m12 + LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m9, m13 + LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m10, m14 + LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m11, m15 +%endif + + movaps m8, [tab_32_float] + vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23 + + FFT16 m0, m1, m2, m3, m10, m11, m12, m13 + + SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \ + m10, m11, m12, m13, m14, m15 ; temporary registers + + movaps [outq + 1*mmsize], m1 + movaps [outq + 3*mmsize], m3 + movaps [outq + 5*mmsize], m5 + movaps [outq + 7*mmsize], m7 + +%if %2 + add inq, 8*mmsize +%else + add lutq, (mmsize/2)*8 +%endif + cmp lenq, 32 + jg .64pt + + movaps [outq + 0*mmsize], m0 + movaps [outq + 2*mmsize], m2 + movaps [outq + 4*mmsize], m4 + movaps [outq + 6*mmsize], m6 + + ret + +; 64-point transform =========================================================== +ALIGN 16 +.64pt: +; Helper defines, these make it easier to track what's happening +%define tx1_e0 m4 +%define tx1_e1 m5 +%define tx1_o0 m6 +%define tx1_o1 m7 +%define tx2_e0 m8 +%define tx2_e1 m9 +%define tx2_o0 m10 +%define tx2_o1 m11 +%define tw_e m12 +%define tw_o m13 +%define tmp1 m14 +%define tmp2 m15 + + SWAP m4, m1 + SWAP m6, m3 + +%if %2 + movaps tx1_e0, [inq + 0*mmsize] + movaps tx1_e1, [inq + 1*mmsize] + movaps tx1_o0, [inq + 2*mmsize] + movaps tx1_o1, [inq + 3*mmsize] +%else + LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tmp1 + LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tw_o, tmp2 + LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tmp1 + LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tw_o, tmp2 +%endif + + FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1 + +%if %2 + movaps tx2_e0, [inq + 4*mmsize] + movaps tx2_e1, [inq + 5*mmsize] + movaps tx2_o0, [inq + 6*mmsize] + movaps tx2_o1, [inq + 7*mmsize] +%else + LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tw_e, tmp1 + LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_o, tmp2 + LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tw_e, tmp1 + LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_o, tmp2 +%endif + + FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o + + movaps tw_e, [tab_64_float] + vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23 + +%if %2 + add inq, 8*mmsize +%else + add lutq, (mmsize/2)*8 +%endif + cmp tgtq, 
64 + je .64pt_deint + + SPLIT_RADIX_COMBINE_64 + + cmp lenq, 64 + jg .128pt + ret + +; 128-point transform ========================================================== +ALIGN 16 +.128pt: + PUSH lenq + mov lenq, 32 + + add outq, 16*mmsize + call .32pt + + add outq, 8*mmsize + call .32pt + + POP lenq + sub outq, 24*mmsize + + lea rtabq, [tab_128_float] + lea itabq, [tab_128_float + 128 - 4*7] + + cmp tgtq, 128 + je .deinterleave + + SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128 + + cmp lenq, 128 + jg .256pt + ret + +; 256-point transform ========================================================== +ALIGN 16 +.256pt: + PUSH lenq + mov lenq, 64 + + add outq, 32*mmsize + call .32pt + + add outq, 16*mmsize + call .32pt + + POP lenq + sub outq, 48*mmsize + + lea rtabq, [tab_256_float] + lea itabq, [tab_256_float + 256 - 4*7] + + cmp tgtq, 256 + je .deinterleave + + SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256 + SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256, 8*mmsize, 4*mmsize, -4*mmsize + + cmp lenq, 256 + jg .512pt + ret + +; 512-point transform ========================================================== +ALIGN 16 +.512pt: + PUSH lenq + mov lenq, 128 + + add outq, 64*mmsize + call .32pt + + add outq, 32*mmsize + call .32pt + + POP lenq + sub outq, 96*mmsize + + lea rtabq, [tab_512_float] + lea itabq, [tab_512_float + 512 - 4*7] + + cmp tgtq, 512 + je .deinterleave + + mov tmpq, 4 + +.synth_512: + SPLIT_RADIX_LOAD_COMBINE_FULL 2*512, 6*512 + add outq, 8*mmsize + add rtabq, 4*mmsize + sub itabq, 4*mmsize + sub tmpq, 1 + jg .synth_512 + + cmp lenq, 512 + jg .1024pt + ret + +; 1024-point transform ========================================================== +ALIGN 16 +.1024pt: + PUSH lenq + mov lenq, 256 + + add outq, 96*mmsize + call .32pt + + add outq, 64*mmsize + call .32pt + + POP lenq + sub outq, 192*mmsize + + lea rtabq, [tab_1024_float] + lea itabq, [tab_1024_float + 1024 - 4*7] + + cmp tgtq, 1024 + je .deinterleave + + mov tmpq, 8 + +.synth_1024: + SPLIT_RADIX_LOAD_COMBINE_FULL 2*1024, 6*1024 + add outq, 8*mmsize + add rtabq, 4*mmsize + sub itabq, 4*mmsize + sub tmpq, 1 + jg .synth_1024 + + cmp lenq, 1024 + jg .2048pt + ret + +; 2048 to 131072-point transforms ============================================== +FFT_SPLIT_RADIX_DEF 2048, .4096pt +FFT_SPLIT_RADIX_DEF 4096, .8192pt +FFT_SPLIT_RADIX_DEF 8192, .16384pt +FFT_SPLIT_RADIX_DEF 16384, .32768pt +FFT_SPLIT_RADIX_DEF 32768, .65536pt +FFT_SPLIT_RADIX_DEF 65536, .131072pt +FFT_SPLIT_RADIX_DEF 131072 + +;=============================================================================== +; Final synthesis + deinterleaving code +;=============================================================================== +.deinterleave: +%if %2 + PUSH strideq +%endif + mov tgtq, lenq + imul tmpq, lenq, 2 + lea strideq, [4*lenq + tmpq] + +.synth_deinterleave: + SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL tmpq, strideq + add outq, 8*mmsize + add rtabq, 4*mmsize + sub itabq, 4*mmsize + sub tgtq, 4*mmsize + jg .synth_deinterleave + +%if %2 + POP strideq + sub outq, tmpq + neg tmpq + lea inq, [inq + tmpq*4] + ret +%else + RET +%endif + +; 64-point deinterleave which only has to load 4 registers ===================== +.64pt_deint: + SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2 + SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, tw_e + + unpcklpd tmp1, m0, m2 + unpcklpd tmp2, m1, m3 + unpcklpd tw_o, tx1_e0, tx1_o0 + unpcklpd tw_e, tx2_e0, tx2_o0 + unpckhpd m0, m0, m2 + unpckhpd m1, m1, m3 + unpckhpd tx1_e0, tx1_e0, tx1_o0 + unpckhpd 
tx2_e0, tx2_e0, tx2_o0 + + vextractf128 [outq + 0*mmsize + 0], tmp1, 0 + vextractf128 [outq + 0*mmsize + 16], m0, 0 + vextractf128 [outq + 4*mmsize + 0], tmp2, 0 + vextractf128 [outq + 4*mmsize + 16], m1, 0 + + vextractf128 [outq + 8*mmsize + 0], tw_o, 0 + vextractf128 [outq + 8*mmsize + 16], tx1_e0, 0 + vextractf128 [outq + 9*mmsize + 0], tw_o, 1 + vextractf128 [outq + 9*mmsize + 16], tx1_e0, 1 + + vperm2f128 tmp1, tmp1, m0, 0x31 + vperm2f128 tmp2, tmp2, m1, 0x31 + + vextractf128 [outq + 12*mmsize + 0], tw_e, 0 + vextractf128 [outq + 12*mmsize + 16], tx2_e0, 0 + vextractf128 [outq + 13*mmsize + 0], tw_e, 1 + vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1 + + movaps tw_e, [tab_64_float + mmsize] + vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23 + + movaps m0, [outq + 1*mmsize] + movaps m1, [outq + 3*mmsize] + movaps m2, [outq + 5*mmsize] + movaps m3, [outq + 7*mmsize] + + movaps [outq + 1*mmsize], tmp1 + movaps [outq + 5*mmsize], tmp2 + + SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \ + tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers + + unpcklpd tmp1, m0, m1 + unpcklpd tmp2, m2, m3 + unpcklpd tw_e, tx1_e1, tx1_o1 + unpcklpd tw_o, tx2_e1, tx2_o1 + unpckhpd m0, m0, m1 + unpckhpd m2, m2, m3 + unpckhpd tx1_e1, tx1_e1, tx1_o1 + unpckhpd tx2_e1, tx2_e1, tx2_o1 + + vextractf128 [outq + 2*mmsize + 0], tmp1, 0 + vextractf128 [outq + 2*mmsize + 16], m0, 0 + vextractf128 [outq + 3*mmsize + 0], tmp1, 1 + vextractf128 [outq + 3*mmsize + 16], m0, 1 + + vextractf128 [outq + 6*mmsize + 0], tmp2, 0 + vextractf128 [outq + 6*mmsize + 16], m2, 0 + vextractf128 [outq + 7*mmsize + 0], tmp2, 1 + vextractf128 [outq + 7*mmsize + 16], m2, 1 + + vextractf128 [outq + 10*mmsize + 0], tw_e, 0 + vextractf128 [outq + 10*mmsize + 16], tx1_e1, 0 + vextractf128 [outq + 11*mmsize + 0], tw_e, 1 + vextractf128 [outq + 11*mmsize + 16], tx1_e1, 1 + + vextractf128 [outq + 14*mmsize + 0], tw_o, 0 + vextractf128 [outq + 14*mmsize + 16], tx2_e1, 0 + vextractf128 [outq + 15*mmsize + 0], tw_o, 1 + vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1 + +%if %2 + sub inq, 16*mmsize + ret +%else + RET +%endif + +%if %2 +cglobal fft_sr_ns_float, 4, 10, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab, tgt, off + movsxd lenq, dword [ctxq + AVTXContext.len] + mov lutq, [ctxq + AVTXContext.map] + + call mangle(ff_tx_fft_sr_asm_float_ %+ %1) + RET +%endif +%endmacro + +%if ARCH_X86_64 +FFT_SPLIT_RADIX_FN avx, 0 +FFT_SPLIT_RADIX_FN avx, 1 +FFT_SPLIT_RADIX_FN fma3, 0 +FFT_SPLIT_RADIX_FN fma3, 1 +%if HAVE_AVX2_EXTERNAL +FFT_SPLIT_RADIX_FN avx2, 0 +FFT_SPLIT_RADIX_FN avx2, 1 +%endif +%endif + +%macro FFT15_FN 2 +INIT_YMM avx2 +cglobal fft15_ %+ %2, 4, 10, 16, ctx, out, in, stride, len, lut, tmp, tgt5, stride3, stride5 + mov lutq, [ctxq + AVTXContext.map] + + imul stride3q, strideq, 3 + imul stride5q, strideq, 5 + + movaps m11, [mask_mmppmmmm] ; mmppmmmm + movaps m10, [tab_53_float] ; tab5 + movaps xm9, [tab_53_float + 32] ; tab3 + vpermpd m9, m9, q1110 ; tab[23232323] + movaps m8, [s15_perm] + +%if %1 + movups xm0, [inq] + movddup xm5, [inq + 16] + movups m2, [inq + mmsize*0 + 24] + movups m3, [inq + mmsize*1 + 24] + movups m4, [inq + mmsize*2 + 24] +%else + LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15 + LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7 + LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15 + LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7 + mov tmpd, [lutq + 8] + movddup xm5, [inq + tmpq*8] +%endif + + FFT15 + + lea tgt5q, [outq + 
stride5q] + lea tmpq, [outq + stride5q*2] + + movhps [outq], xm14 ; out[0] + movhps [outq + stride5q*1], xm15 ; out[5] + movlps [outq + stride5q*2], xm15 ; out[10] + + vextractf128 xm3, m0, 1 + vextractf128 xm4, m1, 1 + vextractf128 xm5, m2, 1 + + movlps [outq + strideq*1], xm1 + movhps [outq + strideq*2], xm2 + movlps [outq + stride3q*1], xm3 + movhps [outq + strideq*4], xm4 + movlps [outq + stride3q*2], xm0 + movlps [outq + strideq*8], xm5 + movhps [outq + stride3q*4], xm0 + movhps [tgt5q + strideq*2], xm1 + movhps [tgt5q + strideq*4], xm3 + movlps [tmpq + strideq*1], xm2 + movlps [tmpq + stride3q*1], xm4 + movhps [tmpq + strideq*4], xm5 + + RET +%endmacro + +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +FFT15_FN 0, float +FFT15_FN 1, ns_float +%endif + +%macro IMDCT_FN 1 +INIT_YMM %1 +cglobal mdct_inv_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, exp, t1, t2, t3, \ + t4, t5, btmp + movsxd lenq, dword [ctxq + AVTXContext.len] + mov expq, [ctxq + AVTXContext.exp] + + lea t1d, [lend - 1] + imul t1d, strided + + mov btmpq, ctxq ; backup original context + mov lutq, [ctxq + AVTXContext.map] ; load map + + cmp strideq, 4 + je .stride4 + + shl strideq, 1 + movd xm4, strided + vpbroadcastd m4, xm4 ; stride splatted + movd xm5, t1d + vpbroadcastd m5, xm5 ; offset splatted + + mov t2q, outq ; don't modify the original output + pcmpeqd m15, m15 ; set all bits to 1 + +.stridex_pre: + pmulld m2, m4, [lutq] ; multiply by stride + movaps m0, m15 + psubd m3, m5, m2 ; subtract from offset + movaps m1, m15 + vgatherdps m6, [inq + m2], m0 ; im + vgatherdps m7, [inq + m3], m1 ; re + + movaps m8, [expq + 0*mmsize] ; tab 1 + movaps m9, [expq + 1*mmsize] ; tab 2 + + unpcklps m0, m7, m6 ; re, im, re, im + unpckhps m1, m7, m6 ; re, im, re, im + + vperm2f128 m2, m1, m0, 0x02 ; output order + vperm2f128 m3, m1, m0, 0x13 ; output order + + movshdup m10, m8 ; tab 1 imim + movshdup m11, m9 ; tab 2 imim + movsldup m12, m8 ; tab 1 rere + movsldup m13, m9 ; tab 2 rere + + mulps m10, m2 ; 1 reim * imim + mulps m11, m3 ; 2 reim * imim + + shufps m10, m10, m10, q2301 + shufps m11, m11, m11, q2301 + + fmaddsubps m10, m12, m2, m10 + fmaddsubps m11, m13, m3, m11 + + movups [t2q + 0*mmsize], m10 + movups [t2q + 1*mmsize], m11 + + add expq, mmsize*2 + add lutq, mmsize + add t2q, mmsize*2 + sub lenq, mmsize/2 + jg .stridex_pre + jmp .transform + +.stride4: + lea expq, [expq + lenq*4] + lea lutq, [lutq + lenq*2] + lea t1q, [inq + t1q] + lea t1q, [t1q + strideq - mmsize] + lea t2q, [lenq*2 - mmsize/2] + +.stride4_pre: + movups m4, [inq] + movups m3, [t1q] + + movsldup m1, m4 ; im im, im im + movshdup m0, m3 ; re re, re re + movshdup m4, m4 ; re re, re re (2) + movsldup m3, m3 ; im im, im im (2) + + movups m2, [expq] ; tab + movups m5, [expq + 2*t2q] ; tab (2) + + vpermpd m0, m0, q0123 ; flip + shufps m7, m2, m2, q2301 + vpermpd m4, m4, q0123 ; flip (2) + shufps m8, m5, m5, q2301 + + mulps m1, m7 ; im im * tab.reim + mulps m3, m8 ; im im * tab.reim (2) + + fmaddsubps m0, m0, m2, m1 + fmaddsubps m4, m4, m5, m3 + + vextractf128 xm3, m0, 1 + vextractf128 xm6, m4, 1 + + ; scatter + movsxd strideq, dword [lutq + 0*4] + movsxd lenq, dword [lutq + 1*4] + movsxd t3q, dword [lutq + 2*4] + movsxd t4q, dword [lutq + 3*4] + + movlps [outq + strideq*8], xm0 + movhps [outq + lenq*8], xm0 + movlps [outq + t3q*8], xm3 + movhps [outq + t4q*8], xm3 + + movsxd strideq, dword [lutq + 0*4 + t2q] + movsxd lenq, dword [lutq + 1*4 + t2q] + movsxd t3q, dword [lutq + 2*4 + t2q] + movsxd t4q, dword [lutq + 3*4 + t2q] + + movlps [outq + strideq*8], xm4 + 
movhps [outq + lenq*8], xm4 + movlps [outq + t3q*8], xm6 + movhps [outq + t4q*8], xm6 + + add lutq, mmsize/2 + add expq, mmsize + add inq, mmsize + sub t1q, mmsize + sub t2q, mmsize + jge .stride4_pre + +.transform: + mov strideq, 2*4 + mov t4q, ctxq ; backup original context + mov t5q, [ctxq + AVTXContext.fn] ; subtransform's jump point + mov ctxq, [ctxq + AVTXContext.sub] + mov lutq, [ctxq + AVTXContext.map] + movsxd lenq, dword [ctxq + AVTXContext.len] + + mov inq, outq ; in-place transform + call t5q ; call the FFT + + mov ctxq, t4q ; restore original context + movsxd lenq, dword [ctxq + AVTXContext.len] + mov expq, [ctxq + AVTXContext.exp] + lea expq, [expq + lenq*4] + + xor t1q, t1q ; low + lea t2q, [lenq*4 - mmsize] ; high + +.post: + movaps m2, [expq + t2q] ; tab h + movaps m3, [expq + t1q] ; tab l + movups m0, [outq + t2q] ; in h + movups m1, [outq + t1q] ; in l + + movshdup m4, m2 ; tab h imim + movshdup m5, m3 ; tab l imim + movsldup m6, m2 ; tab h rere + movsldup m7, m3 ; tab l rere + + shufps m2, m0, m0, q2301 ; in h imre + shufps m3, m1, m1, q2301 ; in l imre + + mulps m6, m0 + mulps m7, m1 + + fmaddsubps m4, m4, m2, m6 + fmaddsubps m5, m5, m3, m7 + + vpermpd m3, m5, q0123 ; flip + vpermpd m2, m4, q0123 ; flip + + blendps m1, m2, m5, 01010101b + blendps m0, m3, m4, 01010101b + + movups [outq + t2q], m0 + movups [outq + t1q], m1 + + add t1q, mmsize + sub t2q, mmsize + sub lenq, mmsize/2 + jg .post + + RET +%endmacro + +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +IMDCT_FN avx2 +%endif + +%macro PFA_15_FN 2 +INIT_YMM %1 +%if %2 +cglobal fft_pfa_15xM_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \ + tgt5, stride3, stride5, btmp +%else +cglobal fft_pfa_15xM_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \ + tgt5, stride3, stride5, btmp +%endif + +%if %2 + PUSH inq + PUSH tgt5q + PUSH stride3q + PUSH stride5q + PUSH btmpq +%endif + + PUSH strideq + + mov btmpq, outq + + mov outq, [ctxq + AVTXContext.tmp] +%if %2 == 0 + movsxd lenq, dword [ctxq + AVTXContext.len] + mov lutq, [ctxq + AVTXContext.map] +%endif + + ; Load stride (second transform's length) and second transform's LUT + mov tmpq, [ctxq + AVTXContext.sub] + movsxd strideq, dword [tmpq + AVTXContext.len] + mov mapq, [tmpq + AVTXContext.map] + + shl strideq, 3 + imul stride3q, strideq, 3 + imul stride5q, strideq, 5 + + movaps m11, [mask_mmppmmmm] ; mmppmmmm + movaps m10, [tab_53_float] ; tab5 + movaps xm9, [tab_53_float + 32] ; tab3 + vpermpd m9, m9, q1110 ; tab[23232323] + movaps m8, [s15_perm] + +.dim1: + mov tmpd, [mapq] + lea tgtq, [outq + tmpq*8] + +%if %2 + movups xm0, [inq] ; in[0,1].reim + movddup xm5, [inq + 16] ; in[2].reimreim + movups m2, [inq + mmsize*0 + 24] ; in[3-6].reim + movups m3, [inq + mmsize*1 + 24] ; in[7-11].reim + movups m4, [inq + mmsize*2 + 24] ; in[12-15].reim +%else + LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15 ; in[0,1].reim + LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7 + LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15 + LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7 + mov tmpd, [lutq + 8] + movddup xm5, [inq + tmpq*8] ; in[2].reimreim +%endif + + FFT15 + + lea tgt5q, [tgtq + stride5q] + lea tmpq, [tgtq + stride5q*2] + + movhps [tgtq], xm14 ; out[0] + movhps [tgtq + stride5q*1], xm15 ; out[5] + movlps [tgtq + stride5q*2], xm15 ; out[10] + + vextractf128 xm3, m0, 1 + vextractf128 xm4, m1, 1 + vextractf128 xm5, m2, 1 + + movlps [tgtq + strideq*1], xm1 + movhps [tgtq + strideq*2], xm2 + movlps [tgtq + 
stride3q*1], xm3 + movhps [tgtq + strideq*4], xm4 + movlps [tgtq + stride3q*2], xm0 + movlps [tgtq + strideq*8], xm5 + movhps [tgtq + stride3q*4], xm0 + movhps [tgt5q + strideq*2], xm1 + movhps [tgt5q + strideq*4], xm3 + movlps [tmpq + strideq*1], xm2 + movlps [tmpq + stride3q*1], xm4 + movhps [tmpq + strideq*4], xm5 + +%if %2 + add inq, mmsize*3 + 24 +%else + add lutq, (mmsize/2)*3 + 12 +%endif + add mapq, 4 + sub lenq, 15 + jg .dim1 + + ; Second transform setup + mov stride5q, ctxq ; backup original context + movsxd stride3q, dword [ctxq + AVTXContext.len] ; full length + mov tgt5q, [ctxq + AVTXContext.fn] ; subtransform's jump point + + mov inq, outq ; in-place transform + mov ctxq, [ctxq + AVTXContext.sub] ; load subtransform's context + mov lutq, [ctxq + AVTXContext.map] ; load subtransform's map + movsxd lenq, dword [ctxq + AVTXContext.len] ; load subtransform's length + +.dim2: + call tgt5q ; call the FFT + lea inq, [inq + lenq*8] + lea outq, [outq + lenq*8] + sub stride3q, lenq + jg .dim2 + + mov ctxq, stride5q ; restore original context + mov lutq, [ctxq + AVTXContext.map] + mov inq, [ctxq + AVTXContext.tmp] + movsxd lenq, dword [ctxq + AVTXContext.len] ; full length + + lea stride3q, [lutq + lenq*4] ; second part of the LUT + mov stride5q, lenq + mov tgt5q, btmpq + POP strideq + lea tmpq, [strideq + 2*strideq] + +.post: + LOAD64_LUT m0, inq, stride3q, 0, tmpq, m8, m9 + vextractf128 xm1, m0, 1 + movlps [tgt5q], xm0 + movhps [tgt5q + strideq], xm0 + movlps [tgt5q + strideq*2], xm1 + movhps [tgt5q + tmpq], xm1 + + lea tgt5q, [tgt5q + 4*strideq] + add stride3q, mmsize/2 + sub stride5q, mmsize/8 + jg .post + +%if %2 + mov outq, btmpq + POP btmpq + POP stride5q + POP stride3q + POP tgt5q + POP inq + ret +%else + RET +%endif + +%if %2 +cglobal fft_pfa_15xM_ns_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \ + tgt5, stride3, stride5, btmp + movsxd lenq, dword [ctxq + AVTXContext.len] + mov lutq, [ctxq + AVTXContext.map] + + call mangle(ff_tx_fft_pfa_15xM_asm_float) + RET +%endif +%endmacro + +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +PFA_15_FN avx2, 0 +PFA_15_FN avx2, 1 +%endif diff --git a/media/ffvpx/libavutil/x86/tx_float_init.c b/media/ffvpx/libavutil/x86/tx_float_init.c new file mode 100644 index 0000000000..d3c0beb50f --- /dev/null +++ b/media/ffvpx/libavutil/x86/tx_float_init.c @@ -0,0 +1,310 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define TX_FLOAT +#include "libavutil/tx_priv.h" +#include "libavutil/attributes.h" +#include "libavutil/x86/cpu.h" + +#include "config.h" + +TX_DECL_FN(fft2, sse3) +TX_DECL_FN(fft4_fwd, sse2) +TX_DECL_FN(fft4_inv, sse2) +TX_DECL_FN(fft8, sse3) +TX_DECL_FN(fft8_ns, sse3) +TX_DECL_FN(fft8, avx) +TX_DECL_FN(fft8_ns, avx) +TX_DECL_FN(fft15, avx2) +TX_DECL_FN(fft15_ns, avx2) +TX_DECL_FN(fft16, avx) +TX_DECL_FN(fft16_ns, avx) +TX_DECL_FN(fft16, fma3) +TX_DECL_FN(fft16_ns, fma3) +TX_DECL_FN(fft32, avx) +TX_DECL_FN(fft32_ns, avx) +TX_DECL_FN(fft32, fma3) +TX_DECL_FN(fft32_ns, fma3) +TX_DECL_FN(fft_sr, avx) +TX_DECL_FN(fft_sr_ns, avx) +TX_DECL_FN(fft_sr, fma3) +TX_DECL_FN(fft_sr_ns, fma3) +TX_DECL_FN(fft_sr, avx2) +TX_DECL_FN(fft_sr_ns, avx2) + +TX_DECL_FN(fft_pfa_15xM, avx2) +TX_DECL_FN(fft_pfa_15xM_ns, avx2) + +TX_DECL_FN(mdct_inv, avx2) + +TX_DECL_FN(fft2_asm, sse3) +TX_DECL_FN(fft4_fwd_asm, sse2) +TX_DECL_FN(fft4_inv_asm, sse2) +TX_DECL_FN(fft8_asm, sse3) +TX_DECL_FN(fft8_asm, avx) +TX_DECL_FN(fft16_asm, avx) +TX_DECL_FN(fft16_asm, fma3) +TX_DECL_FN(fft32_asm, avx) +TX_DECL_FN(fft32_asm, fma3) +TX_DECL_FN(fft_sr_asm, avx) +TX_DECL_FN(fft_sr_asm, fma3) +TX_DECL_FN(fft_sr_asm, avx2) + +TX_DECL_FN(fft_pfa_15xM_asm, avx2) + +#define DECL_INIT_FN(basis, interleave) \ +static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \ + const FFTXCodelet *cd, \ + uint64_t flags, \ + FFTXCodeletOptions *opts, \ + int len, int inv, \ + const void *scale) \ +{ \ + ff_tx_init_tabs_float(len); \ + if (cd->max_len == 2) \ + return ff_tx_gen_ptwo_revtab(s, opts); \ + else \ + return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, \ + basis, interleave); \ +} + +DECL_INIT_FN(8, 0) +DECL_INIT_FN(8, 2) + +static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd, + uint64_t flags, FFTXCodeletOptions *opts, + int len, int inv, const void *scale) +{ + int ret; + + /* The transformations below are performed in the gather domain, + * so override the option and let the infrastructure convert the map + * to SCATTER if needed. 
*/ + FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER }; + + TX_TAB(ff_tx_init_tabs)(len); + + if (len == 15) + ret = ff_tx_gen_pfa_input_map(s, &sub_opts, 3, 5); + else + ret = ff_tx_gen_default_map(s, &sub_opts); + + if (ret < 0) + return ret; + + if (len == 15) { + int cnt = 0, tmp[15]; + + /* Special permutation to simplify loads in the pre-permuted version */ + memcpy(tmp, s->map, 15*sizeof(*tmp)); + for (int i = 1; i < 15; i += 3) { + s->map[cnt] = tmp[i]; + cnt++; + } + for (int i = 2; i < 15; i += 3) { + s->map[cnt] = tmp[i]; + cnt++; + } + for (int i = 0; i < 15; i += 3) { + s->map[cnt] = tmp[i]; + cnt++; + } + memmove(&s->map[7], &s->map[6], 4*sizeof(int)); + memmove(&s->map[3], &s->map[1], 4*sizeof(int)); + s->map[1] = tmp[2]; + s->map[2] = tmp[0]; + } + + return 0; +} + +static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd, + uint64_t flags, FFTXCodeletOptions *opts, + int len, int inv, const void *scale) +{ + int ret; + FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER }; + + s->scale_d = *((SCALE_TYPE *)scale); + s->scale_f = s->scale_d; + + flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ + flags |= AV_TX_INPLACE; /* in-place */ + flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ + flags |= FF_TX_ASM_CALL; /* We want an assembly function, not C */ + + if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1, + inv, scale))) + return ret; + + s->map = av_malloc(len*sizeof(*s->map)); + if (!s->map) + return AVERROR(ENOMEM); + + memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map)); + /* Invert lookup table for unstrided path */ + for (int i = 0; i < (len >> 1); i++) + s->map[(len >> 1) + s->map[i]] = i; + + if ((ret = ff_tx_mdct_gen_exp_float(s, s->map))) + return ret; + + return 0; +} + +static av_cold int fft_pfa_init(AVTXContext *s, + const FFTXCodelet *cd, + uint64_t flags, + FFTXCodeletOptions *opts, + int len, int inv, + const void *scale) +{ + int ret; + int sub_len = len / cd->factors[0]; + FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER }; + + flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ + flags |= AV_TX_INPLACE; /* in-place */ + flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ + flags |= FF_TX_ASM_CALL; /* We want an assembly function, not C */ + + if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, + sub_len, inv, scale))) + return ret; + + if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len))) + return ret; + + if (cd->factors[0] == 15) { + int tmp[15]; + + /* Our 15-point transform is also a compound one, so embed its input map */ + TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5); + + /* Special permutation to simplify loads in the pre-permuted version */ + for (int k = 0; k < s->sub[0].len; k++) { + int cnt = 0; + memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp)); + for (int i = 1; i < 15; i += 3) { + s->map[k*15 + cnt] = tmp[i]; + cnt++; + } + for (int i = 2; i < 15; i += 3) { + s->map[k*15 + cnt] = tmp[i]; + cnt++; + } + for (int i = 0; i < 15; i += 3) { + s->map[k*15 + cnt] = tmp[i]; + cnt++; + } + memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int)); + memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int)); + s->map[k*15 + 1] = tmp[2]; + s->map[k*15 + 2] = tmp[0]; + } + } + + if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) + return AVERROR(ENOMEM); + + TX_TAB(ff_tx_init_tabs)(len / sub_len); + + return 0; +} + +const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { 
+ TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0), + TX_DEF(fft2_asm, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0), + TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0), + TX_DEF(fft4_fwd_asm, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0), + TX_DEF(fft4_inv_asm, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, + AV_TX_INPLACE | FF_TX_INVERSE_ONLY | FF_TX_ASM_CALL, 0), + TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0), + TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0), + TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0), + TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft8, FFT, 8, 8, 2, 0, 256, b8_i0, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, + AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft16, FFT, 16, 16, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, + AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft16, FFT, 16, 16, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, + AV_CPU_FLAG_AVXSLOW), + +#if ARCH_X86_64 + TX_DEF(fft32, FFT, 32, 32, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, + AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, + AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 256, b8_i2, avx, AVX, 0, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 320, b8_i2, avx, AVX, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, + AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, fma3, FMA3, 0, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | 
FF_TX_PRESHUFFLE, + AV_CPU_FLAG_AVXSLOW), + +#if HAVE_AVX2_EXTERNAL + TX_DEF(fft15, FFT, 15, 15, 15, 0, 320, factor_init, avx2, AVX2, + AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft15_ns, FFT, 15, 15, 15, 0, 384, factor_init, avx2, AVX2, + AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), + + TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 320, b8_i2, avx2, AVX2, 0, + AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), + TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), + TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, + AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), + + TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 320, fft_pfa_init, avx2, AVX2, + AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), + TX_DEF(fft_pfa_15xM_asm, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 384, fft_pfa_init, avx2, AVX2, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), + TX_DEF(fft_pfa_15xM_ns, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 384, fft_pfa_init, avx2, AVX2, + AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), + + TX_DEF(mdct_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 384, m_inv_init, avx2, AVX2, + FF_TX_INVERSE_ONLY, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), +#endif +#endif + + NULL, +}; diff --git a/media/ffvpx/libavutil/x86/x86inc.asm b/media/ffvpx/libavutil/x86/x86inc.asm new file mode 100644 index 0000000000..e099ee4b10 --- /dev/null +++ b/media/ffvpx/libavutil/x86/x86inc.asm @@ -0,0 +1,1726 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2018 x264 project +;* +;* Authors: Loren Merritt +;* Henrik Gramner +;* Anton Mitrofanov +;* Fiona Glaser +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. 
Send patches or ideas +; to x264-devel@videolan.org . + +%ifndef private_prefix + %define private_prefix x264 +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%if HAVE_ALIGNED_STACK + %define STACK_ALIGNMENT 16 +%endif +%ifndef STACK_ALIGNMENT + %if ARCH_X86_64 + %define STACK_ALIGNMENT 16 + %else + %define STACK_ALIGNMENT 4 + %endif +%endif + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%define FORMAT_ELF 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; aout does not support align= +; NOTE: This section is out of sync with x264, in order to +; keep supporting OS/2. +%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,aout + SECTION .text + %elifidn __OUTPUT_FORMAT__,coff + SECTION .text + %elifidn __OUTPUT_FORMAT__,win32 + SECTION .rdata align=%1 + %elif WIN64 + SECTION .rdata align=%1 + %else + SECTION .rodata align=%1 + %endif +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +%macro CPUNOP 1 + %if HAVE_CPUNOP + CPU %1 + %endif +%endmacro + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. The stack will be aligned before +; allocating the specified stack size. If the required stack alignment is +; larger than the known stack alignment the stack will be manually aligned +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,7,0x40, dst, src, tmp +; declares a function (foo) that automatically loads two arguments (dst and +; src) into registers, uses one additional register (tmp) plus 7 vector +; registers (m0-m6) and allocates 0x40 bytes of stack space. + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. 
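+
+; Illustrative sketch (not from upstream): assuming a preceding INIT_XMM sse2
+; so that m0 is xmm0 and mova/movu are the usual SIMD moves, the cglobal
+; example above could be completed as a minimal leaf function like this:
+;   cglobal foo, 2,3,7,0x40, dst, src, tmp
+;       movu m0, [srcq]   ; argument 1 was auto-loaded into the register srcq
+;       mova [dstq], m0   ; argument 0 likewise available as dstq
+;       RET               ; releases the 0x40 stack bytes, restores pushed regs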
+ +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %define %2q %2 + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 + %if ARCH_X86_64 == 0 + %define r%1 e%1 + %endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +; Repeats an instruction/operation for multiple arguments. 
+; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3" +%macro REPX 2-* ; operation, args + %xdefine %%f(x) %1 + %rep %0 - 1 + %rotate 1 + %%f(%1) + %endrep +%endmacro + +%macro PUSH 1 + push %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset+gprsize + %endif +%endmacro + +%macro POP 1 + pop %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assertion ``%1'' failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%define required_stack_alignment ((mmsize + 15) & ~15) +%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) +%define high_mm_regs (16*cpuflag(avx512)) + +%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%pad 0 + %assign stack_size %1 + %if stack_size < 0 + %assign stack_size -stack_size + %endif + %if WIN64 + %assign %%pad %%pad + 32 ; shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers + %endif + %endif + %endif + %if required_stack_alignment <= STACK_ALIGNMENT + ; maintain the current stack alignment + %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. 
mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + %if %1 < 0 ; need to store rsp on stack + %xdefine rstkm [rsp + stack_size + %%pad] + %assign %%pad %%pad + gprsize + %else ; can keep rsp in rstk during whole function + %xdefine rstkm rstk + %endif + %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) + mov rstk, rsp + and rsp, ~(required_stack_alignment-1) + sub rsp, stack_size_padded + movifnidn rstkm, rstk + %endif + WIN64_PUSH_XMM + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 1 + %ifnum %1 + %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT + %if %1 > 0 + ; Reserve an additional register for storing the original stack pointer, but avoid using + ; eax/rax for this purpose since it can potentially get overwritten as a return value. + %assign regs_used (regs_used + 1) + %if ARCH_X86_64 && regs_used == 7 + %assign regs_used 8 + %elif ARCH_X86_64 == 0 && regs_used == 1 + %assign regs_used 2 + %endif + %endif + %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 + ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) + ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. + %assign regs_used 5 + UNIX64 * 3 + %endif + %endif + %endif +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R14, 96 +DECLARE_REG 12, R15, 104 +DECLARE_REG 13, R12, 112 +DECLARE_REG 14, R13, 120 + +%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + %if %0 > 4 + %ifnum %4 + DEFINE_ARGS %5 + %else + DEFINE_ARGS %4, %5 + %endif + %elifnnum %4 + DEFINE_ARGS %4 + %endif +%endmacro + +%macro WIN64_PUSH_XMM 0 + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. + %if xmm_regs_used > 6 + high_mm_regs + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + high_mm_regs + movaps [rstk + stack_offset + 24], xmm7 + %endif + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + %assign %%i 8 + %rep %%xmm_regs_on_stack + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + high_mm_regs + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
+ %assign %%pad %%xmm_regs_on_stack*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 0 + %assign %%pad_size 0 + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + %assign %%i xmm_regs_used - high_mm_regs + %rep %%xmm_regs_on_stack + %assign %%i %%i-1 + movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] + %endrep + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %assign %%pad_size stack_size_padded + %endif + %endif + %if xmm_regs_used > 7 + high_mm_regs + movaps xmm7, [rsp + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + high_mm_regs + movaps xmm6, [rsp + stack_offset - %%pad_size + 8] + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 0 + WIN64_RESTORE_XMM_INTERNAL + %assign stack_offset (stack_offset-stack_size_padded) + %assign stack_size_padded 0 + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 + %if vzeroupper_required + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R14, 48 +DECLARE_REG 12, R15, 56 +DECLARE_REG 13, R12, 64 +DECLARE_REG 14, R13, 72 + +%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + %assign xmm_regs_used %3 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + %if %0 > 4 + %ifnum %4 + DEFINE_ARGS %5 + %else + DEFINE_ARGS %4, %5 + %endif + %elifnnum %4 + DEFINE_ARGS %4 + %endif +%endmacro + +%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 14, 13, 12, 11, 10, 9 + %if vzeroupper_required + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [rstk + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif + %if regs_used > 7 + %assign regs_used 7 + %endif + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 + PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + %if %0 > 4 + %ifnum %4 + DEFINE_ARGS %5 + %else + DEFINE_ARGS %4, %5 + %endif + %elifnnum %4 + DEFINE_ARGS %4 + %endif +%endmacro + +%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 6, 5, 4, 3 + %if vzeroupper_required + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 + %macro WIN64_SPILL_XMM 1 + %endmacro + %macro WIN64_RESTORE_XMM 0 + %endmacro + %macro WIN64_PUSH_XMM 0 + %endmacro +%endif + +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) +%macro REP_RET 0 + %if has_epilogue || cpuflag(ssse3) + RET + %else + rep ret + %endif + annotate_function_size +%endmacro + +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %if notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. + %endif + ret + annotate_function_size +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %if notcpuflag(ssse3) + %%branch_instr equ $ + %xdefine last_branch_adr %%branch_instr + %endif + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif + annotate_function_size +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
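+; As a concrete illustration (taken from the files added by this patch, not
+; from the upstream header): tx_float.asm declares "cglobal fft8_asm_float"
+; under INIT_XMM sse3, and its wrapper later calls that function as
+; mangle(ff_tx_fft8_asm_float_sse3) -- the declared name with the cpu SUFFIX
+; appended and the private_prefix mangling applied, as described above.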
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 1, %1 %+ SUFFIX, %2 +%endmacro +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + annotate_function_size + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix + %xdefine %%VISIBILITY hidden + %else + %xdefine %%FUNCTION_PREFIX public_prefix + %xdefine %%VISIBILITY + %endif + %ifndef cglobaled_%2 + %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %xdefine current_function_section __SECT__ + %if FORMAT_ELF + global %2:function %%VISIBILITY + %else + global %2 + %endif + align function_align + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper + %ifnidn %3, "" + PROLOGUE %3 + %endif +%endmacro + +; Create a global symbol from a local label with the correct name mangling and type +%macro cglobal_label 1 + %if FORMAT_ELF + global current_function %+ %1:function hidden + %else + global current_function %+ %1 + %endif + %1: +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %if FORMAT_ELF + global %1:data hidden + %else + global %1 + %endif + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. +%if FORMAT_ELF + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] +%endif + +; Tell debuggers how large the function was. +; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. +; This is invoked by RET and similar macros, and also cglobal does it for the previous function, +; but if the last function in a source file doesn't use any of the standard macros for its epilogue, +; then its size might be unspecified. 
+%macro annotate_function_size 0 + %ifdef __YASM_VER__ + %ifdef current_function + %if FORMAT_ELF + current_function_section + %%ecf equ $ + size current_function %%ecf - current_function + __SECT__ + %endif + %endif + %endif +%endmacro + +; cpuflags + +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 +%assign cpuflags_sse42 (1<<11)| cpuflags_sse4 +%assign cpuflags_aesni (1<<12)| cpuflags_sse42 +%assign cpuflags_avx (1<<13)| cpuflags_sse42 +%assign cpuflags_xop (1<<14)| cpuflags_avx +%assign cpuflags_fma4 (1<<15)| cpuflags_avx +%assign cpuflags_fma3 (1<<16)| cpuflags_avx +%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1 +%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2 +%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL +%assign cpuflags_avx512icl (1<<25)| cpuflags_avx512 + +%assign cpuflags_cache32 (1<<21) +%assign cpuflags_cache64 (1<<22) +%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<24) + +; Returns a boolean value expressing whether or not the specified cpuflag is enabled. +%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) +%define notcpuflag(x) (cpuflag(x) ^ 1) + +; Takes an arbitrary number of cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. +%macro INIT_CPUFLAGS 0-* + %xdefine SUFFIX + %undef cpuname + %assign cpuflags 0 + + %if %0 >= 1 + %rep %0 + %ifdef cpuname + %xdefine cpuname cpuname %+ _%1 + %else + %xdefine cpuname %1 + %endif + %assign cpuflags cpuflags | cpuflags_%1 + %rotate 1 + %endrep + %xdefine SUFFIX _ %+ cpuname + + %if cpuflag(avx) + %assign avx_enabled 1 + %endif + %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elif cpuflag(sse3) && notcpuflag(ssse3) + %define movu lddqu + %endif + %endif + + %if ARCH_X86_64 || cpuflag(sse2) + CPUNOP amdnop + %else + CPUNOP basicnop + %endif +%endmacro + +; Merge mmx, sse*, and avx* +; m# is a simd register of the currently selected size +; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# +; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# +; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# +; (All 4 remain in sync through SWAP.) 
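; Illustrative sketch (not part of the upstream file): how the m#/xm#/ym# merge
; is meant to be used. The same macro body assembles at whatever width the
; preceding INIT_* selected; SUM_BLOCK and bufq are hypothetical names.
;
;     %macro SUM_BLOCK 0
;         mova    m0, [bufq]          ; 16 bytes under INIT_XMM, 32 under INIT_YMM (bufq assumed aligned)
;         paddd   m0, [bufq+mmsize]
;     %endmacro
;
;     INIT_XMM sse2   ; here m0 == xmm0, and ym0 also resolves to xmm0
;     SUM_BLOCK
;     INIT_YMM avx2   ; here m0 == ymm0, while xm0 still names xmm0 (its low half)
;     SUM_BLOCK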
+ +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro DEFINE_MMREGS 1 ; mmtype + %assign %%prev_mmregs 0 + %ifdef num_mmregs + %assign %%prev_mmregs num_mmregs + %endif + + %assign num_mmregs 8 + %if ARCH_X86_64 && mmsize >= 16 + %assign num_mmregs 16 + %if cpuflag(avx512) || mmsize == 64 + %assign num_mmregs 32 + %endif + %endif + + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1 %+ %%i + CAT_XDEFINE nn%1, %%i, %%i + %assign %%i %%i+1 + %endrep + %if %%prev_mmregs > num_mmregs + %rep %%prev_mmregs - num_mmregs + CAT_UNDEF m, %%i + CAT_UNDEF nn %+ mmtype, %%i + %assign %%i %%i+1 + %endrep + %endif + %xdefine mmtype %1 +%endmacro + +; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper +%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg + %if ARCH_X86_64 && cpuflag(avx512) + %assign %%i %1 + %rep 16-%1 + %assign %%i_high %%i+16 + SWAP %%i, %%i_high + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro INIT_MMX 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + INIT_CPUFLAGS %1 + DEFINE_MMREGS mm +%endmacro + +%macro INIT_XMM 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS xmm + %if WIN64 + AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers + %endif +%endmacro + +%macro INIT_YMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS ymm + AVX512_MM_PERMUTATION +%endmacro + +%macro INIT_ZMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_ZMM %1 + %define mmsize 64 + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS zmm + AVX512_MM_PERMUTATION +%endmacro + +INIT_XMM + +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define mmzmm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define xmmzmm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 xmm%1 + %define ymmymm%1 ymm%1 + %define ymmzmm%1 ymm%1 + %define zmmmm%1 mm%1 + %define zmmxmm%1 xmm%1 + %define zmmymm%1 ymm%1 + %define zmmzmm%1 zmm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 + %define zm%1 zmm %+ m%1 +%endmacro + +%assign i 0 +%rep 32 + DECLARE_MMCAST i + %assign i i+1 +%endrep + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. 
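; Illustrative sketch (not part of the upstream file): SWAP exchanges register
; *names* at assembly time and emits no instructions. With INIT_XMM in effect
; (m0 == xmm0, m1 == xmm1):
;
;     punpcklwd m0, m1   ; assembles as: punpcklwd xmm0, xmm1
;     SWAP 0, 1          ; from here on, m0 -> xmm1 and m1 -> xmm0
;     punpcklwd m0, m1   ; assembles as: punpcklwd xmm1, xmm0
;
; A macro that ends in such a SWAP (or a PERMUTE over several registers) thus
; leaves its results under the expected m# names without spending any moves;
; SAVE_MM_PERMUTATION/LOAD_MM_PERMUTATION below carry the same renaming across
; function calls.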
+ +%macro PERMUTE 2-* ; takes a list of pairs to swap + %rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 + %endrep + %rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE nn, m%1, %1 + %rotate 2 + %endrep +%endmacro + +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) + %ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 + %else ; SWAP m0, m1, ... + SWAP_INTERNAL_NAME %1, %2 + %endif +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE nn, m%1, %1 + CAT_XDEFINE nn, m%2, %2 + %rotate 1 + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args nn %+ %1 + %rep %0-1 + %xdefine %%args %%args, nn %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. +%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE nn, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + %ifid %1 + call_internal %1 %+ SUFFIX, %1 + %else + call %1 + %endif +%endmacro +%macro call_internal 2 + %xdefine %%i %2 + %ifndef cglobaled_%2 + %ifdef cglobaled_%1 + %xdefine %%i %1 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 32 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + CAT_XDEFINE regnumofmm, i, i + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 + CAT_XDEFINE sizeofzmm, i, 64 + CAT_XDEFINE regnumofxmm, i, i + CAT_XDEFINE regnumofymm, i, i + CAT_XDEFINE regnumofzmm, i, i + %assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) +;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not +;%6+: operands +%macro RUN_AVX_INSTR 6-9+ + %ifnum sizeof%7 + %assign __sizeofreg sizeof%7 + %elifnum sizeof%6 + %assign __sizeofreg sizeof%6 + %else + %assign __sizeofreg mmsize + %endif + %assign __emulate_avx 0 + %if avx_enabled && __sizeofreg >= 16 + %xdefine __instr v%1 + %else + %xdefine __instr %1 + %if %0 >= 8+%4 + %assign __emulate_avx 1 + %endif + %endif + %ifnidn %2, fnord + %ifdef cpuname + %if notcpuflag(%2) + %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 + %error use of ``%1'' sse2 instruction in cpuname function: current_function + %endif + %endif + %endif + + %if __emulate_avx + %xdefine __src1 %7 + %xdefine __src2 %8 + %if %5 && %4 == 0 + %ifnidn %6, %7 + %ifidn %6, %8 + %xdefine __src1 %8 + %xdefine __src2 %7 + %elifnnum sizeof%8 + ; 3-operand AVX instructions with a memory arg can only have it in src2, + ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). + ; So, if the instruction is commutative with a memory arg, swap them. + %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %endif + %ifnidn %6, __src1 + %if %0 >= 9 + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 + %else + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 + %endif + %if __sizeofreg == 8 + MOVQ %6, __src1 + %elif %3 + MOVAPS %6, __src1 + %else + MOVDQA %6, __src1 + %endif + %endif + %if %0 >= 9 + %1 %6, __src2, %9 + %else + %1 %6, __src2 + %endif + %elif %0 >= 9 + __instr %6, %7, %8, %9 + %elif %0 == 8 + __instr %6, %7, %8 + %elif %0 == 7 + __instr %6, %7 + %else + __instr %6 + %endif +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) +;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 1-5 fnord, 0, 255, 0 + %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 + %ifidn %2, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 + %elifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +; Instructions with both VEX/EVEX and legacy encodings +; Non-destructive instructions are written without parameters +AVX_INSTR addpd, sse2, 1, 0, 1 +AVX_INSTR addps, sse, 1, 0, 1 +AVX_INSTR addsd, sse2, 1, 0, 0 +AVX_INSTR addss, sse, 1, 0, 0 +AVX_INSTR addsubpd, sse3, 1, 0, 0 +AVX_INSTR addsubps, sse3, 1, 0, 0 +AVX_INSTR aesdec, aesni, 0, 0, 0 +AVX_INSTR aesdeclast, aesni, 0, 0, 0 +AVX_INSTR aesenc, aesni, 0, 0, 0 +AVX_INSTR aesenclast, aesni, 0, 0, 0 +AVX_INSTR aesimc, aesni +AVX_INSTR aeskeygenassist, aesni +AVX_INSTR andnpd, sse2, 1, 0, 0 +AVX_INSTR andnps, sse, 1, 0, 0 +AVX_INSTR andpd, sse2, 1, 0, 1 +AVX_INSTR andps, sse, 1, 0, 1 +AVX_INSTR blendpd, sse4, 1, 1, 0 +AVX_INSTR blendps, sse4, 1, 1, 0 +AVX_INSTR blendvpd, sse4 ; can't be emulated +AVX_INSTR blendvps, sse4 ; can't be emulated +AVX_INSTR cmpeqpd, sse2, 1, 0, 1 +AVX_INSTR cmpeqps, sse, 1, 0, 1 +AVX_INSTR cmpeqsd, sse2, 1, 0, 0 +AVX_INSTR cmpeqss, sse, 1, 0, 0 +AVX_INSTR cmplepd, sse2, 1, 0, 0 +AVX_INSTR cmpleps, sse, 1, 0, 0 +AVX_INSTR cmplesd, sse2, 1, 0, 0 +AVX_INSTR cmpless, sse, 1, 0, 0 +AVX_INSTR cmpltpd, sse2, 1, 0, 0 +AVX_INSTR cmpltps, sse, 1, 0, 0 +AVX_INSTR cmpltsd, sse2, 1, 0, 0 +AVX_INSTR cmpltss, sse, 1, 0, 0 +AVX_INSTR cmpneqpd, sse2, 1, 0, 1 +AVX_INSTR cmpneqps, sse, 1, 0, 1 +AVX_INSTR cmpneqsd, sse2, 1, 0, 0 +AVX_INSTR cmpneqss, sse, 1, 0, 0 +AVX_INSTR cmpnlepd, sse2, 1, 0, 0 +AVX_INSTR cmpnleps, sse, 1, 0, 0 +AVX_INSTR cmpnlesd, sse2, 1, 0, 0 +AVX_INSTR cmpnless, sse, 1, 0, 0 +AVX_INSTR cmpnltpd, sse2, 1, 0, 0 +AVX_INSTR cmpnltps, sse, 1, 0, 0 +AVX_INSTR cmpnltsd, sse2, 1, 0, 0 +AVX_INSTR cmpnltss, sse, 1, 0, 0 +AVX_INSTR cmpordpd, sse2 1, 0, 1 +AVX_INSTR cmpordps, sse 1, 0, 1 +AVX_INSTR cmpordsd, sse2 1, 0, 0 +AVX_INSTR cmpordss, sse 1, 0, 0 +AVX_INSTR cmppd, sse2, 1, 1, 0 +AVX_INSTR cmpps, sse, 1, 1, 0 +AVX_INSTR cmpsd, sse2, 1, 1, 0 +AVX_INSTR cmpss, sse, 1, 1, 0 +AVX_INSTR cmpunordpd, sse2, 1, 0, 1 +AVX_INSTR cmpunordps, sse, 1, 0, 1 +AVX_INSTR cmpunordsd, sse2, 1, 0, 0 +AVX_INSTR cmpunordss, sse, 1, 0, 0 +AVX_INSTR comisd, sse2 +AVX_INSTR comiss, sse +AVX_INSTR cvtdq2pd, sse2 +AVX_INSTR cvtdq2ps, sse2 +AVX_INSTR cvtpd2dq, sse2 +AVX_INSTR cvtpd2ps, sse2 +AVX_INSTR cvtps2dq, sse2 +AVX_INSTR cvtps2pd, sse2 +AVX_INSTR cvtsd2si, sse2 +AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 +AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 +AVX_INSTR cvtsi2ss, sse, 1, 0, 0 +AVX_INSTR cvtss2sd, sse2, 1, 0, 0 +AVX_INSTR cvtss2si, sse +AVX_INSTR cvttpd2dq, sse2 +AVX_INSTR cvttps2dq, sse2 +AVX_INSTR cvttsd2si, sse2 +AVX_INSTR cvttss2si, sse +AVX_INSTR divpd, sse2, 1, 0, 0 +AVX_INSTR divps, sse, 1, 0, 0 +AVX_INSTR divsd, sse2, 1, 0, 0 +AVX_INSTR divss, sse, 1, 0, 0 +AVX_INSTR dppd, sse4, 1, 1, 0 +AVX_INSTR dpps, sse4, 1, 1, 0 +AVX_INSTR extractps, sse4 +AVX_INSTR haddpd, sse3, 1, 0, 0 +AVX_INSTR haddps, sse3, 1, 0, 0 +AVX_INSTR hsubpd, sse3, 1, 0, 0 +AVX_INSTR hsubps, sse3, 1, 0, 0 +AVX_INSTR insertps, sse4, 1, 1, 0 +AVX_INSTR lddqu, sse3 +AVX_INSTR ldmxcsr, sse +AVX_INSTR maskmovdqu, sse2 +AVX_INSTR maxpd, sse2, 1, 0, 1 +AVX_INSTR maxps, 
sse, 1, 0, 1 +AVX_INSTR maxsd, sse2, 1, 0, 0 +AVX_INSTR maxss, sse, 1, 0, 0 +AVX_INSTR minpd, sse2, 1, 0, 1 +AVX_INSTR minps, sse, 1, 0, 1 +AVX_INSTR minsd, sse2, 1, 0, 0 +AVX_INSTR minss, sse, 1, 0, 0 +AVX_INSTR movapd, sse2 +AVX_INSTR movaps, sse +AVX_INSTR movd, mmx +AVX_INSTR movddup, sse3 +AVX_INSTR movdqa, sse2 +AVX_INSTR movdqu, sse2 +AVX_INSTR movhlps, sse, 1, 0, 0 +AVX_INSTR movhpd, sse2, 1, 0, 0 +AVX_INSTR movhps, sse, 1, 0, 0 +AVX_INSTR movlhps, sse, 1, 0, 0 +AVX_INSTR movlpd, sse2, 1, 0, 0 +AVX_INSTR movlps, sse, 1, 0, 0 +AVX_INSTR movmskpd, sse2 +AVX_INSTR movmskps, sse +AVX_INSTR movntdq, sse2 +AVX_INSTR movntdqa, sse4 +AVX_INSTR movntpd, sse2 +AVX_INSTR movntps, sse +AVX_INSTR movq, mmx +AVX_INSTR movsd, sse2, 1, 0, 0 +AVX_INSTR movshdup, sse3 +AVX_INSTR movsldup, sse3 +AVX_INSTR movss, sse, 1, 0, 0 +AVX_INSTR movupd, sse2 +AVX_INSTR movups, sse +AVX_INSTR mpsadbw, sse4, 0, 1, 0 +AVX_INSTR mulpd, sse2, 1, 0, 1 +AVX_INSTR mulps, sse, 1, 0, 1 +AVX_INSTR mulsd, sse2, 1, 0, 0 +AVX_INSTR mulss, sse, 1, 0, 0 +AVX_INSTR orpd, sse2, 1, 0, 1 +AVX_INSTR orps, sse, 1, 0, 1 +AVX_INSTR pabsb, ssse3 +AVX_INSTR pabsd, ssse3 +AVX_INSTR pabsw, ssse3 +AVX_INSTR packsswb, mmx, 0, 0, 0 +AVX_INSTR packssdw, mmx, 0, 0, 0 +AVX_INSTR packuswb, mmx, 0, 0, 0 +AVX_INSTR packusdw, sse4, 0, 0, 0 +AVX_INSTR paddb, mmx, 0, 0, 1 +AVX_INSTR paddw, mmx, 0, 0, 1 +AVX_INSTR paddd, mmx, 0, 0, 1 +AVX_INSTR paddq, sse2, 0, 0, 1 +AVX_INSTR paddsb, mmx, 0, 0, 1 +AVX_INSTR paddsw, mmx, 0, 0, 1 +AVX_INSTR paddusb, mmx, 0, 0, 1 +AVX_INSTR paddusw, mmx, 0, 0, 1 +AVX_INSTR palignr, ssse3, 0, 1, 0 +AVX_INSTR pand, mmx, 0, 0, 1 +AVX_INSTR pandn, mmx, 0, 0, 0 +AVX_INSTR pavgb, mmx2, 0, 0, 1 +AVX_INSTR pavgw, mmx2, 0, 0, 1 +AVX_INSTR pblendvb, sse4 ; can't be emulated +AVX_INSTR pblendw, sse4, 0, 1, 0 +AVX_INSTR pclmulqdq, fnord, 0, 1, 0 +AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 +AVX_INSTR pcmpestri, sse42 +AVX_INSTR pcmpestrm, sse42 +AVX_INSTR pcmpistri, sse42 +AVX_INSTR pcmpistrm, sse42 +AVX_INSTR pcmpeqb, mmx, 0, 0, 1 +AVX_INSTR pcmpeqw, mmx, 0, 0, 1 +AVX_INSTR pcmpeqd, mmx, 0, 0, 1 +AVX_INSTR pcmpeqq, sse4, 0, 0, 1 +AVX_INSTR pcmpgtb, mmx, 0, 0, 0 +AVX_INSTR pcmpgtw, mmx, 0, 0, 0 +AVX_INSTR pcmpgtd, mmx, 0, 0, 0 +AVX_INSTR pcmpgtq, sse42, 0, 0, 0 +AVX_INSTR pextrb, sse4 +AVX_INSTR pextrd, sse4 +AVX_INSTR pextrq, sse4 +AVX_INSTR pextrw, mmx2 +AVX_INSTR phaddw, ssse3, 0, 0, 0 +AVX_INSTR phaddd, ssse3, 0, 0, 0 +AVX_INSTR phaddsw, ssse3, 0, 0, 0 +AVX_INSTR phminposuw, sse4 +AVX_INSTR phsubw, ssse3, 0, 0, 0 +AVX_INSTR phsubd, ssse3, 0, 0, 0 +AVX_INSTR phsubsw, ssse3, 0, 0, 0 +AVX_INSTR pinsrb, sse4, 0, 1, 0 +AVX_INSTR pinsrd, sse4, 0, 1, 0 +AVX_INSTR pinsrq, sse4, 0, 1, 0 +AVX_INSTR pinsrw, mmx2, 0, 1, 0 +AVX_INSTR pmaddwd, mmx, 0, 0, 1 +AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 +AVX_INSTR pmaxsb, sse4, 0, 0, 1 +AVX_INSTR pmaxsw, mmx2, 0, 0, 1 +AVX_INSTR pmaxsd, sse4, 0, 0, 1 +AVX_INSTR pmaxub, mmx2, 0, 0, 1 +AVX_INSTR pmaxuw, sse4, 0, 0, 1 +AVX_INSTR pmaxud, sse4, 0, 0, 1 +AVX_INSTR pminsb, sse4, 0, 0, 1 +AVX_INSTR pminsw, mmx2, 0, 0, 1 +AVX_INSTR pminsd, sse4, 0, 0, 1 +AVX_INSTR pminub, mmx2, 0, 0, 1 +AVX_INSTR pminuw, sse4, 0, 0, 1 +AVX_INSTR pminud, sse4, 0, 0, 1 +AVX_INSTR pmovmskb, mmx2 +AVX_INSTR pmovsxbw, sse4 +AVX_INSTR pmovsxbd, sse4 +AVX_INSTR pmovsxbq, sse4 +AVX_INSTR pmovsxwd, sse4 +AVX_INSTR pmovsxwq, sse4 +AVX_INSTR pmovsxdq, sse4 +AVX_INSTR pmovzxbw, sse4 +AVX_INSTR pmovzxbd, sse4 +AVX_INSTR 
pmovzxbq, sse4 +AVX_INSTR pmovzxwd, sse4 +AVX_INSTR pmovzxwq, sse4 +AVX_INSTR pmovzxdq, sse4 +AVX_INSTR pmuldq, sse4, 0, 0, 1 +AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 +AVX_INSTR pmulhuw, mmx2, 0, 0, 1 +AVX_INSTR pmulhw, mmx, 0, 0, 1 +AVX_INSTR pmullw, mmx, 0, 0, 1 +AVX_INSTR pmulld, sse4, 0, 0, 1 +AVX_INSTR pmuludq, sse2, 0, 0, 1 +AVX_INSTR por, mmx, 0, 0, 1 +AVX_INSTR psadbw, mmx2, 0, 0, 1 +AVX_INSTR pshufb, ssse3, 0, 0, 0 +AVX_INSTR pshufd, sse2 +AVX_INSTR pshufhw, sse2 +AVX_INSTR pshuflw, sse2 +AVX_INSTR psignb, ssse3, 0, 0, 0 +AVX_INSTR psignw, ssse3, 0, 0, 0 +AVX_INSTR psignd, ssse3, 0, 0, 0 +AVX_INSTR psllw, mmx, 0, 0, 0 +AVX_INSTR pslld, mmx, 0, 0, 0 +AVX_INSTR psllq, mmx, 0, 0, 0 +AVX_INSTR pslldq, sse2, 0, 0, 0 +AVX_INSTR psraw, mmx, 0, 0, 0 +AVX_INSTR psrad, mmx, 0, 0, 0 +AVX_INSTR psrlw, mmx, 0, 0, 0 +AVX_INSTR psrld, mmx, 0, 0, 0 +AVX_INSTR psrlq, mmx, 0, 0, 0 +AVX_INSTR psrldq, sse2, 0, 0, 0 +AVX_INSTR psubb, mmx, 0, 0, 0 +AVX_INSTR psubw, mmx, 0, 0, 0 +AVX_INSTR psubd, mmx, 0, 0, 0 +AVX_INSTR psubq, sse2, 0, 0, 0 +AVX_INSTR psubsb, mmx, 0, 0, 0 +AVX_INSTR psubsw, mmx, 0, 0, 0 +AVX_INSTR psubusb, mmx, 0, 0, 0 +AVX_INSTR psubusw, mmx, 0, 0, 0 +AVX_INSTR ptest, sse4 +AVX_INSTR punpckhbw, mmx, 0, 0, 0 +AVX_INSTR punpckhwd, mmx, 0, 0, 0 +AVX_INSTR punpckhdq, mmx, 0, 0, 0 +AVX_INSTR punpckhqdq, sse2, 0, 0, 0 +AVX_INSTR punpcklbw, mmx, 0, 0, 0 +AVX_INSTR punpcklwd, mmx, 0, 0, 0 +AVX_INSTR punpckldq, mmx, 0, 0, 0 +AVX_INSTR punpcklqdq, sse2, 0, 0, 0 +AVX_INSTR pxor, mmx, 0, 0, 1 +AVX_INSTR rcpps, sse +AVX_INSTR rcpss, sse, 1, 0, 0 +AVX_INSTR roundpd, sse4 +AVX_INSTR roundps, sse4 +AVX_INSTR roundsd, sse4, 1, 1, 0 +AVX_INSTR roundss, sse4, 1, 1, 0 +AVX_INSTR rsqrtps, sse +AVX_INSTR rsqrtss, sse, 1, 0, 0 +AVX_INSTR shufpd, sse2, 1, 1, 0 +AVX_INSTR shufps, sse, 1, 1, 0 +AVX_INSTR sqrtpd, sse2 +AVX_INSTR sqrtps, sse +AVX_INSTR sqrtsd, sse2, 1, 0, 0 +AVX_INSTR sqrtss, sse, 1, 0, 0 +AVX_INSTR stmxcsr, sse +AVX_INSTR subpd, sse2, 1, 0, 0 +AVX_INSTR subps, sse, 1, 0, 0 +AVX_INSTR subsd, sse2, 1, 0, 0 +AVX_INSTR subss, sse, 1, 0, 0 +AVX_INSTR ucomisd, sse2 +AVX_INSTR ucomiss, sse +AVX_INSTR unpckhpd, sse2, 1, 0, 0 +AVX_INSTR unpckhps, sse, 1, 0, 0 +AVX_INSTR unpcklpd, sse2, 1, 0, 0 +AVX_INSTR unpcklps, sse, 1, 0, 0 +AVX_INSTR xorpd, sse2, 1, 0, 1 +AVX_INSTR xorps, sse, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 3dnow, 1, 0, 1 +AVX_INSTR pfsub, 3dnow, 1, 0, 0 +AVX_INSTR pfmul, 3dnow, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif + %assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %elifnidn %1, %4 + %6 %1, %2, %3 + %7 %1, %4 + %else + %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmadcswd, pmaddwd, paddd + +; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. +; This lets us use tzcnt without bumping the yasm version requirement yet. +%define tzcnt rep bsf + +; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. 
+; FMA3 is only possible if dst is the same as one of the src registers. +; Either src2 or src3 can be a memory operand. +%macro FMA4_INSTR 2-* + %push fma4_instr + %xdefine %$prefix %1 + %rep %0 - 1 + %macro %$prefix%2 4-6 %$prefix, %2 + %if notcpuflag(fma3) && notcpuflag(fma4) + %error use of ``%5%6'' fma instruction in cpuname function: current_function + %elif cpuflag(fma4) + v%5%6 %1, %2, %3, %4 + %elifidn %1, %2 + ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. + %ifnum sizeof%3 + v%{5}213%6 %2, %3, %4 + %else + v%{5}132%6 %2, %4, %3 + %endif + %elifidn %1, %3 + v%{5}213%6 %3, %2, %4 + %elifidn %1, %4 + v%{5}231%6 %4, %2, %3 + %else + %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported + %endif + %endmacro + %rotate 1 + %endrep + %pop +%endmacro + +FMA4_INSTR fmadd, pd, ps, sd, ss +FMA4_INSTR fmaddsub, pd, ps +FMA4_INSTR fmsub, pd, ps, sd, ss +FMA4_INSTR fmsubadd, pd, ps +FMA4_INSTR fnmadd, pd, ps, sd, ss +FMA4_INSTR fnmsub, pd, ps, sd, ss + +; Macros for converting VEX instructions to equivalent EVEX ones. +%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex + %macro %1 2-7 fnord, fnord, %1, %2, %3 + %ifidn %3, fnord + %define %%args %1, %2 + %elifidn %4, fnord + %define %%args %1, %2, %3 + %else + %define %%args %1, %2, %3, %4 + %endif + %assign %%evex_required cpuflag(avx512) & %7 + %ifnum regnumof%1 + %if regnumof%1 >= 16 || sizeof%1 > 32 + %assign %%evex_required 1 + %endif + %endif + %ifnum regnumof%2 + %if regnumof%2 >= 16 || sizeof%2 > 32 + %assign %%evex_required 1 + %endif + %endif + %if %%evex_required + %6 %%args + %else + %5 %%args ; Prefer VEX over EVEX due to shorter instruction length + %endif + %endmacro +%endmacro + +EVEX_INSTR vbroadcastf128, vbroadcastf32x4 +EVEX_INSTR vbroadcasti128, vbroadcasti32x4 +EVEX_INSTR vextractf128, vextractf32x4 +EVEX_INSTR vextracti128, vextracti32x4 +EVEX_INSTR vinsertf128, vinsertf32x4 +EVEX_INSTR vinserti128, vinserti32x4 +EVEX_INSTR vmovdqa, vmovdqa32 +EVEX_INSTR vmovdqu, vmovdqu32 +EVEX_INSTR vpand, vpandd +EVEX_INSTR vpandn, vpandnd +EVEX_INSTR vpor, vpord +EVEX_INSTR vpxor, vpxord +EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision +EVEX_INSTR vrcpss, vrcp14ss, 1 +EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 +EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) +%ifdef __YASM_VER__ + %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 + %macro vpbroadcastq 2 + %if sizeof%1 == 16 + movddup %1, %2 + %else + vbroadcastsd %1, %2 + %endif + %endmacro + %endif +%endif diff --git a/media/ffvpx/libavutil/x86/x86util.asm b/media/ffvpx/libavutil/x86/x86util.asm new file mode 100644 index 0000000000..d7cd996842 --- /dev/null +++ b/media/ffvpx/libavutil/x86/x86util.asm @@ -0,0 +1,1028 @@ +;***************************************************************************** +;* x86util.asm +;***************************************************************************** +;* Copyright (C) 2008-2010 x264 project +;* +;* Authors: Loren Merritt +;* Holger Lubitz +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. 
+;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%define private_prefix ff +%define public_prefix avpriv +%define cpuflags_mmxext cpuflags_mmx2 + +%include "libavutil/x86/x86inc.asm" + +; expands to [base],...,[base+7*stride] +%define PASS8ROWS(base, base3, stride, stride3) \ + [base], [base + stride], [base + 2*stride], [base3], \ + [base3 + stride], [base3 + 2*stride], [base3 + stride3], [base3 + stride*4] + +; Interleave low src0 with low src1 and store in src0, +; interleave high src0 with high src1 and store in src1. +; %1 - types +; %2 - index of the register with src0 +; %3 - index of the register with src1 +; %4 - index of the register for intermediate results +; example for %1 - wd: input: src0: x0 x1 x2 x3 z0 z1 z2 z3 +; src1: y0 y1 y2 y3 q0 q1 q2 q3 +; output: src0: x0 y0 x1 y1 x2 y2 x3 y3 +; src1: z0 q0 z1 q1 z2 q2 z3 q3 +%macro SBUTTERFLY 4 +%ifidn %1, dqqq + vperm2i128 m%4, m%2, m%3, q0301 + vinserti128 m%2, m%2, xm%3, 1 +%elif avx_enabled == 0 + mova m%4, m%2 + punpckl%1 m%2, m%3 + punpckh%1 m%4, m%3 +%else + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 +%endif + SWAP %3, %4 +%endmacro + +%macro SBUTTERFLY2 4 + punpckl%1 m%4, m%2, m%3 + punpckh%1 m%2, m%2, m%3 + SWAP %2, %4, %3 +%endmacro + +%macro SBUTTERFLYPS 3 + unpcklps m%3, m%1, m%2 + unpckhps m%1, m%1, m%2 + SWAP %1, %3, %2 +%endmacro + +%macro SBUTTERFLYPD 3 + movlhps m%3, m%1, m%2 + movhlps m%2, m%2, m%1 + SWAP %1, %3 +%endmacro + +%macro TRANSPOSE4x4B 5 + SBUTTERFLY bw, %1, %2, %5 + SBUTTERFLY bw, %3, %4, %5 + SBUTTERFLY wd, %1, %3, %5 + SBUTTERFLY wd, %2, %4, %5 + SWAP %2, %3 +%endmacro + +%macro TRANSPOSE4x4W 5 + SBUTTERFLY wd, %1, %2, %5 + SBUTTERFLY wd, %3, %4, %5 + SBUTTERFLY dq, %1, %3, %5 + SBUTTERFLY dq, %2, %4, %5 + SWAP %2, %3 +%endmacro + +%macro TRANSPOSE2x4x4B 5 + SBUTTERFLY bw, %1, %2, %5 + SBUTTERFLY bw, %3, %4, %5 + SBUTTERFLY wd, %1, %3, %5 + SBUTTERFLY wd, %2, %4, %5 + SBUTTERFLY dq, %1, %2, %5 + SBUTTERFLY dq, %3, %4, %5 +%endmacro + +%macro TRANSPOSE2x4x4W 5 + SBUTTERFLY wd, %1, %2, %5 + SBUTTERFLY wd, %3, %4, %5 + SBUTTERFLY dq, %1, %3, %5 + SBUTTERFLY dq, %2, %4, %5 + SBUTTERFLY qdq, %1, %2, %5 + SBUTTERFLY qdq, %3, %4, %5 +%endmacro + +%macro TRANSPOSE4x4D 5 + SBUTTERFLY dq, %1, %2, %5 + SBUTTERFLY dq, %3, %4, %5 + SBUTTERFLY qdq, %1, %3, %5 + SBUTTERFLY qdq, %2, %4, %5 + SWAP %2, %3 +%endmacro + +; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops +%macro TRANSPOSE4x4PS 5 + SBUTTERFLYPS %1, %2, %5 + SBUTTERFLYPS %3, %4, %5 + SBUTTERFLYPD %1, %3, %5 + SBUTTERFLYPD %2, %4, %5 + SWAP %2, %3 +%endmacro + +%macro TRANSPOSE8x4D 9-11 +%if ARCH_X86_64 + SBUTTERFLY dq, %1, %2, %9 + SBUTTERFLY dq, %3, %4, %9 + SBUTTERFLY dq, %5, %6, %9 + SBUTTERFLY dq, %7, %8, %9 + SBUTTERFLY qdq, %1, %3, %9 + SBUTTERFLY qdq, %2, %4, %9 + SBUTTERFLY qdq, %5, %7, %9 + SBUTTERFLY qdq, %6, %8, %9 + SWAP %2, %5 + SWAP %4, %7 +%else +; in: m0..m7 +; out: m0..m7, unless %11 in which case m2 is in %9 +; spills into %9 and %10 + movdqa %9, m%7 + SBUTTERFLY dq, %1, %2, %7 + movdqa %10, m%2 + movdqa m%7, %9 + 
SBUTTERFLY dq, %3, %4, %2 + SBUTTERFLY dq, %5, %6, %2 + SBUTTERFLY dq, %7, %8, %2 + SBUTTERFLY qdq, %1, %3, %2 + movdqa %9, m%3 + movdqa m%2, %10 + SBUTTERFLY qdq, %2, %4, %3 + SBUTTERFLY qdq, %5, %7, %3 + SBUTTERFLY qdq, %6, %8, %3 + SWAP %2, %5 + SWAP %4, %7 +%if %0<11 + movdqa m%3, %9 +%endif +%endif +%endmacro + +%macro TRANSPOSE8x8W 9-11 +%if ARCH_X86_64 + SBUTTERFLY wd, %1, %2, %9 + SBUTTERFLY wd, %3, %4, %9 + SBUTTERFLY wd, %5, %6, %9 + SBUTTERFLY wd, %7, %8, %9 + SBUTTERFLY dq, %1, %3, %9 + SBUTTERFLY dq, %2, %4, %9 + SBUTTERFLY dq, %5, %7, %9 + SBUTTERFLY dq, %6, %8, %9 + SBUTTERFLY qdq, %1, %5, %9 + SBUTTERFLY qdq, %2, %6, %9 + SBUTTERFLY qdq, %3, %7, %9 + SBUTTERFLY qdq, %4, %8, %9 + SWAP %2, %5 + SWAP %4, %7 +%else +; in: m0..m7, unless %11 in which case m6 is in %9 +; out: m0..m7, unless %11 in which case m4 is in %10 +; spills into %9 and %10 +%if %0<11 + movdqa %9, m%7 +%endif + SBUTTERFLY wd, %1, %2, %7 + movdqa %10, m%2 + movdqa m%7, %9 + SBUTTERFLY wd, %3, %4, %2 + SBUTTERFLY wd, %5, %6, %2 + SBUTTERFLY wd, %7, %8, %2 + SBUTTERFLY dq, %1, %3, %2 + movdqa %9, m%3 + movdqa m%2, %10 + SBUTTERFLY dq, %2, %4, %3 + SBUTTERFLY dq, %5, %7, %3 + SBUTTERFLY dq, %6, %8, %3 + SBUTTERFLY qdq, %1, %5, %3 + SBUTTERFLY qdq, %2, %6, %3 + movdqa %10, m%2 + movdqa m%3, %9 + SBUTTERFLY qdq, %3, %7, %2 + SBUTTERFLY qdq, %4, %8, %2 + SWAP %2, %5 + SWAP %4, %7 +%if %0<11 + movdqa m%5, %10 +%endif +%endif +%endmacro + +%macro TRANSPOSE16x16W 18-19 +; in: m0..m15, unless %19 in which case m6 is in %17 +; out: m0..m15, unless %19 in which case m4 is in %18 +; spills into %17 and %18 +%if %0 < 19 + mova %17, m%7 +%endif + + SBUTTERFLY dqqq, %1, %9, %7 + SBUTTERFLY dqqq, %2, %10, %7 + SBUTTERFLY dqqq, %3, %11, %7 + SBUTTERFLY dqqq, %4, %12, %7 + SBUTTERFLY dqqq, %5, %13, %7 + SBUTTERFLY dqqq, %6, %14, %7 + mova %18, m%14 + mova m%7, %17 + SBUTTERFLY dqqq, %7, %15, %14 + SBUTTERFLY dqqq, %8, %16, %14 + + SBUTTERFLY wd, %1, %2, %14 + SBUTTERFLY wd, %3, %4, %14 + SBUTTERFLY wd, %5, %6, %14 + SBUTTERFLY wd, %7, %8, %14 + SBUTTERFLY wd, %9, %10, %14 + SBUTTERFLY wd, %11, %12, %14 + mova %17, m%12 + mova m%14, %18 + SBUTTERFLY wd, %13, %14, %12 + SBUTTERFLY wd, %15, %16, %12 + + SBUTTERFLY dq, %1, %3, %12 + SBUTTERFLY dq, %2, %4, %12 + SBUTTERFLY dq, %5, %7, %12 + SBUTTERFLY dq, %6, %8, %12 + SBUTTERFLY dq, %9, %11, %12 + mova %18, m%11 + mova m%12, %17 + SBUTTERFLY dq, %10, %12, %11 + SBUTTERFLY dq, %13, %15, %11 + SBUTTERFLY dq, %14, %16, %11 + + SBUTTERFLY qdq, %1, %5, %11 + SBUTTERFLY qdq, %2, %6, %11 + SBUTTERFLY qdq, %3, %7, %11 + SBUTTERFLY qdq, %4, %8, %11 + + SWAP %2, %5 + SWAP %4, %7 + + SBUTTERFLY qdq, %9, %13, %11 + SBUTTERFLY qdq, %10, %14, %11 + mova m%11, %18 + mova %18, m%5 + SBUTTERFLY qdq, %11, %15, %5 + SBUTTERFLY qdq, %12, %16, %5 + +%if %0 < 19 + mova m%5, %18 +%endif + + SWAP %10, %13 + SWAP %12, %15 +%endmacro + +%macro TRANSPOSE_8X8B 8 + %if mmsize == 8 + %error "This macro does not support mmsize == 8" + %endif + punpcklbw m%1, m%2 + punpcklbw m%3, m%4 + punpcklbw m%5, m%6 + punpcklbw m%7, m%8 + TRANSPOSE4x4W %1, %3, %5, %7, %2 + MOVHL m%2, m%1 + MOVHL m%4, m%3 + MOVHL m%6, m%5 + MOVHL m%8, m%7 +%endmacro + +; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place +%macro PABSW 2 +%if cpuflag(ssse3) + pabsw %1, %2 +%elif cpuflag(mmxext) + pxor %1, %1 + psubw %1, %2 + pmaxsw %1, %2 +%else + pxor %1, %1 + pcmpgtw %1, %2 + pxor %2, %1 + psubw %2, %1 + SWAP %1, %2 +%endif +%endmacro + +%macro PSIGNW 2 +%if cpuflag(ssse3) + psignw %1, %2 +%else + pxor %1, %2 + psubw %1, %2 
+%endif +%endmacro + +%macro ABS1 2 +%if cpuflag(ssse3) + pabsw %1, %1 +%elif cpuflag(mmxext) ; a, tmp + pxor %2, %2 + psubw %2, %1 + pmaxsw %1, %2 +%else ; a, tmp + pxor %2, %2 + pcmpgtw %2, %1 + pxor %1, %2 + psubw %1, %2 +%endif +%endmacro + +%macro ABS2 4 +%if cpuflag(ssse3) + pabsw %1, %1 + pabsw %2, %2 +%elif cpuflag(mmxext) ; a, b, tmp0, tmp1 + pxor %3, %3 + pxor %4, %4 + psubw %3, %1 + psubw %4, %2 + pmaxsw %1, %3 + pmaxsw %2, %4 +%else ; a, b, tmp0, tmp1 + pxor %3, %3 + pxor %4, %4 + pcmpgtw %3, %1 + pcmpgtw %4, %2 + pxor %1, %3 + pxor %2, %4 + psubw %1, %3 + psubw %2, %4 +%endif +%endmacro + +%macro ABSB 2 ; source mmreg, temp mmreg (unused for SSSE3) +%if cpuflag(ssse3) + pabsb %1, %1 +%else + pxor %2, %2 + psubb %2, %1 + pminub %1, %2 +%endif +%endmacro + +%macro ABSB2 4 ; src1, src2, tmp1, tmp2 (tmp1/2 unused for SSSE3) +%if cpuflag(ssse3) + pabsb %1, %1 + pabsb %2, %2 +%else + pxor %3, %3 + pxor %4, %4 + psubb %3, %1 + psubb %4, %2 + pminub %1, %3 + pminub %2, %4 +%endif +%endmacro + +%macro ABSD2 4 + pxor %3, %3 + pxor %4, %4 + pcmpgtd %3, %1 + pcmpgtd %4, %2 + pxor %1, %3 + pxor %2, %4 + psubd %1, %3 + psubd %2, %4 +%endmacro + +%macro ABS4 6 + ABS2 %1, %2, %5, %6 + ABS2 %3, %4, %5, %6 +%endmacro + +%macro SPLATB_LOAD 3 +%if cpuflag(ssse3) + movd %1, [%2-3] + pshufb %1, %3 +%else + movd %1, [%2-3] ;to avoid crossing a cacheline + punpcklbw %1, %1 + SPLATW %1, %1, 3 +%endif +%endmacro + +%macro SPLATB_REG 3 +%if cpuflag(ssse3) + movd %1, %2d + pshufb %1, %3 +%else + movd %1, %2d + punpcklbw %1, %1 + SPLATW %1, %1, 0 +%endif +%endmacro + +%macro HADDD 2 ; sum junk +%if sizeof%1 == 32 +%define %2 xmm%2 + vextracti128 %2, %1, 1 +%define %1 xmm%1 + paddd %1, %2 +%endif +%if mmsize >= 16 +%if cpuflag(xop) && sizeof%1 == 16 + vphadddq %1, %1 +%endif + movhlps %2, %1 + paddd %1, %2 +%endif +%if notcpuflag(xop) || sizeof%1 != 16 +%if cpuflag(mmxext) + PSHUFLW %2, %1, q0032 +%else ; mmx + mova %2, %1 + psrlq %2, 32 +%endif + paddd %1, %2 +%endif +%undef %1 +%undef %2 +%endmacro + +%macro HADDW 2 ; reg, tmp +%if cpuflag(xop) && sizeof%1 == 16 + vphaddwq %1, %1 + movhlps %2, %1 + paddd %1, %2 +%else + pmaddwd %1, [pw_1] + HADDD %1, %2 +%endif +%endmacro + +%macro HADDPS 3 ; dst, src, tmp +%if cpuflag(sse3) + haddps %1, %1, %2 +%else + movaps %3, %1 + shufps %1, %2, q2020 + shufps %3, %2, q3131 + addps %1, %3 +%endif +%endmacro + +%macro PALIGNR 4-5 +%if cpuflag(ssse3) +%if %0==5 + palignr %1, %2, %3, %4 +%else + palignr %1, %2, %3 +%endif +%else ; [dst,] src1, src2, imm, tmp + %define %%dst %1 +%if %0==5 +%ifnidn %1, %2 + mova %%dst, %2 +%endif + %rotate 1 +%endif +%ifnidn %4, %2 + mova %4, %2 +%endif +%if mmsize==8 + psllq %%dst, (8-%3)*8 + psrlq %4, %3*8 +%else + pslldq %%dst, 16-%3 + psrldq %4, %3 +%endif + por %%dst, %4 +%endif +%endmacro + +%macro PAVGB 2-4 +%if cpuflag(mmxext) + pavgb %1, %2 +%elif cpuflag(3dnow) + pavgusb %1, %2 +%elif cpuflag(mmx) + movu %3, %2 + por %3, %1 + pxor %1, %2 + pand %1, %4 + psrlq %1, 1 + psubb %3, %1 + SWAP %1, %3 +%endif +%endmacro + +%macro PSHUFLW 1+ + %if mmsize == 8 + pshufw %1 + %else + pshuflw %1 + %endif +%endmacro + +%macro PSWAPD 2 +%if cpuflag(mmxext) + pshufw %1, %2, q1032 +%elif cpuflag(3dnowext) + pswapd %1, %2 +%elif cpuflag(3dnow) + movq %1, %2 + psrlq %1, 32 + punpckldq %1, %2 +%endif +%endmacro + +%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from +%ifnum %5 + pand m%3, m%5, m%4 ; src .. y6 .. y4 + pand m%1, m%5, m%2 ; dst .. y6 .. y4 +%else + mova m%1, %5 + pand m%3, m%1, m%4 ; src .. y6 .. 
y4 + pand m%1, m%1, m%2 ; dst .. y6 .. y4 +%endif + psrlw m%2, 8 ; dst .. y7 .. y5 + psrlw m%4, 8 ; src .. y7 .. y5 +%endmacro + +%macro SUMSUB_BA 3-4 +%if %0==3 + padd%1 m%2, m%3 + padd%1 m%3, m%3 + psub%1 m%3, m%2 +%else +%if avx_enabled == 0 + mova m%4, m%2 + padd%1 m%2, m%3 + psub%1 m%3, m%4 +%else + padd%1 m%4, m%2, m%3 + psub%1 m%3, m%2 + SWAP %2, %4 +%endif +%endif +%endmacro + +%macro SUMSUB_BADC 5-6 +%if %0==6 + SUMSUB_BA %1, %2, %3, %6 + SUMSUB_BA %1, %4, %5, %6 +%else + padd%1 m%2, m%3 + padd%1 m%4, m%5 + padd%1 m%3, m%3 + padd%1 m%5, m%5 + psub%1 m%3, m%2 + psub%1 m%5, m%4 +%endif +%endmacro + +%macro SUMSUB2_AB 4 +%ifnum %3 + psub%1 m%4, m%2, m%3 + psub%1 m%4, m%3 + padd%1 m%2, m%2 + padd%1 m%2, m%3 +%else + mova m%4, m%2 + padd%1 m%2, m%2 + padd%1 m%2, %3 + psub%1 m%4, %3 + psub%1 m%4, %3 +%endif +%endmacro + +%macro SUMSUB2_BA 4 +%if avx_enabled == 0 + mova m%4, m%2 + padd%1 m%2, m%3 + padd%1 m%2, m%3 + psub%1 m%3, m%4 + psub%1 m%3, m%4 +%else + padd%1 m%4, m%2, m%3 + padd%1 m%4, m%3 + psub%1 m%3, m%2 + psub%1 m%3, m%2 + SWAP %2, %4 +%endif +%endmacro + +%macro SUMSUBD2_AB 5 +%ifnum %4 + psra%1 m%5, m%2, 1 ; %3: %3>>1 + psra%1 m%4, m%3, 1 ; %2: %2>>1 + padd%1 m%4, m%2 ; %3: %3>>1+%2 + psub%1 m%5, m%3 ; %2: %2>>1-%3 + SWAP %2, %5 + SWAP %3, %4 +%else + mova %5, m%2 + mova %4, m%3 + psra%1 m%3, 1 ; %3: %3>>1 + psra%1 m%2, 1 ; %2: %2>>1 + padd%1 m%3, %5 ; %3: %3>>1+%2 + psub%1 m%2, %4 ; %2: %2>>1-%3 +%endif +%endmacro + +%macro DCT4_1D 5 +%ifnum %5 + SUMSUB_BADC w, %4, %1, %3, %2, %5 + SUMSUB_BA w, %3, %4, %5 + SUMSUB2_AB w, %1, %2, %5 + SWAP %1, %3, %4, %5, %2 +%else + SUMSUB_BADC w, %4, %1, %3, %2 + SUMSUB_BA w, %3, %4 + mova [%5], m%2 + SUMSUB2_AB w, %1, [%5], %2 + SWAP %1, %3, %4, %2 +%endif +%endmacro + +%macro IDCT4_1D 6-7 +%ifnum %6 + SUMSUBD2_AB %1, %3, %5, %7, %6 + ; %3: %3>>1-%5 %5: %3+%5>>1 + SUMSUB_BA %1, %4, %2, %7 + ; %4: %2+%4 %2: %2-%4 + SUMSUB_BADC %1, %5, %4, %3, %2, %7 + ; %5: %2+%4 + (%3+%5>>1) + ; %4: %2+%4 - (%3+%5>>1) + ; %3: %2-%4 + (%3>>1-%5) + ; %2: %2-%4 - (%3>>1-%5) +%else +%ifidn %1, w + SUMSUBD2_AB %1, %3, %5, [%6], [%6+16] +%else + SUMSUBD2_AB %1, %3, %5, [%6], [%6+32] +%endif + SUMSUB_BA %1, %4, %2 + SUMSUB_BADC %1, %5, %4, %3, %2 +%endif + SWAP %2, %5, %4 + ; %2: %2+%4 + (%3+%5>>1) row0 + ; %3: %2-%4 + (%3>>1-%5) row1 + ; %4: %2-%4 - (%3>>1-%5) row2 + ; %5: %2+%4 - (%3+%5>>1) row3 +%endmacro + + +%macro LOAD_DIFF 5 +%ifidn %3, none + movh %1, %4 + movh %2, %5 + punpcklbw %1, %2 + punpcklbw %2, %2 + psubw %1, %2 +%else + movh %1, %4 + punpcklbw %1, %3 + movh %2, %5 + punpcklbw %2, %3 + psubw %1, %2 +%endif +%endmacro + +%macro STORE_DCT 6 + movq [%5+%6+ 0], m%1 + movq [%5+%6+ 8], m%2 + movq [%5+%6+16], m%3 + movq [%5+%6+24], m%4 + movhps [%5+%6+32], m%1 + movhps [%5+%6+40], m%2 + movhps [%5+%6+48], m%3 + movhps [%5+%6+56], m%4 +%endmacro + +%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment? 
+ LOAD_DIFF m%1, m%5, m%7, [%8], [%9] + LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3] + LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3] + LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro DIFFx2 6-7 + movh %3, %5 + punpcklbw %3, %4 + psraw %1, 6 + paddsw %1, %3 + movh %3, %6 + punpcklbw %3, %4 + psraw %2, 6 + paddsw %2, %3 + packuswb %2, %1 +%endmacro + +%macro STORE_DIFF 4 + movh %2, %4 + punpcklbw %2, %3 + psraw %1, 6 + paddsw %1, %2 + packuswb %1, %1 + movh %4, %1 +%endmacro + +%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride + movh %3, [%7] + movh %4, [%7+%8] + psraw %1, %6 + psraw %2, %6 + punpcklbw %3, %5 + punpcklbw %4, %5 + paddw %3, %1 + paddw %4, %2 + packuswb %3, %5 + packuswb %4, %5 + movh [%7], %3 + movh [%7+%8], %4 +%endmacro + +%macro PMINUB 3 ; dst, src, ignored +%if cpuflag(mmxext) + pminub %1, %2 +%else ; dst, src, tmp + mova %3, %1 + psubusb %3, %2 + psubb %1, %3 +%endif +%endmacro + +%macro SPLATW 2-3 0 +%if cpuflag(avx2) && %3 == 0 + vpbroadcastw %1, %2 +%elif mmsize == 16 + pshuflw %1, %2, (%3)*0x55 + punpcklqdq %1, %1 +%elif cpuflag(mmxext) + pshufw %1, %2, (%3)*0x55 +%else + %ifnidn %1, %2 + mova %1, %2 + %endif + %if %3 & 2 + punpckhwd %1, %1 + %else + punpcklwd %1, %1 + %endif + %if %3 & 1 + punpckhwd %1, %1 + %else + punpcklwd %1, %1 + %endif +%endif +%endmacro + +%macro SPLATD 1 +%if mmsize == 8 + punpckldq %1, %1 +%elif cpuflag(sse2) + pshufd %1, %1, 0 +%elif cpuflag(sse) + shufps %1, %1, 0 +%endif +%endmacro + +%macro CLIPUB 3 ;(dst, min, max) + pmaxub %1, %2 + pminub %1, %3 +%endmacro + +%macro CLIPW 3 ;(dst, min, max) + pmaxsw %1, %2 + pminsw %1, %3 +%endmacro + +%macro PMINSD 3 ; dst, src, tmp/unused +%if cpuflag(sse4) + pminsd %1, %2 +%elif cpuflag(sse2) + cvtdq2ps %1, %1 + minps %1, %2 + cvtps2dq %1, %1 +%else + mova %3, %2 + pcmpgtd %3, %1 + pxor %1, %2 + pand %1, %3 + pxor %1, %2 +%endif +%endmacro + +%macro PMAXSD 3 ; dst, src, tmp/unused +%if cpuflag(sse4) + pmaxsd %1, %2 +%else + mova %3, %1 + pcmpgtd %3, %2 + pand %1, %3 + pandn %3, %2 + por %1, %3 +%endif +%endmacro + +%macro CLIPD 3-4 +%if cpuflag(sse4); src/dst, min, max, unused + pminsd %1, %3 + pmaxsd %1, %2 +%elif cpuflag(sse2) ; src/dst, min (float), max (float), unused + cvtdq2ps %1, %1 + minps %1, %3 + maxps %1, %2 + cvtps2dq %1, %1 +%else ; src/dst, min, max, tmp + PMINSD %1, %3, %4 + PMAXSD %1, %2, %4 +%endif +%endmacro + +%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm +%if cpuflag(avx2) + vbroadcastss %1, %2 +%elif cpuflag(avx) + %ifnum sizeof%2 ; avx1 register + shufps xmm%1, xmm%2, xmm%2, q0000 + %if sizeof%1 >= 32 ; mmsize>=32 + vinsertf128 %1, %1, xmm%1, 1 + %endif + %else ; avx1 memory + vbroadcastss %1, %2 + %endif +%else + %ifnum sizeof%2 ; sse register + shufps %1, %2, %2, q0000 + %else ; sse memory + movss %1, %2 + shufps %1, %1, 0 + %endif +%endif +%endmacro + +%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64 +%if cpuflag(avx) && mmsize == 32 + vbroadcastsd %1, %2 +%elif cpuflag(sse3) + movddup %1, %2 +%else ; sse2 + movsd %1, %2 + movlhps %1, %1 +%endif +%endmacro + +%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm +%if cpuflag(avx2) + vpbroadcastd %1, %2 +%elif cpuflag(avx) && sizeof%1 >= 32 + %error vpbroadcastd not possible with ymm on avx1. 
try vbroadcastss +%else + %ifnum sizeof%2 ; sse2 register + pshufd %1, %2, q0000 + %else ; sse memory + movd %1, %2 + pshufd %1, %1, 0 + %endif +%endif +%endmacro + +%macro VBROADCASTI128 2 ; dst xmm/ymm, src : 128bits val +%if mmsize > 16 + vbroadcasti128 %1, %2 +%else + mova %1, %2 +%endif +%endmacro + +%macro SHUFFLE_MASK_W 8 + %rep 8 + %if %1>=0x80 + db %1, %1 + %else + db %1*2 + db %1*2+1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro PMOVSXWD 2; dst, src +%if cpuflag(sse4) + pmovsxwd %1, %2 +%else + %ifnidn %1, %2 + mova %1, %2 + %endif + punpcklwd %1, %1 + psrad %1, 16 +%endif +%endmacro + +; Wrapper for non-FMA version of fmaddps +%macro FMULADD_PS 5 + %if cpuflag(fma3) || cpuflag(fma4) + fmaddps %1, %2, %3, %4 + %elifidn %1, %4 + mulps %5, %2, %3 + addps %1, %4, %5 + %else + mulps %1, %2, %3 + addps %1, %4 + %endif +%endmacro + +%macro LSHIFT 2 +%if mmsize > 8 + pslldq %1, %2 +%else + psllq %1, 8*(%2) +%endif +%endmacro + +%macro RSHIFT 2 +%if mmsize > 8 + psrldq %1, %2 +%else + psrlq %1, 8*(%2) +%endif +%endmacro + +%macro MOVHL 2 ; dst, src +%ifidn %1, %2 + punpckhqdq %1, %2 +%elif cpuflag(avx) + punpckhqdq %1, %2, %2 +%elif cpuflag(sse4) + pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones +%else + movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst +%endif +%endmacro + +; Horizontal Sum of Packed Single precision floats +; The resulting sum is in all elements. +%macro HSUMPS 2 ; dst/src, tmp +%if cpuflag(avx) + %if sizeof%1>=32 ; avx + vperm2f128 %2, %1, %1, (0)*16+(1) + addps %1, %2 + %endif + shufps %2, %1, %1, q1032 + addps %1, %2 + shufps %2, %1, %1, q0321 + addps %1, %2 +%else ; this form is a bit faster than the short avx-like emulation. + movaps %2, %1 + shufps %1, %1, q1032 + addps %1, %2 + movaps %2, %1 + shufps %1, %1, q0321 + addps %1, %2 + ; all %1 members should be equal for as long as float a+b==b+a +%endif +%endmacro + +; Emulate blendvps if not available +; +; src_b is destroyed when using emulation with logical operands +; SSE41 blendv instruction is hard coded to use xmm0 as mask +%macro BLENDVPS 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + blendvps %1, %1, %2, %3 +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 blendvps uses xmm0 as default 3d operand, you used %3 + %endif + blendvps %1, %2, %3 +%else + xorps %2, %1 + andps %2, %3 + xorps %1, %2 +%endif +%endmacro + +; Emulate pblendvb if not available +; +; src_b is destroyed when using emulation with logical operands +; SSE41 blendv instruction is hard coded to use xmm0 as mask +%macro PBLENDVB 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32 + %error pblendb not possible with ymm on avx1, try blendvps. + %endif + pblendvb %1, %1, %2, %3 +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 pblendvd uses xmm0 as default 3d operand, you used %3 + %endif + pblendvb %1, %2, %3 +%else + pxor %2, %1 + pand %2, %3 + pxor %1, %2 +%endif +%endmacro -- cgit v1.2.3