diff options
Diffstat (limited to 'media/ffvpx/libavutil/aarch64')
-rw-r--r-- | media/ffvpx/libavutil/aarch64/asm.S | 260 | ||||
-rw-r--r-- | media/ffvpx/libavutil/aarch64/bswap.h | 56 | ||||
-rw-r--r-- | media/ffvpx/libavutil/aarch64/cpu.c | 127 | ||||
-rw-r--r-- | media/ffvpx/libavutil/aarch64/cpu.h | 31 | ||||
-rw-r--r-- | media/ffvpx/libavutil/aarch64/float_dsp_init.c | 69 | ||||
-rw-r--r-- | media/ffvpx/libavutil/aarch64/float_dsp_neon.S | 202 | ||||
-rw-r--r-- | media/ffvpx/libavutil/aarch64/moz.build | 21 | ||||
-rw-r--r-- | media/ffvpx/libavutil/aarch64/timer.h | 50 | ||||
-rw-r--r-- | media/ffvpx/libavutil/aarch64/tx_float_init.c | 64 | ||||
-rw-r--r-- | media/ffvpx/libavutil/aarch64/tx_float_neon.S | 1294 |
10 files changed, 2174 insertions, 0 deletions
diff --git a/media/ffvpx/libavutil/aarch64/asm.S b/media/ffvpx/libavutil/aarch64/asm.S new file mode 100644 index 0000000000..1840f9fb01 --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/asm.S @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#ifdef __ELF__ +# define ELF +#else +# define ELF # +#endif + +#if HAVE_AS_FUNC +# define FUNC +#else +# define FUNC # +#endif + +#ifndef __has_feature +# define __has_feature(x) 0 +#endif + +#if HAVE_AS_ARCH_DIRECTIVE + .arch AS_ARCH_LEVEL +#endif + +#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE +#define ENABLE_DOTPROD .arch_extension dotprod +#define DISABLE_DOTPROD .arch_extension nodotprod +#else +#define ENABLE_DOTPROD +#define DISABLE_DOTPROD +#endif + +#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE +#define ENABLE_I8MM .arch_extension i8mm +#define DISABLE_I8MM .arch_extension noi8mm +#else +#define ENABLE_I8MM +#define DISABLE_I8MM +#endif + +DISABLE_DOTPROD +DISABLE_I8MM + + +/* Support macros for + * - Armv8.3-A Pointer Authentication and + * - Armv8.5-A Branch Target Identification + * features which require emitting a .note.gnu.property section with the + * appropriate architecture-dependent feature bits set. + * + * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to + * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be + * used immediately before saving the LR register (x30) to the stack. + * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring + * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone + * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also + * have the same value at the two points. For example: + * + * .global f + * f: + * AARCH64_SIGN_LINK_REGISTER + * stp x29, x30, [sp, #-96]! + * mov x29, sp + * ... + * ldp x29, x30, [sp], #96 + * AARCH64_VALIDATE_LINK_REGISTER + * ret + * + * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or + * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an + * indirect call target. In particular, all symbols exported from a file must + * begin with one of these macros. For example, a leaf function that does not + * save LR can instead use |AARCH64_VALID_CALL_TARGET|: + * + * .globl return_zero + * return_zero: + * AARCH64_VALID_CALL_TARGET + * mov x0, #0 + * ret + * + * A non-leaf function which does not immediately save LR may need both macros + * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function + * may jump to an alternate implementation before setting up the stack: + * + * .globl with_early_jump + * with_early_jump: + * AARCH64_VALID_CALL_TARGET + * cmp x0, #128 + * b.lt .Lwith_early_jump_128 + * AARCH64_SIGN_LINK_REGISTER + * stp x29, x30, [sp, #-96]! + * mov x29, sp + * ... + * ldp x29, x30, [sp], #96 + * AARCH64_VALIDATE_LINK_REGISTER + * ret + * + * .Lwith_early_jump_128: + * ... + * ret + * + * These annotations are only required with indirect calls. Private symbols that + * are only the target of direct calls do not require annotations. Also note + * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not + * indirect jumps (BR). Indirect jumps in assembly are supported through + * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and + * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|. + * + * Although not necessary, it is safe to use these macros in 32-bit ARM + * assembly. This may be used to simplify dual 32-bit and 64-bit files. + * + * References: + * - "ELF for the ArmĀ® 64-bit Architecture" + * https: *github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst + * - "Providing protection for complex software" + * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software + */ +#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1) +# define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has BTI +# define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' +# define AARCH64_VALID_JUMP_TARGET hint #38 // BTI 'j' +#else +# define GNU_PROPERTY_AARCH64_BTI 0 // No BTI +# define AARCH64_VALID_CALL_TARGET +# define AARCH64_VALID_JUMP_TARGET +#endif + +#if defined(__ARM_FEATURE_PAC_DEFAULT) +# if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A +# define AARCH64_SIGN_LINK_REGISTER paciasp +# define AARCH64_VALIDATE_LINK_REGISTER autiasp +# elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B +# define AARCH64_SIGN_LINK_REGISTER pacibsp +# define AARCH64_VALIDATE_LINK_REGISTER autibsp +# else +# error Pointer authentication defines no valid key! +# endif +# if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) +# error Authentication of leaf functions is enabled but not supported in FFmpeg! +# endif +# define GNU_PROPERTY_AARCH64_PAC (1 << 1) +#else +# define GNU_PROPERTY_AARCH64_PAC 0 +# define AARCH64_SIGN_LINK_REGISTER +# define AARCH64_VALIDATE_LINK_REGISTER +#endif + + +#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) + .pushsection .note.gnu.property, "a" + .balign 8 + .long 4 + .long 0x10 + .long 0x5 + .asciz "GNU" + .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ + .long 4 + .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC) + .long 0 + .popsection +#endif + +.macro function name, export=0, align=2 + .macro endfunc +ELF .size \name, . - \name +FUNC .endfunc + .purgem endfunc + .endm + .text + .align \align + .if \export + .global EXTERN_ASM\name +ELF .type EXTERN_ASM\name, %function +FUNC .func EXTERN_ASM\name +EXTERN_ASM\name: + AARCH64_VALID_CALL_TARGET + .else +ELF .type \name, %function +FUNC .func \name +\name: + .endif +.endm + +.macro const name, align=2, relocate=0 + .macro endconst +ELF .size \name, . - \name + .purgem endconst + .endm +#if HAVE_SECTION_DATA_REL_RO +.if \relocate + .section .data.rel.ro +.else + .section .rodata +.endif +#elif defined(_WIN32) + .section .rdata +#elif !defined(__MACH__) + .section .rodata +#else + .const_data +#endif + .align \align +\name: +.endm + +.macro movrel rd, val, offset=0 +#if CONFIG_PIC && defined(__APPLE__) + .if \offset < 0 + adrp \rd, \val@PAGE + add \rd, \rd, \val@PAGEOFF + sub \rd, \rd, -(\offset) + .else + adrp \rd, \val+(\offset)@PAGE + add \rd, \rd, \val+(\offset)@PAGEOFF + .endif +#elif CONFIG_PIC && defined(_WIN32) + .if \offset < 0 + adrp \rd, \val + add \rd, \rd, :lo12:\val + sub \rd, \rd, -(\offset) + .else + adrp \rd, \val+(\offset) + add \rd, \rd, :lo12:\val+(\offset) + .endif +#elif CONFIG_PIC +# if __has_feature(hwaddress_sanitizer) + adrp \rd, :pg_hi21_nc:\val+(\offset) +# else + adrp \rd, \val+(\offset) +# endif + add \rd, \rd, :lo12:\val+(\offset) +#else + ldr \rd, =\val+\offset +#endif +.endm + +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) + +#define x18 do_not_use_x18 +#define w18 do_not_use_w18 diff --git a/media/ffvpx/libavutil/aarch64/bswap.h b/media/ffvpx/libavutil/aarch64/bswap.h new file mode 100644 index 0000000000..7abca657ba --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/bswap.h @@ -0,0 +1,56 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_AARCH64_BSWAP_H +#define AVUTIL_AARCH64_BSWAP_H + +#include <stdint.h> +#include "config.h" +#include "libavutil/attributes.h" + +#if HAVE_INLINE_ASM + +#define av_bswap16 av_bswap16 +static av_always_inline av_const unsigned av_bswap16(unsigned x) +{ + unsigned y; + + __asm__("rev16 %w0, %w1" : "=r"(y) : "r"(x)); + return y; +} + +#define av_bswap32 av_bswap32 +static av_always_inline av_const uint32_t av_bswap32(uint32_t x) +{ + uint32_t y; + + __asm__("rev %w0, %w1" : "=r"(y) : "r"(x)); + return y; +} + +#define av_bswap64 av_bswap64 +static av_always_inline av_const uint64_t av_bswap64(uint64_t x) +{ + uint64_t y; + + __asm__("rev %0, %1" : "=r"(y) : "r"(x)); + return y; +} + +#endif /* HAVE_INLINE_ASM */ +#endif /* AVUTIL_AARCH64_BSWAP_H */ diff --git a/media/ffvpx/libavutil/aarch64/cpu.c b/media/ffvpx/libavutil/aarch64/cpu.c new file mode 100644 index 0000000000..f27fef3992 --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/cpu.c @@ -0,0 +1,127 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavutil/cpu_internal.h" +#include "config.h" + +#if (defined(__linux__) || defined(__ANDROID__)) && HAVE_GETAUXVAL +#include <stdint.h> +#include <sys/auxv.h> + +#define get_cpu_feature_reg(reg, val) \ + __asm__("mrs %0, " #reg : "=r" (val)) + +static int detect_flags(void) +{ + int flags = 0; + +#if defined(HWCAP_CPUID) && HAVE_INLINE_ASM + unsigned long hwcap = getauxval(AT_HWCAP); + // We can check for DOTPROD and I8MM using HWCAP_ASIMDDP and + // HWCAP2_I8MM too, avoiding to read the CPUID registers (which triggers + // a trap, handled by the kernel). However the HWCAP_* defines for these + // extensions are added much later than HWCAP_CPUID, so the userland + // headers might lack support for them even if the binary later is run + // on hardware that does support it (and where the kernel might support + // HWCAP_CPUID). + // See https://www.kernel.org/doc/html/latest/arm64/cpu-feature-registers.html + if (hwcap & HWCAP_CPUID) { + uint64_t tmp; + + get_cpu_feature_reg(ID_AA64ISAR0_EL1, tmp); + if (((tmp >> 44) & 0xf) == 0x1) + flags |= AV_CPU_FLAG_DOTPROD; + get_cpu_feature_reg(ID_AA64ISAR1_EL1, tmp); + if (((tmp >> 52) & 0xf) == 0x1) + flags |= AV_CPU_FLAG_I8MM; + } +#endif + + return flags; +} + +#elif defined(__APPLE__) && HAVE_SYSCTLBYNAME +#include <sys/sysctl.h> + +static int detect_flags(void) +{ + uint32_t value = 0; + size_t size; + int flags = 0; + + size = sizeof(value); + if (!sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &size, NULL, 0)) { + if (value) + flags |= AV_CPU_FLAG_DOTPROD; + } + size = sizeof(value); + if (!sysctlbyname("hw.optional.arm.FEAT_I8MM", &value, &size, NULL, 0)) { + if (value) + flags |= AV_CPU_FLAG_I8MM; + } + return flags; +} + +#elif defined(_WIN32) +#include <windows.h> + +static int detect_flags(void) +{ + int flags = 0; +#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE + if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) + flags |= AV_CPU_FLAG_DOTPROD; +#endif + return flags; +} +#else + +static int detect_flags(void) +{ + return 0; +} + +#endif + +int ff_get_cpu_flags_aarch64(void) +{ + int flags = AV_CPU_FLAG_ARMV8 * HAVE_ARMV8 | + AV_CPU_FLAG_NEON * HAVE_NEON; + +#ifdef __ARM_FEATURE_DOTPROD + flags |= AV_CPU_FLAG_DOTPROD; +#endif +#ifdef __ARM_FEATURE_MATMUL_INT8 + flags |= AV_CPU_FLAG_I8MM; +#endif + + flags |= detect_flags(); + + return flags; +} + +size_t ff_get_cpu_max_align_aarch64(void) +{ + int flags = av_get_cpu_flags(); + + if (flags & AV_CPU_FLAG_NEON) + return 16; + + return 8; +} diff --git a/media/ffvpx/libavutil/aarch64/cpu.h b/media/ffvpx/libavutil/aarch64/cpu.h new file mode 100644 index 0000000000..64d703be37 --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/cpu.h @@ -0,0 +1,31 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_AARCH64_CPU_H +#define AVUTIL_AARCH64_CPU_H + +#include "libavutil/cpu.h" +#include "libavutil/cpu_internal.h" + +#define have_armv8(flags) CPUEXT(flags, ARMV8) +#define have_neon(flags) CPUEXT(flags, NEON) +#define have_vfp(flags) CPUEXT(flags, VFP) +#define have_dotprod(flags) CPUEXT(flags, DOTPROD) +#define have_i8mm(flags) CPUEXT(flags, I8MM) + +#endif /* AVUTIL_AARCH64_CPU_H */ diff --git a/media/ffvpx/libavutil/aarch64/float_dsp_init.c b/media/ffvpx/libavutil/aarch64/float_dsp_init.c new file mode 100644 index 0000000000..4325071821 --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/float_dsp_init.c @@ -0,0 +1,69 @@ +/* + * ARM NEON optimised Float DSP functions + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/float_dsp.h" +#include "cpu.h" + +void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, + int len); + +void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul, + int len); + +void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, + int len); + +void ff_vector_dmul_scalar_neon(double *dst, const double *src, double mul, + int len); + +void ff_vector_fmul_window_neon(float *dst, const float *src0, + const float *src1, const float *win, int len); + +void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, + const float *src2, int len); + +void ff_vector_fmul_reverse_neon(float *dst, const float *src0, + const float *src1, int len); + +void ff_butterflies_float_neon(float *v1, float *v2, int len); + +float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); + +av_cold void ff_float_dsp_init_aarch64(AVFloatDSPContext *fdsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + fdsp->butterflies_float = ff_butterflies_float_neon; + fdsp->scalarproduct_float = ff_scalarproduct_float_neon; + fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_neon; + fdsp->vector_fmul = ff_vector_fmul_neon; + fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_neon; + fdsp->vector_fmul_add = ff_vector_fmul_add_neon; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon; + fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_neon; + fdsp->vector_fmul_window = ff_vector_fmul_window_neon; + } +} diff --git a/media/ffvpx/libavutil/aarch64/float_dsp_neon.S b/media/ffvpx/libavutil/aarch64/float_dsp_neon.S new file mode 100644 index 0000000000..35e2715b87 --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/float_dsp_neon.S @@ -0,0 +1,202 @@ +/* + * ARM NEON optimised Float DSP functions + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "asm.S" + +function ff_vector_fmul_neon, export=1 +1: subs w3, w3, #16 + ld1 {v0.4s, v1.4s}, [x1], #32 + ld1 {v2.4s, v3.4s}, [x1], #32 + ld1 {v4.4s, v5.4s}, [x2], #32 + ld1 {v6.4s, v7.4s}, [x2], #32 + fmul v16.4s, v0.4s, v4.4s + fmul v17.4s, v1.4s, v5.4s + fmul v18.4s, v2.4s, v6.4s + fmul v19.4s, v3.4s, v7.4s + st1 {v16.4s, v17.4s}, [x0], #32 + st1 {v18.4s, v19.4s}, [x0], #32 + b.ne 1b + ret +endfunc + +function ff_vector_fmac_scalar_neon, export=1 + mov x3, #-32 +1: subs w2, w2, #16 + ld1 {v16.4s, v17.4s}, [x0], #32 + ld1 {v18.4s, v19.4s}, [x0], x3 + ld1 {v4.4s, v5.4s}, [x1], #32 + ld1 {v6.4s, v7.4s}, [x1], #32 + fmla v16.4s, v4.4s, v0.s[0] + fmla v17.4s, v5.4s, v0.s[0] + fmla v18.4s, v6.4s, v0.s[0] + fmla v19.4s, v7.4s, v0.s[0] + st1 {v16.4s, v17.4s}, [x0], #32 + st1 {v18.4s, v19.4s}, [x0], #32 + b.ne 1b + ret +endfunc + +function ff_vector_fmul_scalar_neon, export=1 + mov w4, #15 + bics w3, w2, w4 + dup v16.4s, v0.s[0] + b.eq 3f + ld1 {v0.4s, v1.4s}, [x1], #32 +1: subs w3, w3, #16 + fmul v0.4s, v0.4s, v16.4s + ld1 {v2.4s, v3.4s}, [x1], #32 + fmul v1.4s, v1.4s, v16.4s + fmul v2.4s, v2.4s, v16.4s + st1 {v0.4s, v1.4s}, [x0], #32 + fmul v3.4s, v3.4s, v16.4s + b.eq 2f + ld1 {v0.4s, v1.4s}, [x1], #32 + st1 {v2.4s, v3.4s}, [x0], #32 + b 1b +2: ands w2, w2, #15 + st1 {v2.4s, v3.4s}, [x0], #32 + b.eq 4f +3: ld1 {v0.4s}, [x1], #16 + fmul v0.4s, v0.4s, v16.4s + st1 {v0.4s}, [x0], #16 + subs w2, w2, #4 + b.gt 3b +4: ret +endfunc + +function ff_vector_dmul_scalar_neon, export=1 + dup v16.2d, v0.d[0] + ld1 {v0.2d, v1.2d}, [x1], #32 +1: subs w2, w2, #8 + fmul v0.2d, v0.2d, v16.2d + ld1 {v2.2d, v3.2d}, [x1], #32 + fmul v1.2d, v1.2d, v16.2d + fmul v2.2d, v2.2d, v16.2d + st1 {v0.2d, v1.2d}, [x0], #32 + fmul v3.2d, v3.2d, v16.2d + ld1 {v0.2d, v1.2d}, [x1], #32 + st1 {v2.2d, v3.2d}, [x0], #32 + b.gt 1b + ret +endfunc + +function ff_vector_fmul_window_neon, export=1 + sxtw x4, w4 // len + sub x2, x2, #8 + sub x5, x4, #2 + add x2, x2, x5, lsl #2 // src1 + 4 * (len - 4) + add x6, x3, x5, lsl #3 // win + 8 * (len - 2) + add x5, x0, x5, lsl #3 // dst + 8 * (len - 2) + mov x7, #-16 + ld1 {v0.4s}, [x1], #16 // s0 + ld1 {v2.4s}, [x3], #16 // wi + ld1 {v1.4s}, [x2], x7 // s1 +1: ld1 {v3.4s}, [x6], x7 // wj + subs x4, x4, #4 + fmul v17.4s, v0.4s, v2.4s // s0 * wi + rev64 v4.4s, v1.4s + rev64 v5.4s, v3.4s + rev64 v17.4s, v17.4s + ext v4.16b, v4.16b, v4.16b, #8 // s1_r + ext v5.16b, v5.16b, v5.16b, #8 // wj_r + ext v17.16b, v17.16b, v17.16b, #8 // (s0 * wi)_rev + fmul v16.4s, v0.4s, v5.4s // s0 * wj_r + fmla v17.4s, v1.4s, v3.4s // (s0 * wi)_rev + s1 * wj + b.eq 2f + ld1 {v0.4s}, [x1], #16 + fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi + st1 {v17.4s}, [x5], x7 + ld1 {v2.4s}, [x3], #16 + ld1 {v1.4s}, [x2], x7 + st1 {v16.4s}, [x0], #16 + b 1b +2: + fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi + st1 {v17.4s}, [x5], x7 + st1 {v16.4s}, [x0], #16 + ret +endfunc + +function ff_vector_fmul_add_neon, export=1 + ld1 {v0.4s, v1.4s}, [x1], #32 + ld1 {v2.4s, v3.4s}, [x2], #32 + ld1 {v4.4s, v5.4s}, [x3], #32 +1: subs w4, w4, #8 + fmla v4.4s, v0.4s, v2.4s + fmla v5.4s, v1.4s, v3.4s + b.eq 2f + ld1 {v0.4s, v1.4s}, [x1], #32 + ld1 {v2.4s, v3.4s}, [x2], #32 + st1 {v4.4s, v5.4s}, [x0], #32 + ld1 {v4.4s, v5.4s}, [x3], #32 + b 1b +2: st1 {v4.4s, v5.4s}, [x0], #32 + ret +endfunc + +function ff_vector_fmul_reverse_neon, export=1 + sxtw x3, w3 + add x2, x2, x3, lsl #2 + sub x2, x2, #32 + mov x4, #-32 + ld1 {v2.4s, v3.4s}, [x2], x4 + ld1 {v0.4s, v1.4s}, [x1], #32 +1: subs x3, x3, #8 + rev64 v3.4s, v3.4s + rev64 v2.4s, v2.4s + ext v3.16b, v3.16b, v3.16b, #8 + ext v2.16b, v2.16b, v2.16b, #8 + fmul v16.4s, v0.4s, v3.4s + fmul v17.4s, v1.4s, v2.4s + b.eq 2f + ld1 {v2.4s, v3.4s}, [x2], x4 + ld1 {v0.4s, v1.4s}, [x1], #32 + st1 {v16.4s, v17.4s}, [x0], #32 + b 1b +2: st1 {v16.4s, v17.4s}, [x0], #32 + ret +endfunc + +function ff_butterflies_float_neon, export=1 +1: ld1 {v0.4s}, [x0] + ld1 {v1.4s}, [x1] + subs w2, w2, #4 + fsub v2.4s, v0.4s, v1.4s + fadd v3.4s, v0.4s, v1.4s + st1 {v2.4s}, [x1], #16 + st1 {v3.4s}, [x0], #16 + b.gt 1b + ret +endfunc + +function ff_scalarproduct_float_neon, export=1 + movi v2.4s, #0 +1: ld1 {v0.4s}, [x0], #16 + ld1 {v1.4s}, [x1], #16 + subs w2, w2, #4 + fmla v2.4s, v0.4s, v1.4s + b.gt 1b + faddp v0.4s, v2.4s, v2.4s + faddp s0, v0.2s + ret +endfunc diff --git a/media/ffvpx/libavutil/aarch64/moz.build b/media/ffvpx/libavutil/aarch64/moz.build new file mode 100644 index 0000000000..fa80cf8896 --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/moz.build @@ -0,0 +1,21 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +SOURCES += [ + 'cpu.c', + 'float_dsp_init.c', + 'float_dsp_neon.S', + 'tx_float_init.c', + 'tx_float_neon.S' +] + +if CONFIG['OS_ARCH'] == 'WINNT': + USE_INTEGRATED_CLANGCL_AS = True + DEFINES['EXTERN_ASM'] = '' + +FINAL_LIBRARY = 'mozavutil' + +include('/media/ffvpx/ffvpxcommon.mozbuild') diff --git a/media/ffvpx/libavutil/aarch64/timer.h b/media/ffvpx/libavutil/aarch64/timer.h new file mode 100644 index 0000000000..8b28fd354c --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/timer.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_AARCH64_TIMER_H +#define AVUTIL_AARCH64_TIMER_H + +#include <stdint.h> +#include "config.h" + +#if defined(__APPLE__) + +#include <mach/mach_time.h> + +#define AV_READ_TIME mach_absolute_time + +#elif HAVE_INLINE_ASM + +#define AV_READ_TIME read_time + +static inline uint64_t read_time(void) +{ + uint64_t cycle_counter; + __asm__ volatile( + "isb \t\n" + "mrs %0, pmccntr_el0 " + : "=r"(cycle_counter) :: "memory" ); + + return cycle_counter; +} + +#endif /* HAVE_INLINE_ASM */ + +#endif /* AVUTIL_AARCH64_TIMER_H */ diff --git a/media/ffvpx/libavutil/aarch64/tx_float_init.c b/media/ffvpx/libavutil/aarch64/tx_float_init.c new file mode 100644 index 0000000000..8300472c4c --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/tx_float_init.c @@ -0,0 +1,64 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define TX_FLOAT +#include "libavutil/tx_priv.h" +#include "libavutil/attributes.h" +#include "libavutil/aarch64/cpu.h" + +TX_DECL_FN(fft2, neon) +TX_DECL_FN(fft4_fwd, neon) +TX_DECL_FN(fft4_inv, neon) +TX_DECL_FN(fft8, neon) +TX_DECL_FN(fft8_ns, neon) +TX_DECL_FN(fft16, neon) +TX_DECL_FN(fft16_ns, neon) +TX_DECL_FN(fft32, neon) +TX_DECL_FN(fft32_ns, neon) +TX_DECL_FN(fft_sr, neon) +TX_DECL_FN(fft_sr_ns, neon) + +static av_cold int neon_init(AVTXContext *s, const FFTXCodelet *cd, + uint64_t flags, FFTXCodeletOptions *opts, + int len, int inv, const void *scale) +{ + ff_tx_init_tabs_float(len); + if (cd->max_len == 2) + return ff_tx_gen_ptwo_revtab(s, opts); + else + return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, 8, 0); +} + +const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = { + TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE, 0), + TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0), + TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0), + TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0), + TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft16, FFT, 16, 16, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0), + TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft32, FFT, 32, 32, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0), + TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + + TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 128, neon_init, neon, NEON, 0, 0), + TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + + NULL, +}; diff --git a/media/ffvpx/libavutil/aarch64/tx_float_neon.S b/media/ffvpx/libavutil/aarch64/tx_float_neon.S new file mode 100644 index 0000000000..78e4876d6c --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/tx_float_neon.S @@ -0,0 +1,1294 @@ +/* + * Copyright (c) Lynne + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +/* Open `doc/transforms.md` to see the code upon which the transforms here were + * based upon. + * + * File conventions: + * GPRs: x0-x3 - arguments, untouched + * x4 - Lookup table base pointer + * x5-x6 - macro ld1 temps/function scratch + * x7-x9 - FFT table state + * x10-x17 - lookup table/macro scratch + * w19-w20 - current/target length when needed + * x21-x22 - len*2, len*6 + * + * Vectors: v0-v7 - coefficients + * v8-v15 - coefficients when needed, otherwise untouched + * v16-v30 - used as needed + * v31 - -1.0, +1.0, -1.0, +1.0. Never touched after loading. + * + * Stack: backup for v8-v15 and x19-x22 when needed, and transform lengths + */ + +#define M_SQRT1_2 0.707106781186547524401 +#define COS16_1 0.92387950420379638671875 +#define COS16_3 0.3826834261417388916015625 + +/* We only ever load this once at the start, and then live with losing an + * entire register as we need to lug this all the time everywhere. + * Clearly should be integrated into an fsadd and fmlsa, but "muh RISC!". */ +const subadd, align=4 + .float -1.0, 1.0, -1.0, 1.0 +endconst + +.macro LOAD_SUBADD + movrel x5, subadd + ld1 { v31.4s }, [x5] +.endm + +.macro SETUP_LUT no_lut=0 +.if \no_lut == 0 + ldr x4, [x0, #8] +.endif +.endm + +.macro LOAD_INPUT dst1, dst2, dst3, dst4, src, no_lut=0, discont=0 +.if \no_lut == 1 +.if \discont == 1 + ldp q\dst1\(), q\dst2\(), [\src\()] + ldp q\dst3\(), q\dst4\(), [\src\(), #32] + add \src\(), \src\(), #64 +.else + ld1 { v\dst1\().4s, v\dst2\().4s, v\dst3\().4s, v\dst4\().4s }, [\src], #64 +.endif +.else + ldp w10, w11, [x4, #0 ] + ldp w12, w13, [x4, #8 ] + ldp w14, w15, [x4, #16] + ldp w16, w17, [x4, #24] + + add x4, x4, #32 + + ldr d\dst1, [\src, x10, lsl #3] + add x11, \src, x11, lsl #3 + ldr d\dst2, [\src, x12, lsl #3] + add x13, \src, x13, lsl #3 + ldr d\dst3, [\src, x14, lsl #3] + add x15, \src, x15, lsl #3 + ldr d\dst4, [\src, x16, lsl #3] + add x17, \src, x17, lsl #3 + + ld1 { v\dst1\().d }[1], [x11] + ld1 { v\dst2\().d }[1], [x13] + ld1 { v\dst3\().d }[1], [x15] + ld1 { v\dst4\().d }[1], [x17] +.endif +.endm + +.macro FFT4 e0, o0, standalone + fadd v16.4s, \e0\().4s, \o0\().4s // r1..4 + fsub \e0\().4s, \e0\().4s, \o0\().4s // t1..4 + + rev64 v18.4s, \e0\().4s + + zip2 \o0\().2d, v16.2d, \e0\().2d + zip1 v17.2d, v16.2d, \e0\().2d + + mov \o0\().d[1], v18.d[1] + + fadd \e0\().4s, v17.4s, \o0\().4s // a1,2 b1,4 + fsub v16.4s, v17.4s, \o0\().4s // a3,4 b3,2 + + mov \o0\().16b, v16.16b // Swap once again... + mov \o0\().s[3], \e0\().s[3] + mov \e0\().s[3], v16.s[3] + +.if \standalone == 0 + uzp2 \o0\().2d, \e0\().2d, \o0\().2d + uzp1 \e0\().2d, \e0\().2d, v16.2d +.endif +.endm + +const shuf_4pt_x2, align=4 + .byte 24, 25, 26, 27 // reg2, 3 + .byte 12, 13, 14, 15 // reg1, 4 + .byte 8, 9, 10, 11 // reg1, 3 + .byte 28, 29, 30, 31 // reg2, 4 +endconst + +// Identical to FFT4, but does 2 transforms in parallel, with no deinterleaving +.macro FFT4_X2 e0, o0, e1, o1, \ + t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22 + + fadd \t0\().4s, \e0\().4s, \o0\().4s // r1234 + fadd \t2\().4s, \e1\().4s, \o1\().4s // r1234 + fsub \e0\().4s, \e0\().4s, \o0\().4s // t1234 + fsub \e1\().4s, \e1\().4s, \o1\().4s // t1234 + + movrel x5, shuf_4pt_x2 + + rev64 \t4\().4s, \e0\().4s + rev64 \t5\().4s, \e1\().4s + + zip2 \o0\().2d, \t0\().2d, \e0\().2d // t3,4 r3,4 + zip2 \o1\().2d, \t2\().2d, \e1\().2d // t3,4 r3,4 + + ld1 { \t6\().16b }, [x5] + + mov \o0\().d[1], \t4\().d[1] + mov \o1\().d[1], \t5\().d[1] + + zip1 \t1\().2d, \t0\().2d, \e0\().2d // t1,2 r1,2 + zip1 \t3\().2d, \t2\().2d, \e1\().2d // t1,2 r1,2 + + fsub \t4\().4s, \t1\().4s, \o0\().4s // a34 b32 + fadd \t5\().4s, \t1\().4s, \o0\().4s // a12 b14 + fsub \t2\().4s, \t3\().4s, \o1\().4s // a34 b32 + fadd \t3\().4s, \t3\().4s, \o1\().4s // a12 b14 + + // TODO: experiment with movs instead of tables here + tbl \o0\().16b, { \t4\().16b, \t5\().16b }, \t6\().16b // b1234 + tbl \o1\().16b, { \t2\().16b, \t3\().16b }, \t6\().16b // b1234 + + zip1 \e0\().2d, \t5\().2d, \t4\().2d // a1234 +// zip2 \o0\().2d, \t5\().2d, \t4\().2d // b1432 + zip1 \e1\().2d, \t3\().2d, \t2\().2d // a1234 +// zip2 \o1\().2d, \t3\().2d, \t2\().2d // b1432 +// rev64 \o0\().4s, \o0\().4s // b4123 +// rev64 \o1\().4s, \o1\().4s // b4123 +// ext \o0\().16b, \o0\().16b, \o0\().16b, #4 // b1234 +// ext \o1\().16b, \o1\().16b, \o1\().16b, #4 // b1234 +.endm + +const tab_8pt, align=4 + .float M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2 +endconst + +.macro FFT8 e0, e1, o0, o1, \ + t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22 + + movrel x5, tab_8pt + + fsub \t1\().4s, \e1\().4s, \o1\().4s // j1234 + fadd \o1\().4s, \e1\().4s, \o1\().4s // k1234 + fsub \t0\().4s, \e0\().4s, \o0\().4s // r1234 + fadd \o0\().4s, \e0\().4s, \o0\().4s // q1234 + + ld1 { \t5\().4s }, [x5] + + ext \t4\().16b, \o1\().16b, \o1\().16b, #12 + rev64 \t4\().4s, \t4\().4s + + ext \t2\().16b, \o0\().16b, \t4\().16b, #8 // o0[0,1], o1[3,2] + mov \o0\().d[1], \t4\().d[1] // o0[3, 4]; o1[1, 4] + + fsub \e1\().4s, \o0\().4s, \t2\().4s // s34, g43 + fadd \t2\().4s, \o0\().4s, \t2\().4s // s12, g12 + + rev64 \t6\().4s, v31.4s // 1, -1, 1, -1 + dup \o0\().2d, \t0\().d[0] // r1212 + dup \o1\().2d, \t0\().d[1] // r3434 + + rev64 \t4\().4s, \e1\().4s // xxg34 + rev64 \o1\().4s, \o1\().4s // r4343 + + ext \t6\().16b, v31.16b, \t6\().16b, #8 // -1, 1, 1, -1 + zip1 \t3\().2d, \t2\().2d, \e1\().2d // s1234 + zip2 \t2\().2d, \t2\().2d, \t4\().2d // g1234 + + fadd \e0\().4s, \t3\().4s, \t2\().4s // out_e1 + fsub \e1\().4s, \t3\().4s, \t2\().4s // out_e2 + + fmul \t1\().4s, \t1\().4s, \t5\().4s // j * +--+M_SQRT1_2 + fmls \o0\().4s, \o1\().4s, \t6\().4s // z1234 + + rev64 \t4\().4s, \t1\().4s // j2143 + fmla \t1\().4s, \t4\().4s, v31.4s // l2143 + + rev64 \t4\().4s, \t1\().4s // l1234 + ext \t4\().16b, \t4\().16b, \t4\().16b, #8 // l3412 + + fmla \t4\().4s, \t1\().4s, v31.4s // t1234 + + fadd \o1\().4s, \o0\().4s, \t4\().4s // out_o2 + fsub \o0\().4s, \o0\().4s, \t4\().4s // out_o1 +.endm + +// Identical as FFT8, but does 2 transforms in parallel +.macro FFT8_X2 e0, e1, o0, o1, e2, e3, o2, o3 + + movrel x5, tab_8pt + + fadd v19.4s, \e3\().4s, \o3\().4s // k1234 + fadd v17.4s, \e1\().4s, \o1\().4s // k1234 + fadd v18.4s, \e2\().4s, \o2\().4s // q1234 + fadd v16.4s, \e0\().4s, \o0\().4s // q1234 + + ld1 { v23.4s }, [x5] + + ext v22.16b, v19.16b, v19.16b, #12 + ext v21.16b, v17.16b, v17.16b, #12 + + rev64 v22.4s, v22.4s + rev64 v21.4s, v21.4s + + ext v19.16b, v18.16b, v22.16b, #8 + ext v17.16b, v16.16b, v21.16b, #8 + + mov v18.d[1], v22.d[1] + mov v21.d[0], v16.d[0] + + fadd v22.4s, v18.4s, v19.4s // s12, g12 + fsub v19.4s, v18.4s, v19.4s // s34, g43 + fsub v18.4s, v21.4s, v17.4s // s34, g43 + fadd v16.4s, v21.4s, v17.4s // s12, g12 + + fsub \e0\().4s, \e0\().4s, \o0\().4s // r1234 + fsub v20.4s, \e1\().4s, \o1\().4s // j1234 + fsub \e2\().4s, \e2\().4s, \o2\().4s // r1234 + fsub v21.4s, \e3\().4s, \o3\().4s // j1234 + + rev64 v24.4s, v31.4s // 1, -1, 1, -1 + zip1 v17.2d, v16.2d, v18.2d // s1234 + zip1 \e1\().2d, v22.2d, v19.2d // s1234 + + rev64 v18.4s, v18.4s // xxg34 + rev64 v19.4s, v19.4s // xxg34 + + zip2 v16.2d, v16.2d, v18.2d // g1234 + zip2 \e3\().2d, v22.2d, v19.2d // g1234 + + dup \o0\().2d, \e0\().d[0] // r1212 + dup \o1\().2d, \e0\().d[1] // r3434 + dup \o2\().2d, \e2\().d[0] // r1212 + dup \o3\().2d, \e2\().d[1] // r3434 + + fadd \e2\().4s, \e1\().4s, \e3\().4s // out_e1 + fsub \e3\().4s, \e1\().4s, \e3\().4s // out_e2 + fadd \e0\().4s, v17.4s, v16.4s // out_e1 + fsub \e1\().4s, v17.4s, v16.4s // out_e2 + + ext v24.16b, v31.16b, v24.16b, #8 // -1, 1, 1, -1 + rev64 \o1\().4s, \o1\().4s // r4343 + rev64 \o3\().4s, \o3\().4s // r4343 + + fmul v19.4s, v20.4s, v23.4s // j * +--+M_SQRT1_2 + fmul v21.4s, v21.4s, v23.4s // j * +--+M_SQRT1_2 + + rev64 v20.4s, v19.4s // j2143 + rev64 v18.4s, v21.4s // j2143 + + fmls \o0\().4s, \o1\().4s, v24.4s // z1234 + fmls \o2\().4s, \o3\().4s, v24.4s // z1234 + + fmla v19.4s, v20.4s, v31.4s // l2143 + fmla v21.4s, v18.4s, v31.4s // l2143 + + rev64 v20.4s, v19.4s // l1234 + rev64 v18.4s, v21.4s // l1234 + ext v20.16b, v20.16b, v20.16b, #8 // l3412 + ext v18.16b, v18.16b, v18.16b, #8 // l3412 + + fmla v20.4s, v19.4s, v31.4s // t1234 + fmla v18.4s, v21.4s, v31.4s // t1234 + + fadd \o1\().4s, \o0\().4s, v20.4s // out_o2 + fadd \o3\().4s, \o2\().4s, v18.4s // out_o2 + fsub \o0\().4s, \o0\().4s, v20.4s // out_o1 + fsub \o2\().4s, \o2\().4s, v18.4s // out_o1 +.endm + +const tab_16pt, align=4 + .float -COS16_1, COS16_1, -COS16_3, COS16_3 // Could be +-+- too + .float COS16_3, COS16_3, COS16_1, COS16_1 + .float 1.0, 1.0, M_SQRT1_2, M_SQRT1_2 +endconst + +// 16-point FFT +// t3, t4, t5, t6 must be sequential +.macro FFT16 e0, e1, e2, e3, o0, o1, o2, o3, \ + t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22 + + FFT8 \e0, \e1, \e2, \e3, \t0, \t1, \t2, \t3, \t4, \t5, \t6 + FFT4_X2 \o0, \o1, \o2, \o3, \t0, \t1, \t2, \t3, \t4, \t5, \t6 + + movrel x5, tab_16pt + + rev64 \t0\().4s, \o0\().4s // z[ 8, 9].imre + rev64 \t1\().4s, \o2\().4s // z[10,11].imre + + ins \t0\().d[0], xzr + ins \t1\().d[0], xzr + + ld1 { \t4\().4s, \t5\().4s, \t6\().4s }, [x5] + // TODO: We could derive \t4\() or \t5\() from either, but it seems cheaper to load + + fmla \o2\().4s, \t1\().4s, v31.4s // s[4567] + fmls \o0\().4s, \t0\().4s, v31.4s // s[0123] + + fmul \t2\().4s, \o1\().4s, \t4\().4s + fmul \t3\().4s, \o3\().4s, \t4\().4s + + rev64 \o3\().4s, \o3\().4s + rev64 \o1\().4s, \o1\().4s + + fmla \t3\().4s, \o3\().4s, \t5\().4s // s[12, 13, 14, 15] + fmls \t2\().4s, \o1\().4s, \t5\().4s // s[ 8, 9, 10, 11] + + fmul \t1\().4s, \o2\().4s, \t6\().4s // s[4567] * mult + fmul \t0\().4s, \o0\().4s, \t6\().4s // s[0123] * mult + + mov \o1\().16b, \t3\().16b + mov \o2\().16b, \t1\().16b + + fsub \t3\().4s, \t3\().4s, \t2\().4s // y34, u34 + fsub \t1\().4s, \t1\().4s, \t0\().4s // w34, x34 + + fadd \t2\().4s, \t2\().4s, \o1\().4s // y56, u56 + rev64 \t3\().4s, \t3\().4s + fadd \t0\().4s, \t0\().4s, \o2\().4s // w56, x56 + rev64 \t1\().4s, \t1\().4s + + fmul \t2\().4s, \t2\().4s, v31.4s + fmul \t1\().4s, \t1\().4s, v31.4s + + fadd \o3\().4s, \e3\().4s, \t3\().4s + fsub \o2\().4s, \e3\().4s, \t3\().4s + fsub \o1\().4s, \e2\().4s, \t2\().4s + fadd \o0\().4s, \e2\().4s, \t2\().4s + + fsub \e2\().4s, \e0\().4s, \t0\().4s + fadd \e0\().4s, \e0\().4s, \t0\().4s + fsub \e3\().4s, \e1\().4s, \t1\().4s + fadd \e1\().4s, \e1\().4s, \t1\().4s +.endm + +function ff_tx_fft2_float_neon, export=1 + ld2r { v0.2d, v1.2d }, [x2] + + fneg v2.2s, v1.2s + mov v2.d[1], v1.d[0] + + fsub v2.4s, v0.4s, v2.4s + + st1 { v2.4s }, [x1] + ret +endfunc + +.macro FFT4_FN name, inv +function ff_tx_fft4_\name\()_float_neon, export=1 + ld1 {v0.4s, v1.4s}, [x2] + +.if \inv == 1 + mov v2.d[0], v0.d[1] + mov v0.d[1], v1.d[1] + mov v1.d[1], v2.d[0] +.endif + + FFT4 v0, v1, 1 + + st1 { v0.4s, v1.4s }, [x1] + ret +endfunc +.endm + +FFT4_FN fwd, 0 +FFT4_FN inv, 1 + +.macro FFT8_FN name, no_perm +function ff_tx_fft8_\name\()_neon, export=1 + SETUP_LUT \no_perm + LOAD_INPUT 0, 1, 2, 3, x2, \no_perm + + LOAD_SUBADD + FFT8 v0, v1, v2, v3 + + zip1 v16.2d, v0.2d, v2.2d + zip2 v17.2d, v0.2d, v2.2d + zip1 v18.2d, v1.2d, v3.2d + zip2 v19.2d, v1.2d, v3.2d + st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x1] + + ret +endfunc +.endm + +FFT8_FN float, 0 +FFT8_FN ns_float, 1 + +.macro FFT16_FN name, no_perm +function ff_tx_fft16_\name\()_neon, export=1 + SETUP_LUT \no_perm + LOAD_INPUT 0, 1, 2, 3, x2, \no_perm + LOAD_INPUT 4, 5, 6, 7, x2, \no_perm + + LOAD_SUBADD + FFT16 v0, v1, v2, v3, v4, v5, v6, v7 + + zip1 v20.2d, v0.2d, v4.2d + zip2 v21.2d, v0.2d, v4.2d + zip1 v22.2d, v1.2d, v6.2d + zip2 v23.2d, v1.2d, v6.2d + st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x1], #64 + + zip1 v24.2d, v2.2d, v5.2d + zip2 v25.2d, v2.2d, v5.2d + zip1 v26.2d, v3.2d, v7.2d + zip2 v27.2d, v3.2d, v7.2d + st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x1] + + ret +endfunc +.endm + +FFT16_FN float, 0 +FFT16_FN ns_float, 1 + +.macro SETUP_SR_RECOMB len, re, im, dec + ldr w5, =(\len - 4*7) + movrel \re, X(ff_tx_tab_\len\()_float) + add \im, \re, x5 + mov \dec, #-32 + +.if \len > 32 + mov x21, #2*\len + add x22, x21, x21, lsl #1 +.endif +.endm + +.macro SR_COMBINE e0, e1, e2, e3, e4, e5, e6, e7, \ + o0, o1, o2, o3, o4, o5, o6, o7, \ + re, im, dec, swap_im, \ + t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, \ + t6=v22, t7=v23, t8=v24, t9=v25, ta=v26, tb=v27 + + ld1 { \t8\().4s, \t9\().4s }, [\im], \dec + ld1 { \t0\().4s, \t1\().4s }, [\re], #32 + +.if \swap_im == 1 + ext \t2\().16b, \t9\().16b, \t9\().16b, #8 + ext \t3\().16b, \t8\().16b, \t8\().16b, #8 +.else + ext \t2\().16b, \t8\().16b, \t8\().16b, #8 + ext \t3\().16b, \t9\().16b, \t9\().16b, #8 +.endif + + trn1 \t4\().4s, \t0\().4s, \t0\().4s // cos0022 + trn2 \t0\().4s, \t0\().4s, \t0\().4s // cos4466 + trn1 \t5\().4s, \t1\().4s, \t1\().4s // cos1133 + trn2 \t1\().4s, \t1\().4s, \t1\().4s // cos5577 + + rev64 \t6\().4s, \o0\().4s // E m2[0,1].imre + rev64 \t7\().4s, \o2\().4s // O m2[0,1].imre + rev64 \t8\().4s, \o4\().4s // E m2[2,3].imre + rev64 \t9\().4s, \o6\().4s // O m2[2,3].imre + + fmul \t6\().4s, \t6\().4s, \t4\().4s // E m2[0,1].imre*t1[0,2] + fmul \t7\().4s, \t7\().4s, \t0\().4s // O m2[0,1].imre*t1[0,2] + fmul \t8\().4s, \t8\().4s, \t4\().4s // E m2[2,3].imre*t1[0,2] + fmul \t9\().4s, \t9\().4s, \t0\().4s // O m2[2,3].imre*t1[0,2] + + rev64 \ta\().4s, \o1\().4s // E m3[0,1].imre + rev64 \tb\().4s, \o3\().4s // O m3[0,1].imre + rev64 \t4\().4s, \o5\().4s // E m3[2,3].imre + rev64 \t0\().4s, \o7\().4s // O m3[2,3].imre + + fmul \ta\().4s, \ta\().4s, \t5\().4s // E m3[0,1].imre*t1[4,6] + fmul \tb\().4s, \tb\().4s, \t1\().4s // O m3[0,1].imre*t1[4,6] + fmul \t4\().4s, \t4\().4s, \t5\().4s // E m3[2,3].imre*t1[4,6] + fmul \t0\().4s, \t0\().4s, \t1\().4s // O m3[2,3].imre*t1[4,6] + + trn1 \t5\().4s, \t3\().4s, \t3\().4s // wim2200 + trn2 \t3\().4s, \t3\().4s, \t3\().4s // wim3311 + trn1 \t1\().4s, \t2\().4s, \t2\().4s // wim6644 + trn2 \t2\().4s, \t2\().4s, \t2\().4s // wim7755 + + fmul \t5\().4s, \t5\().4s, v31.4s + fmul \t3\().4s, \t3\().4s, v31.4s + fmul \t1\().4s, \t1\().4s, v31.4s + fmul \t2\().4s, \t2\().4s, v31.4s + + fmla \t7\().4s, \o2\().4s, \t5\().4s // O w0123 + fmls \t9\().4s, \o6\().4s, \t5\().4s // O j0123 + fmla \t6\().4s, \o0\().4s, \t3\().4s // E w0123 + fmls \t8\().4s, \o4\().4s, \t3\().4s // E j0123 + + fmla \ta\().4s, \o1\().4s, \t2\().4s // E w4567 + fmla \tb\().4s, \o3\().4s, \t1\().4s // O w4567 + fmls \t4\().4s, \o5\().4s, \t2\().4s // E j4567 + fmls \t0\().4s, \o7\().4s, \t1\().4s // O j4567 + + fsub \t2\().4s, \t7\().4s, \t9\().4s + fsub \t1\().4s, \t8\().4s, \t6\().4s + fsub \t3\().4s, \t4\().4s, \ta\().4s + fsub \t5\().4s, \t0\().4s, \tb\().4s + + fadd \t6\().4s, \t8\().4s, \t6\().4s + fadd \t7\().4s, \t9\().4s, \t7\().4s + fadd \t8\().4s, \t4\().4s, \ta\().4s + fadd \t9\().4s, \t0\().4s, \tb\().4s + + fmul \t1\().4s, \t1\().4s, v31.4s + fmul \t2\().4s, \t2\().4s, v31.4s + fmul \t3\().4s, \t3\().4s, v31.4s + fmul \t5\().4s, \t5\().4s, v31.4s + + rev64 \t6\().4s, \t6\().4s + rev64 \t8\().4s, \t8\().4s + rev64 \t7\().4s, \t7\().4s + rev64 \t9\().4s, \t9\().4s + + fsub \o0\().4s, \e0\().4s, \t6\().4s + fsub \o1\().4s, \e1\().4s, \t8\().4s + fsub \o2\().4s, \e2\().4s, \t1\().4s + fsub \o3\().4s, \e3\().4s, \t3\().4s + + fsub \o4\().4s, \e4\().4s, \t7\().4s + fsub \o5\().4s, \e6\().4s, \t9\().4s + fadd \o6\().4s, \e5\().4s, \t2\().4s + fsub \o7\().4s, \e7\().4s, \t5\().4s + + fadd \e0\().4s, \e0\().4s, \t6\().4s + fadd \e1\().4s, \e1\().4s, \t8\().4s + fadd \e2\().4s, \e2\().4s, \t1\().4s + fadd \e3\().4s, \e3\().4s, \t3\().4s + + fadd \e4\().4s, \e4\().4s, \t7\().4s + fsub \e5\().4s, \e5\().4s, \t2\().4s // swapped + fadd \e6\().4s, \e6\().4s, \t9\().4s // swapped + fadd \e7\().4s, \e7\().4s, \t5\().4s +.endm + +.macro SR_COMBINE_HALF e0, e1, e2, e3, \ + o0, o1, o2, o3, \ + c0, c1, c2, c3, \ + t0, t1, t2, t3, t4, t5, part + +.if \part == 0 + trn1 \t4\().4s, \c0\().4s, \c0\().4s // cos0022 + trn1 \c1\().4s, \c1\().4s, \c1\().4s // cos1133 +.else + trn2 \t4\().4s, \c0\().4s, \c0\().4s // cos0022 + trn2 \c1\().4s, \c1\().4s, \c1\().4s // cos1133 +.endif +.if \part == 0 + trn2 \t5\().4s, \c2\().4s, \c2\().4s // wim7755 + trn2 \c3\().4s, \c3\().4s, \c3\().4s // wim3311 +.else + trn1 \t5\().4s, \c2\().4s, \c2\().4s // wim7755 + trn1 \c3\().4s, \c3\().4s, \c3\().4s // wim3311 +.endif + + fmul \t5\().4s, \t5\().4s, v31.4s + fmul \c3\().4s, \c3\().4s, v31.4s + + rev64 \t0\().4s, \o0\().4s // E m2[0,1].imre + rev64 \t1\().4s, \o2\().4s // E m2[2,3].imre + rev64 \t2\().4s, \o1\().4s // E m3[0,1].imre + rev64 \t3\().4s, \o3\().4s // E m3[2,3].imre + + fmul \o0\().4s, \o0\().4s, \c3\().4s // E m2[0,1].imre*t1[0,2] + fmul \o1\().4s, \o1\().4s, \t5\().4s // E m3[0,1].imre*t1[4,6] + fmla \o0\().4s, \t0\().4s, \t4\().4s // E w0123 + fmla \o1\().4s, \t2\().4s, \c1\().4s // E w4567 + + fmul \t1\().4s, \t1\().4s, \t4\().4s // E m2[2,3].imre*t1[0,2] + fmul \t3\().4s, \t3\().4s, \c1\().4s // E m3[2,3].imre*t1[4,6] + fmls \t1\().4s, \o2\().4s, \c3\().4s // E j0123 + fmls \t3\().4s, \o3\().4s, \t5\().4s // E j4567 + + fsub \t0\().4s, \t1\().4s, \o0\().4s + fadd \t1\().4s, \t1\().4s, \o0\().4s + fadd \t2\().4s, \t3\().4s, \o1\().4s + fsub \t3\().4s, \t3\().4s, \o1\().4s + + fmul \t0\().4s, \t0\().4s, v31.4s + fmul \t3\().4s, \t3\().4s, v31.4s + + rev64 \t1\().4s, \t1\().4s + rev64 \t2\().4s, \t2\().4s + +.if \part == 0 + fsub \o0\().4s, \e0\().4s, \t1\().4s + fsub \o1\().4s, \e1\().4s, \t2\().4s + fsub \o2\().4s, \e2\().4s, \t0\().4s + fsub \o3\().4s, \e3\().4s, \t3\().4s +.else + fsub \o0\().4s, \e0\().4s, \t1\().4s + fadd \o2\().4s, \e1\().4s, \t2\().4s + fsub \o1\().4s, \e2\().4s, \t0\().4s + fadd \o3\().4s, \e3\().4s, \t3\().4s +.endif + +.if \part == 0 + fadd \e0\().4s, \e0\().4s, \t1\().4s + fadd \e1\().4s, \e1\().4s, \t2\().4s + fadd \e2\().4s, \e2\().4s, \t0\().4s + fadd \e3\().4s, \e3\().4s, \t3\().4s +.else + fadd \e0\().4s, \e0\().4s, \t1\().4s + fsub \e1\().4s, \e1\().4s, \t2\().4s // swapped + fadd \e2\().4s, \e2\().4s, \t0\().4s // swapped + fsub \e3\().4s, \e3\().4s, \t3\().4s +.endif +.endm + +/* Same as SR_COMBINE_HALF, but heroically tries to use 3 temporary registers + * without touching the tables. */ +.macro SR_COMBINE_LITE e0, e1, e2, e3, \ + o0, o1, o2, o3, \ + c0, c1, c2, c3, \ + t0, t1, t2, part + + rev64 \t0\().4s, \o0\().4s // E m2[0,1].imre + rev64 \t1\().4s, \o2\().4s // E m2[2,3].imre +.if \part == 0 + trn2 \t2\().4s, \c3\().4s, \c3\().4s // wim3311 +.else + trn1 \t2\().4s, \c3\().4s, \c3\().4s // wim3311 +.endif + fmul \t2\().4s, \t2\().4s, v31.4s + fmul \o2\().4s, \o2\().4s, \t2\().4s + fmul \o0\().4s, \o0\().4s, \t2\().4s // E m2[0,1].imre*t1[0,2] +.if \part == 0 + trn1 \t2\().4s, \c0\().4s, \c0\().4s // cos0022 +.else + trn2 \t2\().4s, \c0\().4s, \c0\().4s // cos0022 +.endif + fmul \t1\().4s, \t1\().4s, \t2\().4s // E m2[2,3].imre*t1[0,2] + fmla \o0\().4s, \t0\().4s, \t2\().4s // E w0123 + fsub \t1\().4s, \t1\().4s, \o2\().4s // E j0123 + + rev64 \t2\().4s, \o1\().4s // E m3[0,1].imre + rev64 \o2\().4s, \o3\().4s // E m3[2,3].imre + +.if \part == 0 + trn2 \t0\().4s, \c2\().4s, \c2\().4s // wim7755 +.else + trn1 \t0\().4s, \c2\().4s, \c2\().4s // wim7755 +.endif + fmul \t0\().4s, \t0\().4s, v31.4s + + fmul \o1\().4s, \o1\().4s, \t0\().4s // E m3[0,1].imre*t1[4,6] + fmul \o3\().4s, \o3\().4s, \t0\().4s + +.if \part == 0 + trn1 \t0\().4s, \c1\().4s, \c1\().4s // cos1133 +.else + trn2 \t0\().4s, \c1\().4s, \c1\().4s // cos1133 +.endif + fmul \o2\().4s, \o2\().4s, \t0\().4s // E m3[2,3].imre*t1[4,6] + fmla \o1\().4s, \t2\().4s, \t0\().4s // E w4567 + fsub \o2\().4s, \o2\().4s, \o3\().4s // E j4567 + + fsub \t0\().4s, \t1\().4s, \o0\().4s + fadd \o0\().4s, \t1\().4s, \o0\().4s + fadd \t2\().4s, \o2\().4s, \o1\().4s + fsub \t1\().4s, \o2\().4s, \o1\().4s + + fmul \t0\().4s, \t0\().4s, v31.4s + fmul \t1\().4s, \t1\().4s, v31.4s + + rev64 \t2\().4s, \t2\().4s + rev64 \o0\().4s, \o0\().4s + +.if \part == 0 + fsub \o1\().4s, \e1\().4s, \t2\().4s + fsub \o2\().4s, \e2\().4s, \t0\().4s + fsub \o3\().4s, \e3\().4s, \t1\().4s +.else + fadd \o2\().4s, \e1\().4s, \t0\().4s + fsub \o1\().4s, \e2\().4s, \t2\().4s + fadd \o3\().4s, \e3\().4s, \t1\().4s +.endif + +.if \part == 0 + fadd \e1\().4s, \e1\().4s, \t2\().4s + fadd \e2\().4s, \e2\().4s, \t0\().4s + fadd \e3\().4s, \e3\().4s, \t1\().4s +.else + fsub \e1\().4s, \e1\().4s, \t0\().4s // swapped + fadd \e2\().4s, \e2\().4s, \t2\().4s // swapped + fsub \e3\().4s, \e3\().4s, \t1\().4s +.endif + + mov \t1\().16b, \o0\().16b + + fsub \o0\().4s, \e0\().4s, \t1\().4s + fadd \e0\().4s, \e0\().4s, \t1\().4s +.endm + +.macro SR_COMBINE_4 len, part, off + add x10, x1, x21 + add x11, x1, x21, lsl #1 + add x12, x1, x22 + + ldp q0, q1, [x1, #((0 + \part)*32 + \off)] + ldp q4, q5, [x1, #((2 + \part)*32 + \off)] + ldp q2, q3, [x10, #((0 + \part)*32 + \off)] + ldp q6, q7, [x10, #((2 + \part)*32 + \off)] + + ldp q8, q9, [x11, #((0 + \part)*32 + \off)] + ldp q10, q11, [x11, #((2 + \part)*32 + \off)] + ldp q12, q13, [x12, #((0 + \part)*32 + \off)] + ldp q14, q15, [x12, #((2 + \part)*32 + \off)] + + SR_COMBINE v0, v1, v2, v3, v4, v6, v5, v7, \ + v8, v9, v10, v11, v12, v13, v14, v15, \ + x7, x8, x9, 0 + + stp q0, q1, [x1, #((0 + \part)*32 + \off)] + stp q4, q5, [x1, #((2 + \part)*32 + \off)] + stp q2, q3, [x10, #((0 + \part)*32 + \off)] + stp q6, q7, [x10, #((2 + \part)*32 + \off)] + + stp q8, q9, [x11, #((0 + \part)*32 + \off)] + stp q12, q13, [x11, #((2 + \part)*32 + \off)] + stp q10, q11, [x12, #((0 + \part)*32 + \off)] + stp q14, q15, [x12, #((2 + \part)*32 + \off)] +.endm + +.macro SR_COMBINE_FULL len, off=0 + add x10, x1, x21 + add x11, x1, x21, lsl #1 + add x12, x1, x22 + + SR_COMBINE_4 \len, 0, \off + SR_COMBINE_4 \len, 1, \off + SR_COMBINE_4 \len, 4, \off + SR_COMBINE_4 \len, 5, \off +.endm + +.macro SR_COMBINE_D2 part, off + add x10, x1, #((\part)*32 + \off) + add x11, x14, #((\part)*32 + \off) + add x12, x15, #((\part)*32 + \off) + add x13, x16, #((\part)*32 + \off) + + ldp q0, q1, [x10] + ldp q4, q5, [x10, #(2*32)] + ldp q2, q3, [x11] + ldp q6, q7, [x11, #(2*32)] + + ldp q8, q9, [x12] + ldp q10, q11, [x12, #(2*32)] + ldp q12, q13, [x13] + ldp q14, q15, [x13, #(2*32)] + + SR_COMBINE v0, v1, v2, v3, v4, v6, v5, v7, \ + v8, v9, v10, v11, v12, v13, v14, v15, \ + x7, x8, x9, 0, \ + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27 + + zip1 v16.2d, v0.2d, v4.2d + zip2 v17.2d, v0.2d, v4.2d + zip1 v18.2d, v1.2d, v5.2d + zip2 v19.2d, v1.2d, v5.2d + + zip1 v20.2d, v2.2d, v6.2d + zip2 v21.2d, v2.2d, v6.2d + zip1 v22.2d, v3.2d, v7.2d + zip2 v23.2d, v3.2d, v7.2d + + ldp q0, q1, [x10, #(1*32)] + ldp q4, q5, [x10, #(3*32)] + ldp q2, q3, [x11, #(1*32)] + ldp q6, q7, [x11, #(3*32)] + + st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x10], #64 + st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x11], #64 + + zip1 v20.2d, v8.2d, v12.2d + zip2 v21.2d, v8.2d, v12.2d + zip1 v22.2d, v9.2d, v13.2d + zip2 v23.2d, v9.2d, v13.2d + zip1 v24.2d, v10.2d, v14.2d + zip2 v25.2d, v10.2d, v14.2d + zip1 v26.2d, v11.2d, v15.2d + zip2 v27.2d, v11.2d, v15.2d + + ldp q8, q9, [x12, #(1*32)] + ldp q10, q11, [x12, #(3*32)] + ldp q12, q13, [x13, #(1*32)] + ldp q14, q15, [x13, #(3*32)] + + st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x12], #64 + st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x13], #64 + + SR_COMBINE v0, v1, v2, v3, v4, v6, v5, v7, \ + v8, v9, v10, v11, v12, v13, v14, v15, \ + x7, x8, x9, 0, \ + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27 + + zip1 v16.2d, v0.2d, v4.2d + zip2 v17.2d, v0.2d, v4.2d + zip1 v18.2d, v1.2d, v5.2d + zip2 v19.2d, v1.2d, v5.2d + st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x10] + + zip1 v16.2d, v2.2d, v6.2d + zip2 v17.2d, v2.2d, v6.2d + zip1 v18.2d, v3.2d, v7.2d + zip2 v19.2d, v3.2d, v7.2d + st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x11] + + zip1 v20.2d, v8.2d, v12.2d + zip2 v21.2d, v8.2d, v12.2d + zip1 v22.2d, v9.2d, v13.2d + zip2 v23.2d, v9.2d, v13.2d + st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x12] + + zip1 v24.2d, v10.2d, v14.2d + zip2 v25.2d, v10.2d, v14.2d + zip1 v26.2d, v11.2d, v15.2d + zip2 v27.2d, v11.2d, v15.2d + st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x13] +.endm + +.macro SR_COMBINE_DINT off=0 + add x14, x1, x21 + add x15, x1, x21, lsl #1 + add x16, x1, x22 + + SR_COMBINE_D2 0, \off + SR_COMBINE_D2 4, \off +.endm + +.macro FFT32_FN name, no_perm +function ff_tx_fft32_\name\()_neon, export=1 + stp d14, d15, [sp, #-16*4]! + stp d8, d9, [sp, #16*3] + stp d10, d11, [sp, #16*2] + stp d12, d13, [sp, #16] + + LOAD_SUBADD + SETUP_SR_RECOMB 32, x7, x8, x9 + + SETUP_LUT \no_perm + LOAD_INPUT 0, 1, 2, 3, x2, \no_perm + LOAD_INPUT 4, 5, 6, 7, x2, \no_perm + LOAD_INPUT 8, 9, 10, 11, x2, \no_perm + LOAD_INPUT 12, 13, 14, 15, x2, \no_perm + + FFT8_X2 v8, v9, v10, v11, v12, v13, v14, v15 + FFT16 v0, v1, v2, v3, v4, v5, v6, v7 + + SR_COMBINE v0, v1, v2, v3, v4, v5, v6, v7, \ + v8, v9, v10, v11, v12, v13, v14, v15, \ + x7, x8, x9, 0 + + zip1 v16.2d, v0.2d, v4.2d + zip2 v17.2d, v0.2d, v4.2d + zip1 v18.2d, v1.2d, v6.2d + zip2 v19.2d, v1.2d, v6.2d + st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x1], #64 + + zip1 v20.2d, v2.2d, v5.2d + zip2 v21.2d, v2.2d, v5.2d + zip1 v22.2d, v3.2d, v7.2d + zip2 v23.2d, v3.2d, v7.2d + st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x1], #64 + + zip1 v24.2d, v8.2d, v12.2d + zip2 v25.2d, v8.2d, v12.2d + zip1 v26.2d, v9.2d, v13.2d + zip2 v27.2d, v9.2d, v13.2d + st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x1], #64 + + zip1 v28.2d, v10.2d, v14.2d + zip2 v29.2d, v10.2d, v14.2d + zip1 v30.2d, v11.2d, v15.2d + zip2 v31.2d, v11.2d, v15.2d + st1 { v28.4s, v29.4s, v30.4s, v31.4s }, [x1] + + ldp d12, d13, [sp, #16] + ldp d10, d11, [sp, #16*2] + ldp d8, d9, [sp, #16*3] + ldp d14, d15, [sp], #16*4 + + ret +endfunc +.endm + +FFT32_FN float, 0 +FFT32_FN ns_float, 1 + +.macro cmp_imm reg, imm +.if \imm >= 4096 + cmp \reg, #((\imm)/4096), lsl #12 +.else + cmp \reg, #(\imm) +.endif +.endm + +.macro SR_TRANSFORM_DEF len, next=0 +\len: + stp x20, x30, [sp, #-16]! + mov w20, #(\len/4) + mov x5, #((\len*4) - (\len/1)) + add x1, x1, x5 + bl 32b + mov x5, #((\len*2) - (\len/2)) + add x1, x1, x5 + bl 32b + ldp x20, x30, [sp], #16 + ldr w5, =(\len*6 + \len/2) + sub x1, x1, x5 + + SETUP_SR_RECOMB \len, x7, x8, x9 + +.if \next\() != 0 + cmp_imm w19, \len + b.eq 0f + + mov w5, #(\len/128) +\len\()5: + SR_COMBINE_FULL \len + add x1, x1, 8*32 + subs w5, w5, 1 + b.gt \len\()5b + + cmp_imm w20, \len + b.gt \next\()f + ret +.endif +.endm + +.macro FFT_SPLIT_RADIX_FN name, no_perm +function ff_tx_fft_sr_\name\()_neon, export=1 + stp x21, x22, [sp, #-16*6]! + stp d8, d9, [sp, #16*5] + stp d10, d11, [sp, #16*4] + stp d12, d13, [sp, #16*3] + stp d14, d15, [sp, #16*2] + stp x19, x20, [sp, #16] + + ldr w19, [x0, #0] // global target + mov w20, w19 // local length + + LOAD_SUBADD + SETUP_LUT \no_perm + +32: + SETUP_SR_RECOMB 32, x7, x8, x9 + + LOAD_INPUT 0, 1, 2, 3, x2, \no_perm + LOAD_INPUT 4, 6, 5, 7, x2, \no_perm, 1 + LOAD_INPUT 8, 9, 10, 11, x2, \no_perm + LOAD_INPUT 12, 13, 14, 15, x2, \no_perm + + FFT8_X2 v8, v9, v10, v11, v12, v13, v14, v15 + FFT16 v0, v1, v2, v3, v4, v6, v5, v7 + + SR_COMBINE v0, v1, v2, v3, v4, v6, v5, v7, \ + v8, v9, v10, v11, v12, v13, v14, v15, \ + x7, x8, x9, 0 + + stp q2, q3, [x1, #32*1] + stp q6, q7, [x1, #32*3] + stp q10, q11, [x1, #32*5] + stp q14, q15, [x1, #32*7] + + cmp w20, #32 + b.gt 64f + + stp q0, q1, [x1, #32*0] + stp q4, q5, [x1, #32*2] + stp q8, q9, [x1, #32*4] + stp q12, q13, [x1, #32*6] + + ret +64: + SETUP_SR_RECOMB 64, x7, x8, x9 + + LOAD_INPUT 2, 3, 10, 11, x2, \no_perm, 1 + LOAD_INPUT 6, 14, 7, 15, x2, \no_perm, 1 + + FFT16 v2, v3, v10, v11, v6, v14, v7, v15 + + LOAD_INPUT 16, 17, 18, 19, x2, \no_perm + LOAD_INPUT 20, 22, 21, 23, x2, \no_perm, 1 + + FFT16 v16, v17, v18, v19, v20, v22, v21, v23, \ + v24, v25, v26, v27, v28, v29, v30 + + ld1 { v26.4s, v27.4s }, [x8], x9 + ldp q24, q25, [x7], #32 + + ext v26.16b, v26.16b, v26.16b, #8 + ext v27.16b, v27.16b, v27.16b, #8 + + cmp w19, #64 + b.eq 2f // custom deinterleave + + // TODO: investigate doing the 2 combines like in deinterleave + // TODO: experiment with spilling to gprs and converting to HALF or full + SR_COMBINE_LITE v0, v1, v8, v9, \ + v2, v3, v16, v17, \ + v24, v25, v26, v27, \ + v28, v29, v30, 0 + + stp q0, q1, [x1, #32* 0] + stp q8, q9, [x1, #32* 4] + stp q2, q3, [x1, #32* 8] + stp q16, q17, [x1, #32*12] + + SR_COMBINE_HALF v4, v5, v12, v13, \ + v6, v7, v20, v21, \ + v24, v25, v26, v27, \ + v28, v29, v30, v0, v1, v8, 1 + + stp q4, q20, [x1, #32* 2] + stp q12, q21, [x1, #32* 6] + stp q6, q5, [x1, #32*10] + stp q7, q13, [x1, #32*14] + + ldp q2, q3, [x1, #32*1] + ldp q6, q7, [x1, #32*3] + ldp q12, q13, [x1, #32*5] + ldp q16, q17, [x1, #32*7] + + SR_COMBINE v2, v3, v12, v13, v6, v16, v7, v17, \ + v10, v11, v14, v15, v18, v19, v22, v23, \ + x7, x8, x9, 0, \ + v24, v25, v26, v27, v28, v29, v30, v8, v0, v1, v4, v5 + + stp q2, q3, [x1, #32* 1] + stp q6, q7, [x1, #32* 3] + stp q12, q13, [x1, #32* 5] + stp q16, q17, [x1, #32* 7] + + stp q10, q11, [x1, #32* 9] + stp q18, q19, [x1, #32*11] + stp q14, q15, [x1, #32*13] + stp q22, q23, [x1, #32*15] + + cmp w20, #64 + b.gt 128f + ret +128: + stp x20, x30, [sp, #-16]! + mov w20, #32 + add x1, x1, #16*32 + bl 32b + add x1, x1, #8*32 + bl 32b + ldp x20, x30, [sp], #16 + sub x1, x1, #24*32 + + SETUP_SR_RECOMB 128, x7, x8, x9 + + cmp w19, #128 + b.eq 0f + + SR_COMBINE_FULL 128 + + cmp w20, #128 + b.gt 256f + ret +256: + stp x20, x30, [sp, #-16]! + mov w20, #64 + add x1, x1, #32*32 + bl 32b + add x1, x1, #16*32 + bl 32b + ldp x20, x30, [sp], #16 + sub x1, x1, #48*32 + + SETUP_SR_RECOMB 256, x7, x8, x9 + + cmp w19, #256 + b.eq 0f + + SR_COMBINE_FULL 256 + SR_COMBINE_FULL 256, 8*32 + + cmp w20, #256 + b.gt 512f + ret +512: + stp x20, x30, [sp, #-16]! + mov w20, #128 + add x1, x1, #64*32 + bl 32b + add x1, x1, #32*32 + bl 32b + ldp x20, x30, [sp], #16 + sub x1, x1, #96*32 + + SETUP_SR_RECOMB 512, x7, x8, x9 + + cmp w19, #512 + b.eq 0f + + mov x5, 4 +5125: + SR_COMBINE_FULL 512 + add x1, x1, 8*32 + subs w5, w5, 1 + b.gt 5125b + + cmp w20, #512 + b.gt 1024f + + ret +1024: + stp x20, x30, [sp, #-16]! + mov w20, #256 + add x1, x1, #96*32 + bl 32b + add x1, x1, #64*32 + bl 32b + ldp x20, x30, [sp], #16 + mov x5, #192*32 + sub x1, x1, x5 + + SETUP_SR_RECOMB 1024, x7, x8, x9 + + cmp w19, #1024 + b.eq 0f + + mov w5, 8 +10245: + SR_COMBINE_FULL 1024 + add x1, x1, 8*32 + subs w5, w5, 1 + b.gt 10245b + + cmp w20, #1024 + b.gt 2048f + + ret + +SR_TRANSFORM_DEF 2048, 4096 +SR_TRANSFORM_DEF 4096, 8192 +SR_TRANSFORM_DEF 8192, 16384 +SR_TRANSFORM_DEF 16384, 32768 +SR_TRANSFORM_DEF 32768, 65536 +SR_TRANSFORM_DEF 65536, 131072 +SR_TRANSFORM_DEF 131072 + +0: // general deinterleave loop + SR_COMBINE_DINT + add x1, x1, #32*8 + subs w19, w19, #32*4 + b.gt 0b + + ldp x19, x20, [sp, #16] + ldp d14, d15, [sp, #16*2] + ldp d12, d13, [sp, #16*3] + ldp d10, d11, [sp, #16*4] + ldp d8, d9, [sp, #16*5] + ldp x21, x22, [sp], #16*6 + + ret + +2: // special case for 64 point deinterleave + mov x10, v23.d[0] + mov x11, v23.d[1] + + SR_COMBINE_LITE v0, v1, v8, v9, \ + v2, v3, v16, v17, \ + v24, v25, v26, v27, \ + v28, v29, v30, 0 + + SR_COMBINE_HALF v4, v5, v12, v13, \ + v6, v7, v20, v21, \ + v24, v25, v26, v27, \ + v28, v29, v30, v23, v24, v26, 1 + + zip1 v23.2d, v0.2d, v4.2d + zip2 v24.2d, v0.2d, v4.2d + zip1 v25.2d, v1.2d, v20.2d + zip2 v26.2d, v1.2d, v20.2d + + zip1 v27.2d, v8.2d, v12.2d + zip2 v28.2d, v8.2d, v12.2d + zip1 v29.2d, v9.2d, v21.2d + zip2 v30.2d, v9.2d, v21.2d + + mov v20.16b, v5.16b + mov v21.16b, v7.16b + mov x12, x1 + add x13, x1, #32* 4 + add x14, x1, #32* 8 + add x15, x1, #32*12 + + zip1 v4.2d, v2.2d, v6.2d + zip2 v5.2d, v2.2d, v6.2d + zip1 v6.2d, v3.2d, v20.2d + zip2 v7.2d, v3.2d, v20.2d + + zip1 v0.2d, v16.2d, v21.2d + zip2 v1.2d, v16.2d, v21.2d + zip1 v2.2d, v17.2d, v13.2d + zip2 v3.2d, v17.2d, v13.2d + + // stp is faster by a little on A53, but this is faster on M1s (theory) + ldp q8, q9, [x1, #32*1] + ldp q12, q13, [x1, #32*5] + + st1 { v23.4s, v24.4s, v25.4s, v26.4s }, [x12], #64 // 32* 0...1 + st1 { v27.4s, v28.4s, v29.4s, v30.4s }, [x13], #64 // 32* 4...5 + st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x14], #64 // 32* 8...9 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x15], #64 // 32*12..13 + + mov v23.d[0], x10 + mov v23.d[1], x11 + + ldp q6, q7, [x1, #32*3] + ldp q16, q17, [x1, #32*7] + + SR_COMBINE v8, v9, v12, v13, v6, v16, v7, v17, \ + v10, v11, v14, v15, v18, v19, v22, v23, \ + x7, x8, x9, 0, \ + v24, v25, v26, v27, v28, v29, v30, v4, v0, v1, v5, v20 + + zip1 v0.2d, v8.2d, v6.2d + zip2 v1.2d, v8.2d, v6.2d + zip1 v2.2d, v9.2d, v7.2d + zip2 v3.2d, v9.2d, v7.2d + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x12] + + zip1 v4.2d, v12.2d, v16.2d + zip2 v5.2d, v12.2d, v16.2d + zip1 v6.2d, v13.2d, v17.2d + zip2 v7.2d, v13.2d, v17.2d + st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13] + + zip1 v0.2d, v10.2d, v18.2d + zip2 v1.2d, v10.2d, v18.2d + zip1 v2.2d, v11.2d, v19.2d + zip2 v3.2d, v11.2d, v19.2d + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x14] + + zip1 v4.2d, v14.2d, v22.2d + zip2 v5.2d, v14.2d, v22.2d + zip1 v6.2d, v15.2d, v23.2d + zip2 v7.2d, v15.2d, v23.2d + st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x15] + + ldp x19, x20, [sp, #16] + ldp d14, d15, [sp, #16*2] + ldp d12, d13, [sp, #16*3] + ldp d10, d11, [sp, #16*4] + ldp d8, d9, [sp, #16*5] + ldp x21, x22, [sp], #16*6 + + ret +endfunc +.endm + +FFT_SPLIT_RADIX_FN float, 0 +FFT_SPLIT_RADIX_FN ns_float, 1 |