author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-19 00:47:55 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-19 00:47:55 +0000
commit     26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree       f435a8308119effd964b339f76abb83a57c29483 /media/ffvpx/libavutil/aarch64
parent     Initial commit. (diff)
Adding upstream version 124.0.1. (tag: upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'media/ffvpx/libavutil/aarch64')
-rw-r--r--  media/ffvpx/libavutil/aarch64/asm.S              260
-rw-r--r--  media/ffvpx/libavutil/aarch64/bswap.h             56
-rw-r--r--  media/ffvpx/libavutil/aarch64/cpu.c              127
-rw-r--r--  media/ffvpx/libavutil/aarch64/cpu.h               31
-rw-r--r--  media/ffvpx/libavutil/aarch64/float_dsp_init.c    69
-rw-r--r--  media/ffvpx/libavutil/aarch64/float_dsp_neon.S   202
-rw-r--r--  media/ffvpx/libavutil/aarch64/moz.build           21
-rw-r--r--  media/ffvpx/libavutil/aarch64/timer.h             50
-rw-r--r--  media/ffvpx/libavutil/aarch64/tx_float_init.c     64
-rw-r--r--  media/ffvpx/libavutil/aarch64/tx_float_neon.S   1294

10 files changed, 2174 insertions(+), 0 deletions(-)
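
Two of the files in this diffstat work as a pair: cpu.c adds runtime detection of the AArch64 DOTPROD and I8MM extensions (on top of the baseline ARMv8/NEON flags) and reports them through ff_get_cpu_flags_aarch64(), while float_dsp_init.c reads the merged flags via av_get_cpu_flags() and installs the NEON routines from float_dsp_neon.S into the AVFloatDSPContext function table. The following is a minimal sketch of that dispatch pattern, using only the libavutil API visible in this commit; the scalar fallback vector_fmul_c() is hypothetical and not part of the diff.

```c
#include "libavutil/cpu.h"
#include "libavutil/float_dsp.h"

/* NEON implementation added by this commit (declared in float_dsp_init.c). */
void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1,
                         int len);

/* Hypothetical scalar fallback, for illustration only. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1,
                          int len)
{
    for (int i = 0; i < len; i++)
        dst[i] = src0[i] * src1[i];
}

static void init_vector_fmul(AVFloatDSPContext *fdsp)
{
    int cpu_flags = av_get_cpu_flags(); /* includes ff_get_cpu_flags_aarch64() */

    fdsp->vector_fmul = vector_fmul_c;
    if (cpu_flags & AV_CPU_FLAG_NEON)   /* same condition have_neon() tests */
        fdsp->vector_fmul = ff_vector_fmul_neon;
}
```

tx_float_init.c applies the same gating to the FFT codelets: each TX_DEF() entry names NEON as its required CPU flag, so the assembly in tx_float_neon.S is only selected when the flag is present.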
diff --git a/media/ffvpx/libavutil/aarch64/asm.S b/media/ffvpx/libavutil/aarch64/asm.S new file mode 100644 index 0000000000..1840f9fb01 --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/asm.S @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#ifdef __ELF__ +# define ELF +#else +# define ELF # +#endif + +#if HAVE_AS_FUNC +# define FUNC +#else +# define FUNC # +#endif + +#ifndef __has_feature +# define __has_feature(x) 0 +#endif + +#if HAVE_AS_ARCH_DIRECTIVE + .arch AS_ARCH_LEVEL +#endif + +#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE +#define ENABLE_DOTPROD .arch_extension dotprod +#define DISABLE_DOTPROD .arch_extension nodotprod +#else +#define ENABLE_DOTPROD +#define DISABLE_DOTPROD +#endif + +#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE +#define ENABLE_I8MM .arch_extension i8mm +#define DISABLE_I8MM .arch_extension noi8mm +#else +#define ENABLE_I8MM +#define DISABLE_I8MM +#endif + +DISABLE_DOTPROD +DISABLE_I8MM + + +/* Support macros for + * - Armv8.3-A Pointer Authentication and + * - Armv8.5-A Branch Target Identification + * features which require emitting a .note.gnu.property section with the + * appropriate architecture-dependent feature bits set. + * + * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to + * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be + * used immediately before saving the LR register (x30) to the stack. + * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring + * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone + * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also + * have the same value at the two points. For example: + * + * .global f + * f: + * AARCH64_SIGN_LINK_REGISTER + * stp x29, x30, [sp, #-96]! + * mov x29, sp + * ... + * ldp x29, x30, [sp], #96 + * AARCH64_VALIDATE_LINK_REGISTER + * ret + * + * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or + * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an + * indirect call target. In particular, all symbols exported from a file must + * begin with one of these macros. For example, a leaf function that does not + * save LR can instead use |AARCH64_VALID_CALL_TARGET|: + * + * .globl return_zero + * return_zero: + * AARCH64_VALID_CALL_TARGET + * mov x0, #0 + * ret + * + * A non-leaf function which does not immediately save LR may need both macros + * because |AARCH64_SIGN_LINK_REGISTER| appears late. 
For example, the function + * may jump to an alternate implementation before setting up the stack: + * + * .globl with_early_jump + * with_early_jump: + * AARCH64_VALID_CALL_TARGET + * cmp x0, #128 + * b.lt .Lwith_early_jump_128 + * AARCH64_SIGN_LINK_REGISTER + * stp x29, x30, [sp, #-96]! + * mov x29, sp + * ... + * ldp x29, x30, [sp], #96 + * AARCH64_VALIDATE_LINK_REGISTER + * ret + * + * .Lwith_early_jump_128: + * ... + * ret + * + * These annotations are only required with indirect calls. Private symbols that + * are only the target of direct calls do not require annotations. Also note + * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not + * indirect jumps (BR). Indirect jumps in assembly are supported through + * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and + * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|. + * + * Although not necessary, it is safe to use these macros in 32-bit ARM + * assembly. This may be used to simplify dual 32-bit and 64-bit files. + * + * References: + * - "ELF for the ArmĀ® 64-bit Architecture" + * https: *github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst + * - "Providing protection for complex software" + * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software + */ +#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1) +# define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has BTI +# define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' +# define AARCH64_VALID_JUMP_TARGET hint #38 // BTI 'j' +#else +# define GNU_PROPERTY_AARCH64_BTI 0 // No BTI +# define AARCH64_VALID_CALL_TARGET +# define AARCH64_VALID_JUMP_TARGET +#endif + +#if defined(__ARM_FEATURE_PAC_DEFAULT) +# if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A +# define AARCH64_SIGN_LINK_REGISTER paciasp +# define AARCH64_VALIDATE_LINK_REGISTER autiasp +# elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B +# define AARCH64_SIGN_LINK_REGISTER pacibsp +# define AARCH64_VALIDATE_LINK_REGISTER autibsp +# else +# error Pointer authentication defines no valid key! +# endif +# if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) +# error Authentication of leaf functions is enabled but not supported in FFmpeg! +# endif +# define GNU_PROPERTY_AARCH64_PAC (1 << 1) +#else +# define GNU_PROPERTY_AARCH64_PAC 0 +# define AARCH64_SIGN_LINK_REGISTER +# define AARCH64_VALIDATE_LINK_REGISTER +#endif + + +#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) + .pushsection .note.gnu.property, "a" + .balign 8 + .long 4 + .long 0x10 + .long 0x5 + .asciz "GNU" + .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ + .long 4 + .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC) + .long 0 + .popsection +#endif + +.macro function name, export=0, align=2 + .macro endfunc +ELF .size \name, . - \name +FUNC .endfunc + .purgem endfunc + .endm + .text + .align \align + .if \export + .global EXTERN_ASM\name +ELF .type EXTERN_ASM\name, %function +FUNC .func EXTERN_ASM\name +EXTERN_ASM\name: + AARCH64_VALID_CALL_TARGET + .else +ELF .type \name, %function +FUNC .func \name +\name: + .endif +.endm + +.macro const name, align=2, relocate=0 + .macro endconst +ELF .size \name, . 
- \name + .purgem endconst + .endm +#if HAVE_SECTION_DATA_REL_RO +.if \relocate + .section .data.rel.ro +.else + .section .rodata +.endif +#elif defined(_WIN32) + .section .rdata +#elif !defined(__MACH__) + .section .rodata +#else + .const_data +#endif + .align \align +\name: +.endm + +.macro movrel rd, val, offset=0 +#if CONFIG_PIC && defined(__APPLE__) + .if \offset < 0 + adrp \rd, \val@PAGE + add \rd, \rd, \val@PAGEOFF + sub \rd, \rd, -(\offset) + .else + adrp \rd, \val+(\offset)@PAGE + add \rd, \rd, \val+(\offset)@PAGEOFF + .endif +#elif CONFIG_PIC && defined(_WIN32) + .if \offset < 0 + adrp \rd, \val + add \rd, \rd, :lo12:\val + sub \rd, \rd, -(\offset) + .else + adrp \rd, \val+(\offset) + add \rd, \rd, :lo12:\val+(\offset) + .endif +#elif CONFIG_PIC +# if __has_feature(hwaddress_sanitizer) + adrp \rd, :pg_hi21_nc:\val+(\offset) +# else + adrp \rd, \val+(\offset) +# endif + add \rd, \rd, :lo12:\val+(\offset) +#else + ldr \rd, =\val+\offset +#endif +.endm + +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) + +#define x18 do_not_use_x18 +#define w18 do_not_use_w18 diff --git a/media/ffvpx/libavutil/aarch64/bswap.h b/media/ffvpx/libavutil/aarch64/bswap.h new file mode 100644 index 0000000000..7abca657ba --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/bswap.h @@ -0,0 +1,56 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_AARCH64_BSWAP_H +#define AVUTIL_AARCH64_BSWAP_H + +#include <stdint.h> +#include "config.h" +#include "libavutil/attributes.h" + +#if HAVE_INLINE_ASM + +#define av_bswap16 av_bswap16 +static av_always_inline av_const unsigned av_bswap16(unsigned x) +{ + unsigned y; + + __asm__("rev16 %w0, %w1" : "=r"(y) : "r"(x)); + return y; +} + +#define av_bswap32 av_bswap32 +static av_always_inline av_const uint32_t av_bswap32(uint32_t x) +{ + uint32_t y; + + __asm__("rev %w0, %w1" : "=r"(y) : "r"(x)); + return y; +} + +#define av_bswap64 av_bswap64 +static av_always_inline av_const uint64_t av_bswap64(uint64_t x) +{ + uint64_t y; + + __asm__("rev %0, %1" : "=r"(y) : "r"(x)); + return y; +} + +#endif /* HAVE_INLINE_ASM */ +#endif /* AVUTIL_AARCH64_BSWAP_H */ diff --git a/media/ffvpx/libavutil/aarch64/cpu.c b/media/ffvpx/libavutil/aarch64/cpu.c new file mode 100644 index 0000000000..f27fef3992 --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/cpu.c @@ -0,0 +1,127 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavutil/cpu_internal.h" +#include "config.h" + +#if (defined(__linux__) || defined(__ANDROID__)) && HAVE_GETAUXVAL +#include <stdint.h> +#include <sys/auxv.h> + +#define get_cpu_feature_reg(reg, val) \ + __asm__("mrs %0, " #reg : "=r" (val)) + +static int detect_flags(void) +{ + int flags = 0; + +#if defined(HWCAP_CPUID) && HAVE_INLINE_ASM + unsigned long hwcap = getauxval(AT_HWCAP); + // We can check for DOTPROD and I8MM using HWCAP_ASIMDDP and + // HWCAP2_I8MM too, avoiding to read the CPUID registers (which triggers + // a trap, handled by the kernel). However the HWCAP_* defines for these + // extensions are added much later than HWCAP_CPUID, so the userland + // headers might lack support for them even if the binary later is run + // on hardware that does support it (and where the kernel might support + // HWCAP_CPUID). + // See https://www.kernel.org/doc/html/latest/arm64/cpu-feature-registers.html + if (hwcap & HWCAP_CPUID) { + uint64_t tmp; + + get_cpu_feature_reg(ID_AA64ISAR0_EL1, tmp); + if (((tmp >> 44) & 0xf) == 0x1) + flags |= AV_CPU_FLAG_DOTPROD; + get_cpu_feature_reg(ID_AA64ISAR1_EL1, tmp); + if (((tmp >> 52) & 0xf) == 0x1) + flags |= AV_CPU_FLAG_I8MM; + } +#endif + + return flags; +} + +#elif defined(__APPLE__) && HAVE_SYSCTLBYNAME +#include <sys/sysctl.h> + +static int detect_flags(void) +{ + uint32_t value = 0; + size_t size; + int flags = 0; + + size = sizeof(value); + if (!sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &size, NULL, 0)) { + if (value) + flags |= AV_CPU_FLAG_DOTPROD; + } + size = sizeof(value); + if (!sysctlbyname("hw.optional.arm.FEAT_I8MM", &value, &size, NULL, 0)) { + if (value) + flags |= AV_CPU_FLAG_I8MM; + } + return flags; +} + +#elif defined(_WIN32) +#include <windows.h> + +static int detect_flags(void) +{ + int flags = 0; +#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE + if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) + flags |= AV_CPU_FLAG_DOTPROD; +#endif + return flags; +} +#else + +static int detect_flags(void) +{ + return 0; +} + +#endif + +int ff_get_cpu_flags_aarch64(void) +{ + int flags = AV_CPU_FLAG_ARMV8 * HAVE_ARMV8 | + AV_CPU_FLAG_NEON * HAVE_NEON; + +#ifdef __ARM_FEATURE_DOTPROD + flags |= AV_CPU_FLAG_DOTPROD; +#endif +#ifdef __ARM_FEATURE_MATMUL_INT8 + flags |= AV_CPU_FLAG_I8MM; +#endif + + flags |= detect_flags(); + + return flags; +} + +size_t ff_get_cpu_max_align_aarch64(void) +{ + int flags = av_get_cpu_flags(); + + if (flags & AV_CPU_FLAG_NEON) + return 16; + + return 8; +} diff --git a/media/ffvpx/libavutil/aarch64/cpu.h b/media/ffvpx/libavutil/aarch64/cpu.h new file mode 100644 index 0000000000..64d703be37 --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/cpu.h @@ -0,0 +1,31 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_AARCH64_CPU_H +#define AVUTIL_AARCH64_CPU_H + +#include "libavutil/cpu.h" +#include "libavutil/cpu_internal.h" + +#define have_armv8(flags) CPUEXT(flags, ARMV8) +#define have_neon(flags) CPUEXT(flags, NEON) +#define have_vfp(flags) CPUEXT(flags, VFP) +#define have_dotprod(flags) CPUEXT(flags, DOTPROD) +#define have_i8mm(flags) CPUEXT(flags, I8MM) + +#endif /* AVUTIL_AARCH64_CPU_H */ diff --git a/media/ffvpx/libavutil/aarch64/float_dsp_init.c b/media/ffvpx/libavutil/aarch64/float_dsp_init.c new file mode 100644 index 0000000000..4325071821 --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/float_dsp_init.c @@ -0,0 +1,69 @@ +/* + * ARM NEON optimised Float DSP functions + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/float_dsp.h" +#include "cpu.h" + +void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, + int len); + +void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul, + int len); + +void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, + int len); + +void ff_vector_dmul_scalar_neon(double *dst, const double *src, double mul, + int len); + +void ff_vector_fmul_window_neon(float *dst, const float *src0, + const float *src1, const float *win, int len); + +void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, + const float *src2, int len); + +void ff_vector_fmul_reverse_neon(float *dst, const float *src0, + const float *src1, int len); + +void ff_butterflies_float_neon(float *v1, float *v2, int len); + +float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); + +av_cold void ff_float_dsp_init_aarch64(AVFloatDSPContext *fdsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + fdsp->butterflies_float = ff_butterflies_float_neon; + fdsp->scalarproduct_float = ff_scalarproduct_float_neon; + fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_neon; + fdsp->vector_fmul = ff_vector_fmul_neon; + fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_neon; + fdsp->vector_fmul_add = ff_vector_fmul_add_neon; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon; + fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_neon; + fdsp->vector_fmul_window = ff_vector_fmul_window_neon; + } +} diff --git a/media/ffvpx/libavutil/aarch64/float_dsp_neon.S b/media/ffvpx/libavutil/aarch64/float_dsp_neon.S new file mode 100644 index 0000000000..35e2715b87 --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/float_dsp_neon.S @@ -0,0 +1,202 @@ +/* + * ARM NEON optimised Float DSP functions + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "asm.S" + +function ff_vector_fmul_neon, export=1 +1: subs w3, w3, #16 + ld1 {v0.4s, v1.4s}, [x1], #32 + ld1 {v2.4s, v3.4s}, [x1], #32 + ld1 {v4.4s, v5.4s}, [x2], #32 + ld1 {v6.4s, v7.4s}, [x2], #32 + fmul v16.4s, v0.4s, v4.4s + fmul v17.4s, v1.4s, v5.4s + fmul v18.4s, v2.4s, v6.4s + fmul v19.4s, v3.4s, v7.4s + st1 {v16.4s, v17.4s}, [x0], #32 + st1 {v18.4s, v19.4s}, [x0], #32 + b.ne 1b + ret +endfunc + +function ff_vector_fmac_scalar_neon, export=1 + mov x3, #-32 +1: subs w2, w2, #16 + ld1 {v16.4s, v17.4s}, [x0], #32 + ld1 {v18.4s, v19.4s}, [x0], x3 + ld1 {v4.4s, v5.4s}, [x1], #32 + ld1 {v6.4s, v7.4s}, [x1], #32 + fmla v16.4s, v4.4s, v0.s[0] + fmla v17.4s, v5.4s, v0.s[0] + fmla v18.4s, v6.4s, v0.s[0] + fmla v19.4s, v7.4s, v0.s[0] + st1 {v16.4s, v17.4s}, [x0], #32 + st1 {v18.4s, v19.4s}, [x0], #32 + b.ne 1b + ret +endfunc + +function ff_vector_fmul_scalar_neon, export=1 + mov w4, #15 + bics w3, w2, w4 + dup v16.4s, v0.s[0] + b.eq 3f + ld1 {v0.4s, v1.4s}, [x1], #32 +1: subs w3, w3, #16 + fmul v0.4s, v0.4s, v16.4s + ld1 {v2.4s, v3.4s}, [x1], #32 + fmul v1.4s, v1.4s, v16.4s + fmul v2.4s, v2.4s, v16.4s + st1 {v0.4s, v1.4s}, [x0], #32 + fmul v3.4s, v3.4s, v16.4s + b.eq 2f + ld1 {v0.4s, v1.4s}, [x1], #32 + st1 {v2.4s, v3.4s}, [x0], #32 + b 1b +2: ands w2, w2, #15 + st1 {v2.4s, v3.4s}, [x0], #32 + b.eq 4f +3: ld1 {v0.4s}, [x1], #16 + fmul v0.4s, v0.4s, v16.4s + st1 {v0.4s}, [x0], #16 + subs w2, w2, #4 + b.gt 3b +4: ret +endfunc + +function ff_vector_dmul_scalar_neon, export=1 + dup v16.2d, v0.d[0] + ld1 {v0.2d, v1.2d}, [x1], #32 +1: subs w2, w2, #8 + fmul v0.2d, v0.2d, v16.2d + ld1 {v2.2d, v3.2d}, [x1], #32 + fmul v1.2d, v1.2d, v16.2d + fmul v2.2d, v2.2d, v16.2d + st1 {v0.2d, v1.2d}, [x0], #32 + fmul v3.2d, v3.2d, v16.2d + ld1 {v0.2d, v1.2d}, [x1], #32 + st1 {v2.2d, v3.2d}, [x0], #32 + b.gt 1b + ret +endfunc + +function ff_vector_fmul_window_neon, export=1 + sxtw x4, w4 // len + sub x2, x2, #8 + sub x5, x4, #2 + add x2, x2, x5, lsl #2 // src1 + 4 * (len - 4) + add x6, x3, x5, lsl #3 // win + 8 * (len - 2) + add x5, x0, x5, lsl #3 // dst + 8 * (len - 2) + mov x7, #-16 + ld1 {v0.4s}, [x1], #16 // s0 + ld1 {v2.4s}, [x3], #16 // wi + ld1 {v1.4s}, [x2], x7 // s1 +1: ld1 {v3.4s}, [x6], x7 // wj + subs x4, x4, #4 + fmul v17.4s, v0.4s, v2.4s // s0 * wi + rev64 v4.4s, v1.4s + rev64 v5.4s, v3.4s + rev64 v17.4s, v17.4s + ext v4.16b, v4.16b, v4.16b, #8 // s1_r + ext v5.16b, v5.16b, v5.16b, #8 // wj_r + ext v17.16b, v17.16b, v17.16b, #8 // (s0 * wi)_rev + fmul v16.4s, v0.4s, v5.4s // s0 * wj_r + fmla v17.4s, v1.4s, v3.4s // (s0 * wi)_rev + s1 * wj + b.eq 2f + ld1 {v0.4s}, [x1], #16 + fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi + st1 {v17.4s}, [x5], x7 + ld1 {v2.4s}, [x3], #16 + ld1 {v1.4s}, [x2], x7 + st1 {v16.4s}, [x0], #16 + b 1b +2: + fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi + st1 {v17.4s}, [x5], x7 + st1 {v16.4s}, [x0], #16 + ret +endfunc + +function ff_vector_fmul_add_neon, export=1 + ld1 {v0.4s, v1.4s}, [x1], #32 + ld1 {v2.4s, v3.4s}, [x2], #32 + ld1 {v4.4s, v5.4s}, [x3], #32 +1: subs w4, w4, #8 + fmla v4.4s, v0.4s, v2.4s + fmla v5.4s, v1.4s, v3.4s + b.eq 2f + ld1 {v0.4s, v1.4s}, [x1], #32 + ld1 {v2.4s, v3.4s}, [x2], #32 + st1 {v4.4s, v5.4s}, [x0], #32 + ld1 {v4.4s, v5.4s}, [x3], #32 + b 1b +2: st1 {v4.4s, v5.4s}, [x0], #32 + ret 
+endfunc + +function ff_vector_fmul_reverse_neon, export=1 + sxtw x3, w3 + add x2, x2, x3, lsl #2 + sub x2, x2, #32 + mov x4, #-32 + ld1 {v2.4s, v3.4s}, [x2], x4 + ld1 {v0.4s, v1.4s}, [x1], #32 +1: subs x3, x3, #8 + rev64 v3.4s, v3.4s + rev64 v2.4s, v2.4s + ext v3.16b, v3.16b, v3.16b, #8 + ext v2.16b, v2.16b, v2.16b, #8 + fmul v16.4s, v0.4s, v3.4s + fmul v17.4s, v1.4s, v2.4s + b.eq 2f + ld1 {v2.4s, v3.4s}, [x2], x4 + ld1 {v0.4s, v1.4s}, [x1], #32 + st1 {v16.4s, v17.4s}, [x0], #32 + b 1b +2: st1 {v16.4s, v17.4s}, [x0], #32 + ret +endfunc + +function ff_butterflies_float_neon, export=1 +1: ld1 {v0.4s}, [x0] + ld1 {v1.4s}, [x1] + subs w2, w2, #4 + fsub v2.4s, v0.4s, v1.4s + fadd v3.4s, v0.4s, v1.4s + st1 {v2.4s}, [x1], #16 + st1 {v3.4s}, [x0], #16 + b.gt 1b + ret +endfunc + +function ff_scalarproduct_float_neon, export=1 + movi v2.4s, #0 +1: ld1 {v0.4s}, [x0], #16 + ld1 {v1.4s}, [x1], #16 + subs w2, w2, #4 + fmla v2.4s, v0.4s, v1.4s + b.gt 1b + faddp v0.4s, v2.4s, v2.4s + faddp s0, v0.2s + ret +endfunc diff --git a/media/ffvpx/libavutil/aarch64/moz.build b/media/ffvpx/libavutil/aarch64/moz.build new file mode 100644 index 0000000000..fa80cf8896 --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/moz.build @@ -0,0 +1,21 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +SOURCES += [ + 'cpu.c', + 'float_dsp_init.c', + 'float_dsp_neon.S', + 'tx_float_init.c', + 'tx_float_neon.S' +] + +if CONFIG['OS_ARCH'] == 'WINNT': + USE_INTEGRATED_CLANGCL_AS = True + DEFINES['EXTERN_ASM'] = '' + +FINAL_LIBRARY = 'mozavutil' + +include('/media/ffvpx/ffvpxcommon.mozbuild') diff --git a/media/ffvpx/libavutil/aarch64/timer.h b/media/ffvpx/libavutil/aarch64/timer.h new file mode 100644 index 0000000000..8b28fd354c --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/timer.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_AARCH64_TIMER_H +#define AVUTIL_AARCH64_TIMER_H + +#include <stdint.h> +#include "config.h" + +#if defined(__APPLE__) + +#include <mach/mach_time.h> + +#define AV_READ_TIME mach_absolute_time + +#elif HAVE_INLINE_ASM + +#define AV_READ_TIME read_time + +static inline uint64_t read_time(void) +{ + uint64_t cycle_counter; + __asm__ volatile( + "isb \t\n" + "mrs %0, pmccntr_el0 " + : "=r"(cycle_counter) :: "memory" ); + + return cycle_counter; +} + +#endif /* HAVE_INLINE_ASM */ + +#endif /* AVUTIL_AARCH64_TIMER_H */ diff --git a/media/ffvpx/libavutil/aarch64/tx_float_init.c b/media/ffvpx/libavutil/aarch64/tx_float_init.c new file mode 100644 index 0000000000..8300472c4c --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/tx_float_init.c @@ -0,0 +1,64 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define TX_FLOAT +#include "libavutil/tx_priv.h" +#include "libavutil/attributes.h" +#include "libavutil/aarch64/cpu.h" + +TX_DECL_FN(fft2, neon) +TX_DECL_FN(fft4_fwd, neon) +TX_DECL_FN(fft4_inv, neon) +TX_DECL_FN(fft8, neon) +TX_DECL_FN(fft8_ns, neon) +TX_DECL_FN(fft16, neon) +TX_DECL_FN(fft16_ns, neon) +TX_DECL_FN(fft32, neon) +TX_DECL_FN(fft32_ns, neon) +TX_DECL_FN(fft_sr, neon) +TX_DECL_FN(fft_sr_ns, neon) + +static av_cold int neon_init(AVTXContext *s, const FFTXCodelet *cd, + uint64_t flags, FFTXCodeletOptions *opts, + int len, int inv, const void *scale) +{ + ff_tx_init_tabs_float(len); + if (cd->max_len == 2) + return ff_tx_gen_ptwo_revtab(s, opts); + else + return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, 8, 0); +} + +const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = { + TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE, 0), + TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0), + TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0), + TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0), + TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft16, FFT, 16, 16, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0), + TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft32, FFT, 32, 32, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0), + TX_DEF(fft32_ns, FFT, 32, 32, 2, 
0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + + TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 128, neon_init, neon, NEON, 0, 0), + TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + + NULL, +}; diff --git a/media/ffvpx/libavutil/aarch64/tx_float_neon.S b/media/ffvpx/libavutil/aarch64/tx_float_neon.S new file mode 100644 index 0000000000..78e4876d6c --- /dev/null +++ b/media/ffvpx/libavutil/aarch64/tx_float_neon.S @@ -0,0 +1,1294 @@ +/* + * Copyright (c) Lynne + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +/* Open `doc/transforms.md` to see the code upon which the transforms here were + * based upon. + * + * File conventions: + * GPRs: x0-x3 - arguments, untouched + * x4 - Lookup table base pointer + * x5-x6 - macro ld1 temps/function scratch + * x7-x9 - FFT table state + * x10-x17 - lookup table/macro scratch + * w19-w20 - current/target length when needed + * x21-x22 - len*2, len*6 + * + * Vectors: v0-v7 - coefficients + * v8-v15 - coefficients when needed, otherwise untouched + * v16-v30 - used as needed + * v31 - -1.0, +1.0, -1.0, +1.0. Never touched after loading. + * + * Stack: backup for v8-v15 and x19-x22 when needed, and transform lengths + */ + +#define M_SQRT1_2 0.707106781186547524401 +#define COS16_1 0.92387950420379638671875 +#define COS16_3 0.3826834261417388916015625 + +/* We only ever load this once at the start, and then live with losing an + * entire register as we need to lug this all the time everywhere. + * Clearly should be integrated into an fsadd and fmlsa, but "muh RISC!". 
*/ +const subadd, align=4 + .float -1.0, 1.0, -1.0, 1.0 +endconst + +.macro LOAD_SUBADD + movrel x5, subadd + ld1 { v31.4s }, [x5] +.endm + +.macro SETUP_LUT no_lut=0 +.if \no_lut == 0 + ldr x4, [x0, #8] +.endif +.endm + +.macro LOAD_INPUT dst1, dst2, dst3, dst4, src, no_lut=0, discont=0 +.if \no_lut == 1 +.if \discont == 1 + ldp q\dst1\(), q\dst2\(), [\src\()] + ldp q\dst3\(), q\dst4\(), [\src\(), #32] + add \src\(), \src\(), #64 +.else + ld1 { v\dst1\().4s, v\dst2\().4s, v\dst3\().4s, v\dst4\().4s }, [\src], #64 +.endif +.else + ldp w10, w11, [x4, #0 ] + ldp w12, w13, [x4, #8 ] + ldp w14, w15, [x4, #16] + ldp w16, w17, [x4, #24] + + add x4, x4, #32 + + ldr d\dst1, [\src, x10, lsl #3] + add x11, \src, x11, lsl #3 + ldr d\dst2, [\src, x12, lsl #3] + add x13, \src, x13, lsl #3 + ldr d\dst3, [\src, x14, lsl #3] + add x15, \src, x15, lsl #3 + ldr d\dst4, [\src, x16, lsl #3] + add x17, \src, x17, lsl #3 + + ld1 { v\dst1\().d }[1], [x11] + ld1 { v\dst2\().d }[1], [x13] + ld1 { v\dst3\().d }[1], [x15] + ld1 { v\dst4\().d }[1], [x17] +.endif +.endm + +.macro FFT4 e0, o0, standalone + fadd v16.4s, \e0\().4s, \o0\().4s // r1..4 + fsub \e0\().4s, \e0\().4s, \o0\().4s // t1..4 + + rev64 v18.4s, \e0\().4s + + zip2 \o0\().2d, v16.2d, \e0\().2d + zip1 v17.2d, v16.2d, \e0\().2d + + mov \o0\().d[1], v18.d[1] + + fadd \e0\().4s, v17.4s, \o0\().4s // a1,2 b1,4 + fsub v16.4s, v17.4s, \o0\().4s // a3,4 b3,2 + + mov \o0\().16b, v16.16b // Swap once again... + mov \o0\().s[3], \e0\().s[3] + mov \e0\().s[3], v16.s[3] + +.if \standalone == 0 + uzp2 \o0\().2d, \e0\().2d, \o0\().2d + uzp1 \e0\().2d, \e0\().2d, v16.2d +.endif +.endm + +const shuf_4pt_x2, align=4 + .byte 24, 25, 26, 27 // reg2, 3 + .byte 12, 13, 14, 15 // reg1, 4 + .byte 8, 9, 10, 11 // reg1, 3 + .byte 28, 29, 30, 31 // reg2, 4 +endconst + +// Identical to FFT4, but does 2 transforms in parallel, with no deinterleaving +.macro FFT4_X2 e0, o0, e1, o1, \ + t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22 + + fadd \t0\().4s, \e0\().4s, \o0\().4s // r1234 + fadd \t2\().4s, \e1\().4s, \o1\().4s // r1234 + fsub \e0\().4s, \e0\().4s, \o0\().4s // t1234 + fsub \e1\().4s, \e1\().4s, \o1\().4s // t1234 + + movrel x5, shuf_4pt_x2 + + rev64 \t4\().4s, \e0\().4s + rev64 \t5\().4s, \e1\().4s + + zip2 \o0\().2d, \t0\().2d, \e0\().2d // t3,4 r3,4 + zip2 \o1\().2d, \t2\().2d, \e1\().2d // t3,4 r3,4 + + ld1 { \t6\().16b }, [x5] + + mov \o0\().d[1], \t4\().d[1] + mov \o1\().d[1], \t5\().d[1] + + zip1 \t1\().2d, \t0\().2d, \e0\().2d // t1,2 r1,2 + zip1 \t3\().2d, \t2\().2d, \e1\().2d // t1,2 r1,2 + + fsub \t4\().4s, \t1\().4s, \o0\().4s // a34 b32 + fadd \t5\().4s, \t1\().4s, \o0\().4s // a12 b14 + fsub \t2\().4s, \t3\().4s, \o1\().4s // a34 b32 + fadd \t3\().4s, \t3\().4s, \o1\().4s // a12 b14 + + // TODO: experiment with movs instead of tables here + tbl \o0\().16b, { \t4\().16b, \t5\().16b }, \t6\().16b // b1234 + tbl \o1\().16b, { \t2\().16b, \t3\().16b }, \t6\().16b // b1234 + + zip1 \e0\().2d, \t5\().2d, \t4\().2d // a1234 +// zip2 \o0\().2d, \t5\().2d, \t4\().2d // b1432 + zip1 \e1\().2d, \t3\().2d, \t2\().2d // a1234 +// zip2 \o1\().2d, \t3\().2d, \t2\().2d // b1432 +// rev64 \o0\().4s, \o0\().4s // b4123 +// rev64 \o1\().4s, \o1\().4s // b4123 +// ext \o0\().16b, \o0\().16b, \o0\().16b, #4 // b1234 +// ext \o1\().16b, \o1\().16b, \o1\().16b, #4 // b1234 +.endm + +const tab_8pt, align=4 + .float M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2 +endconst + +.macro FFT8 e0, e1, o0, o1, \ + t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22 + + movrel x5, 
tab_8pt + + fsub \t1\().4s, \e1\().4s, \o1\().4s // j1234 + fadd \o1\().4s, \e1\().4s, \o1\().4s // k1234 + fsub \t0\().4s, \e0\().4s, \o0\().4s // r1234 + fadd \o0\().4s, \e0\().4s, \o0\().4s // q1234 + + ld1 { \t5\().4s }, [x5] + + ext \t4\().16b, \o1\().16b, \o1\().16b, #12 + rev64 \t4\().4s, \t4\().4s + + ext \t2\().16b, \o0\().16b, \t4\().16b, #8 // o0[0,1], o1[3,2] + mov \o0\().d[1], \t4\().d[1] // o0[3, 4]; o1[1, 4] + + fsub \e1\().4s, \o0\().4s, \t2\().4s // s34, g43 + fadd \t2\().4s, \o0\().4s, \t2\().4s // s12, g12 + + rev64 \t6\().4s, v31.4s // 1, -1, 1, -1 + dup \o0\().2d, \t0\().d[0] // r1212 + dup \o1\().2d, \t0\().d[1] // r3434 + + rev64 \t4\().4s, \e1\().4s // xxg34 + rev64 \o1\().4s, \o1\().4s // r4343 + + ext \t6\().16b, v31.16b, \t6\().16b, #8 // -1, 1, 1, -1 + zip1 \t3\().2d, \t2\().2d, \e1\().2d // s1234 + zip2 \t2\().2d, \t2\().2d, \t4\().2d // g1234 + + fadd \e0\().4s, \t3\().4s, \t2\().4s // out_e1 + fsub \e1\().4s, \t3\().4s, \t2\().4s // out_e2 + + fmul \t1\().4s, \t1\().4s, \t5\().4s // j * +--+M_SQRT1_2 + fmls \o0\().4s, \o1\().4s, \t6\().4s // z1234 + + rev64 \t4\().4s, \t1\().4s // j2143 + fmla \t1\().4s, \t4\().4s, v31.4s // l2143 + + rev64 \t4\().4s, \t1\().4s // l1234 + ext \t4\().16b, \t4\().16b, \t4\().16b, #8 // l3412 + + fmla \t4\().4s, \t1\().4s, v31.4s // t1234 + + fadd \o1\().4s, \o0\().4s, \t4\().4s // out_o2 + fsub \o0\().4s, \o0\().4s, \t4\().4s // out_o1 +.endm + +// Identical as FFT8, but does 2 transforms in parallel +.macro FFT8_X2 e0, e1, o0, o1, e2, e3, o2, o3 + + movrel x5, tab_8pt + + fadd v19.4s, \e3\().4s, \o3\().4s // k1234 + fadd v17.4s, \e1\().4s, \o1\().4s // k1234 + fadd v18.4s, \e2\().4s, \o2\().4s // q1234 + fadd v16.4s, \e0\().4s, \o0\().4s // q1234 + + ld1 { v23.4s }, [x5] + + ext v22.16b, v19.16b, v19.16b, #12 + ext v21.16b, v17.16b, v17.16b, #12 + + rev64 v22.4s, v22.4s + rev64 v21.4s, v21.4s + + ext v19.16b, v18.16b, v22.16b, #8 + ext v17.16b, v16.16b, v21.16b, #8 + + mov v18.d[1], v22.d[1] + mov v21.d[0], v16.d[0] + + fadd v22.4s, v18.4s, v19.4s // s12, g12 + fsub v19.4s, v18.4s, v19.4s // s34, g43 + fsub v18.4s, v21.4s, v17.4s // s34, g43 + fadd v16.4s, v21.4s, v17.4s // s12, g12 + + fsub \e0\().4s, \e0\().4s, \o0\().4s // r1234 + fsub v20.4s, \e1\().4s, \o1\().4s // j1234 + fsub \e2\().4s, \e2\().4s, \o2\().4s // r1234 + fsub v21.4s, \e3\().4s, \o3\().4s // j1234 + + rev64 v24.4s, v31.4s // 1, -1, 1, -1 + zip1 v17.2d, v16.2d, v18.2d // s1234 + zip1 \e1\().2d, v22.2d, v19.2d // s1234 + + rev64 v18.4s, v18.4s // xxg34 + rev64 v19.4s, v19.4s // xxg34 + + zip2 v16.2d, v16.2d, v18.2d // g1234 + zip2 \e3\().2d, v22.2d, v19.2d // g1234 + + dup \o0\().2d, \e0\().d[0] // r1212 + dup \o1\().2d, \e0\().d[1] // r3434 + dup \o2\().2d, \e2\().d[0] // r1212 + dup \o3\().2d, \e2\().d[1] // r3434 + + fadd \e2\().4s, \e1\().4s, \e3\().4s // out_e1 + fsub \e3\().4s, \e1\().4s, \e3\().4s // out_e2 + fadd \e0\().4s, v17.4s, v16.4s // out_e1 + fsub \e1\().4s, v17.4s, v16.4s // out_e2 + + ext v24.16b, v31.16b, v24.16b, #8 // -1, 1, 1, -1 + rev64 \o1\().4s, \o1\().4s // r4343 + rev64 \o3\().4s, \o3\().4s // r4343 + + fmul v19.4s, v20.4s, v23.4s // j * +--+M_SQRT1_2 + fmul v21.4s, v21.4s, v23.4s // j * +--+M_SQRT1_2 + + rev64 v20.4s, v19.4s // j2143 + rev64 v18.4s, v21.4s // j2143 + + fmls \o0\().4s, \o1\().4s, v24.4s // z1234 + fmls \o2\().4s, \o3\().4s, v24.4s // z1234 + + fmla v19.4s, v20.4s, v31.4s // l2143 + fmla v21.4s, v18.4s, v31.4s // l2143 + + rev64 v20.4s, v19.4s // l1234 + rev64 v18.4s, v21.4s // l1234 + ext v20.16b, v20.16b, v20.16b, 
#8 // l3412 + ext v18.16b, v18.16b, v18.16b, #8 // l3412 + + fmla v20.4s, v19.4s, v31.4s // t1234 + fmla v18.4s, v21.4s, v31.4s // t1234 + + fadd \o1\().4s, \o0\().4s, v20.4s // out_o2 + fadd \o3\().4s, \o2\().4s, v18.4s // out_o2 + fsub \o0\().4s, \o0\().4s, v20.4s // out_o1 + fsub \o2\().4s, \o2\().4s, v18.4s // out_o1 +.endm + +const tab_16pt, align=4 + .float -COS16_1, COS16_1, -COS16_3, COS16_3 // Could be +-+- too + .float COS16_3, COS16_3, COS16_1, COS16_1 + .float 1.0, 1.0, M_SQRT1_2, M_SQRT1_2 +endconst + +// 16-point FFT +// t3, t4, t5, t6 must be sequential +.macro FFT16 e0, e1, e2, e3, o0, o1, o2, o3, \ + t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22 + + FFT8 \e0, \e1, \e2, \e3, \t0, \t1, \t2, \t3, \t4, \t5, \t6 + FFT4_X2 \o0, \o1, \o2, \o3, \t0, \t1, \t2, \t3, \t4, \t5, \t6 + + movrel x5, tab_16pt + + rev64 \t0\().4s, \o0\().4s // z[ 8, 9].imre + rev64 \t1\().4s, \o2\().4s // z[10,11].imre + + ins \t0\().d[0], xzr + ins \t1\().d[0], xzr + + ld1 { \t4\().4s, \t5\().4s, \t6\().4s }, [x5] + // TODO: We could derive \t4\() or \t5\() from either, but it seems cheaper to load + + fmla \o2\().4s, \t1\().4s, v31.4s // s[4567] + fmls \o0\().4s, \t0\().4s, v31.4s // s[0123] + + fmul \t2\().4s, \o1\().4s, \t4\().4s + fmul \t3\().4s, \o3\().4s, \t4\().4s + + rev64 \o3\().4s, \o3\().4s + rev64 \o1\().4s, \o1\().4s + + fmla \t3\().4s, \o3\().4s, \t5\().4s // s[12, 13, 14, 15] + fmls \t2\().4s, \o1\().4s, \t5\().4s // s[ 8, 9, 10, 11] + + fmul \t1\().4s, \o2\().4s, \t6\().4s // s[4567] * mult + fmul \t0\().4s, \o0\().4s, \t6\().4s // s[0123] * mult + + mov \o1\().16b, \t3\().16b + mov \o2\().16b, \t1\().16b + + fsub \t3\().4s, \t3\().4s, \t2\().4s // y34, u34 + fsub \t1\().4s, \t1\().4s, \t0\().4s // w34, x34 + + fadd \t2\().4s, \t2\().4s, \o1\().4s // y56, u56 + rev64 \t3\().4s, \t3\().4s + fadd \t0\().4s, \t0\().4s, \o2\().4s // w56, x56 + rev64 \t1\().4s, \t1\().4s + + fmul \t2\().4s, \t2\().4s, v31.4s + fmul \t1\().4s, \t1\().4s, v31.4s + + fadd \o3\().4s, \e3\().4s, \t3\().4s + fsub \o2\().4s, \e3\().4s, \t3\().4s + fsub \o1\().4s, \e2\().4s, \t2\().4s + fadd \o0\().4s, \e2\().4s, \t2\().4s + + fsub \e2\().4s, \e0\().4s, \t0\().4s + fadd \e0\().4s, \e0\().4s, \t0\().4s + fsub \e3\().4s, \e1\().4s, \t1\().4s + fadd \e1\().4s, \e1\().4s, \t1\().4s +.endm + +function ff_tx_fft2_float_neon, export=1 + ld2r { v0.2d, v1.2d }, [x2] + + fneg v2.2s, v1.2s + mov v2.d[1], v1.d[0] + + fsub v2.4s, v0.4s, v2.4s + + st1 { v2.4s }, [x1] + ret +endfunc + +.macro FFT4_FN name, inv +function ff_tx_fft4_\name\()_float_neon, export=1 + ld1 {v0.4s, v1.4s}, [x2] + +.if \inv == 1 + mov v2.d[0], v0.d[1] + mov v0.d[1], v1.d[1] + mov v1.d[1], v2.d[0] +.endif + + FFT4 v0, v1, 1 + + st1 { v0.4s, v1.4s }, [x1] + ret +endfunc +.endm + +FFT4_FN fwd, 0 +FFT4_FN inv, 1 + +.macro FFT8_FN name, no_perm +function ff_tx_fft8_\name\()_neon, export=1 + SETUP_LUT \no_perm + LOAD_INPUT 0, 1, 2, 3, x2, \no_perm + + LOAD_SUBADD + FFT8 v0, v1, v2, v3 + + zip1 v16.2d, v0.2d, v2.2d + zip2 v17.2d, v0.2d, v2.2d + zip1 v18.2d, v1.2d, v3.2d + zip2 v19.2d, v1.2d, v3.2d + st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x1] + + ret +endfunc +.endm + +FFT8_FN float, 0 +FFT8_FN ns_float, 1 + +.macro FFT16_FN name, no_perm +function ff_tx_fft16_\name\()_neon, export=1 + SETUP_LUT \no_perm + LOAD_INPUT 0, 1, 2, 3, x2, \no_perm + LOAD_INPUT 4, 5, 6, 7, x2, \no_perm + + LOAD_SUBADD + FFT16 v0, v1, v2, v3, v4, v5, v6, v7 + + zip1 v20.2d, v0.2d, v4.2d + zip2 v21.2d, v0.2d, v4.2d + zip1 v22.2d, v1.2d, v6.2d + zip2 v23.2d, v1.2d, v6.2d + st1 { 
v20.4s, v21.4s, v22.4s, v23.4s }, [x1], #64 + + zip1 v24.2d, v2.2d, v5.2d + zip2 v25.2d, v2.2d, v5.2d + zip1 v26.2d, v3.2d, v7.2d + zip2 v27.2d, v3.2d, v7.2d + st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x1] + + ret +endfunc +.endm + +FFT16_FN float, 0 +FFT16_FN ns_float, 1 + +.macro SETUP_SR_RECOMB len, re, im, dec + ldr w5, =(\len - 4*7) + movrel \re, X(ff_tx_tab_\len\()_float) + add \im, \re, x5 + mov \dec, #-32 + +.if \len > 32 + mov x21, #2*\len + add x22, x21, x21, lsl #1 +.endif +.endm + +.macro SR_COMBINE e0, e1, e2, e3, e4, e5, e6, e7, \ + o0, o1, o2, o3, o4, o5, o6, o7, \ + re, im, dec, swap_im, \ + t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, \ + t6=v22, t7=v23, t8=v24, t9=v25, ta=v26, tb=v27 + + ld1 { \t8\().4s, \t9\().4s }, [\im], \dec + ld1 { \t0\().4s, \t1\().4s }, [\re], #32 + +.if \swap_im == 1 + ext \t2\().16b, \t9\().16b, \t9\().16b, #8 + ext \t3\().16b, \t8\().16b, \t8\().16b, #8 +.else + ext \t2\().16b, \t8\().16b, \t8\().16b, #8 + ext \t3\().16b, \t9\().16b, \t9\().16b, #8 +.endif + + trn1 \t4\().4s, \t0\().4s, \t0\().4s // cos0022 + trn2 \t0\().4s, \t0\().4s, \t0\().4s // cos4466 + trn1 \t5\().4s, \t1\().4s, \t1\().4s // cos1133 + trn2 \t1\().4s, \t1\().4s, \t1\().4s // cos5577 + + rev64 \t6\().4s, \o0\().4s // E m2[0,1].imre + rev64 \t7\().4s, \o2\().4s // O m2[0,1].imre + rev64 \t8\().4s, \o4\().4s // E m2[2,3].imre + rev64 \t9\().4s, \o6\().4s // O m2[2,3].imre + + fmul \t6\().4s, \t6\().4s, \t4\().4s // E m2[0,1].imre*t1[0,2] + fmul \t7\().4s, \t7\().4s, \t0\().4s // O m2[0,1].imre*t1[0,2] + fmul \t8\().4s, \t8\().4s, \t4\().4s // E m2[2,3].imre*t1[0,2] + fmul \t9\().4s, \t9\().4s, \t0\().4s // O m2[2,3].imre*t1[0,2] + + rev64 \ta\().4s, \o1\().4s // E m3[0,1].imre + rev64 \tb\().4s, \o3\().4s // O m3[0,1].imre + rev64 \t4\().4s, \o5\().4s // E m3[2,3].imre + rev64 \t0\().4s, \o7\().4s // O m3[2,3].imre + + fmul \ta\().4s, \ta\().4s, \t5\().4s // E m3[0,1].imre*t1[4,6] + fmul \tb\().4s, \tb\().4s, \t1\().4s // O m3[0,1].imre*t1[4,6] + fmul \t4\().4s, \t4\().4s, \t5\().4s // E m3[2,3].imre*t1[4,6] + fmul \t0\().4s, \t0\().4s, \t1\().4s // O m3[2,3].imre*t1[4,6] + + trn1 \t5\().4s, \t3\().4s, \t3\().4s // wim2200 + trn2 \t3\().4s, \t3\().4s, \t3\().4s // wim3311 + trn1 \t1\().4s, \t2\().4s, \t2\().4s // wim6644 + trn2 \t2\().4s, \t2\().4s, \t2\().4s // wim7755 + + fmul \t5\().4s, \t5\().4s, v31.4s + fmul \t3\().4s, \t3\().4s, v31.4s + fmul \t1\().4s, \t1\().4s, v31.4s + fmul \t2\().4s, \t2\().4s, v31.4s + + fmla \t7\().4s, \o2\().4s, \t5\().4s // O w0123 + fmls \t9\().4s, \o6\().4s, \t5\().4s // O j0123 + fmla \t6\().4s, \o0\().4s, \t3\().4s // E w0123 + fmls \t8\().4s, \o4\().4s, \t3\().4s // E j0123 + + fmla \ta\().4s, \o1\().4s, \t2\().4s // E w4567 + fmla \tb\().4s, \o3\().4s, \t1\().4s // O w4567 + fmls \t4\().4s, \o5\().4s, \t2\().4s // E j4567 + fmls \t0\().4s, \o7\().4s, \t1\().4s // O j4567 + + fsub \t2\().4s, \t7\().4s, \t9\().4s + fsub \t1\().4s, \t8\().4s, \t6\().4s + fsub \t3\().4s, \t4\().4s, \ta\().4s + fsub \t5\().4s, \t0\().4s, \tb\().4s + + fadd \t6\().4s, \t8\().4s, \t6\().4s + fadd \t7\().4s, \t9\().4s, \t7\().4s + fadd \t8\().4s, \t4\().4s, \ta\().4s + fadd \t9\().4s, \t0\().4s, \tb\().4s + + fmul \t1\().4s, \t1\().4s, v31.4s + fmul \t2\().4s, \t2\().4s, v31.4s + fmul \t3\().4s, \t3\().4s, v31.4s + fmul \t5\().4s, \t5\().4s, v31.4s + + rev64 \t6\().4s, \t6\().4s + rev64 \t8\().4s, \t8\().4s + rev64 \t7\().4s, \t7\().4s + rev64 \t9\().4s, \t9\().4s + + fsub \o0\().4s, \e0\().4s, \t6\().4s + fsub \o1\().4s, \e1\().4s, \t8\().4s + fsub \o2\().4s, 
\e2\().4s, \t1\().4s + fsub \o3\().4s, \e3\().4s, \t3\().4s + + fsub \o4\().4s, \e4\().4s, \t7\().4s + fsub \o5\().4s, \e6\().4s, \t9\().4s + fadd \o6\().4s, \e5\().4s, \t2\().4s + fsub \o7\().4s, \e7\().4s, \t5\().4s + + fadd \e0\().4s, \e0\().4s, \t6\().4s + fadd \e1\().4s, \e1\().4s, \t8\().4s + fadd \e2\().4s, \e2\().4s, \t1\().4s + fadd \e3\().4s, \e3\().4s, \t3\().4s + + fadd \e4\().4s, \e4\().4s, \t7\().4s + fsub \e5\().4s, \e5\().4s, \t2\().4s // swapped + fadd \e6\().4s, \e6\().4s, \t9\().4s // swapped + fadd \e7\().4s, \e7\().4s, \t5\().4s +.endm + +.macro SR_COMBINE_HALF e0, e1, e2, e3, \ + o0, o1, o2, o3, \ + c0, c1, c2, c3, \ + t0, t1, t2, t3, t4, t5, part + +.if \part == 0 + trn1 \t4\().4s, \c0\().4s, \c0\().4s // cos0022 + trn1 \c1\().4s, \c1\().4s, \c1\().4s // cos1133 +.else + trn2 \t4\().4s, \c0\().4s, \c0\().4s // cos0022 + trn2 \c1\().4s, \c1\().4s, \c1\().4s // cos1133 +.endif +.if \part == 0 + trn2 \t5\().4s, \c2\().4s, \c2\().4s // wim7755 + trn2 \c3\().4s, \c3\().4s, \c3\().4s // wim3311 +.else + trn1 \t5\().4s, \c2\().4s, \c2\().4s // wim7755 + trn1 \c3\().4s, \c3\().4s, \c3\().4s // wim3311 +.endif + + fmul \t5\().4s, \t5\().4s, v31.4s + fmul \c3\().4s, \c3\().4s, v31.4s + + rev64 \t0\().4s, \o0\().4s // E m2[0,1].imre + rev64 \t1\().4s, \o2\().4s // E m2[2,3].imre + rev64 \t2\().4s, \o1\().4s // E m3[0,1].imre + rev64 \t3\().4s, \o3\().4s // E m3[2,3].imre + + fmul \o0\().4s, \o0\().4s, \c3\().4s // E m2[0,1].imre*t1[0,2] + fmul \o1\().4s, \o1\().4s, \t5\().4s // E m3[0,1].imre*t1[4,6] + fmla \o0\().4s, \t0\().4s, \t4\().4s // E w0123 + fmla \o1\().4s, \t2\().4s, \c1\().4s // E w4567 + + fmul \t1\().4s, \t1\().4s, \t4\().4s // E m2[2,3].imre*t1[0,2] + fmul \t3\().4s, \t3\().4s, \c1\().4s // E m3[2,3].imre*t1[4,6] + fmls \t1\().4s, \o2\().4s, \c3\().4s // E j0123 + fmls \t3\().4s, \o3\().4s, \t5\().4s // E j4567 + + fsub \t0\().4s, \t1\().4s, \o0\().4s + fadd \t1\().4s, \t1\().4s, \o0\().4s + fadd \t2\().4s, \t3\().4s, \o1\().4s + fsub \t3\().4s, \t3\().4s, \o1\().4s + + fmul \t0\().4s, \t0\().4s, v31.4s + fmul \t3\().4s, \t3\().4s, v31.4s + + rev64 \t1\().4s, \t1\().4s + rev64 \t2\().4s, \t2\().4s + +.if \part == 0 + fsub \o0\().4s, \e0\().4s, \t1\().4s + fsub \o1\().4s, \e1\().4s, \t2\().4s + fsub \o2\().4s, \e2\().4s, \t0\().4s + fsub \o3\().4s, \e3\().4s, \t3\().4s +.else + fsub \o0\().4s, \e0\().4s, \t1\().4s + fadd \o2\().4s, \e1\().4s, \t2\().4s + fsub \o1\().4s, \e2\().4s, \t0\().4s + fadd \o3\().4s, \e3\().4s, \t3\().4s +.endif + +.if \part == 0 + fadd \e0\().4s, \e0\().4s, \t1\().4s + fadd \e1\().4s, \e1\().4s, \t2\().4s + fadd \e2\().4s, \e2\().4s, \t0\().4s + fadd \e3\().4s, \e3\().4s, \t3\().4s +.else + fadd \e0\().4s, \e0\().4s, \t1\().4s + fsub \e1\().4s, \e1\().4s, \t2\().4s // swapped + fadd \e2\().4s, \e2\().4s, \t0\().4s // swapped + fsub \e3\().4s, \e3\().4s, \t3\().4s +.endif +.endm + +/* Same as SR_COMBINE_HALF, but heroically tries to use 3 temporary registers + * without touching the tables. 
*/ +.macro SR_COMBINE_LITE e0, e1, e2, e3, \ + o0, o1, o2, o3, \ + c0, c1, c2, c3, \ + t0, t1, t2, part + + rev64 \t0\().4s, \o0\().4s // E m2[0,1].imre + rev64 \t1\().4s, \o2\().4s // E m2[2,3].imre +.if \part == 0 + trn2 \t2\().4s, \c3\().4s, \c3\().4s // wim3311 +.else + trn1 \t2\().4s, \c3\().4s, \c3\().4s // wim3311 +.endif + fmul \t2\().4s, \t2\().4s, v31.4s + fmul \o2\().4s, \o2\().4s, \t2\().4s + fmul \o0\().4s, \o0\().4s, \t2\().4s // E m2[0,1].imre*t1[0,2] +.if \part == 0 + trn1 \t2\().4s, \c0\().4s, \c0\().4s // cos0022 +.else + trn2 \t2\().4s, \c0\().4s, \c0\().4s // cos0022 +.endif + fmul \t1\().4s, \t1\().4s, \t2\().4s // E m2[2,3].imre*t1[0,2] + fmla \o0\().4s, \t0\().4s, \t2\().4s // E w0123 + fsub \t1\().4s, \t1\().4s, \o2\().4s // E j0123 + + rev64 \t2\().4s, \o1\().4s // E m3[0,1].imre + rev64 \o2\().4s, \o3\().4s // E m3[2,3].imre + +.if \part == 0 + trn2 \t0\().4s, \c2\().4s, \c2\().4s // wim7755 +.else + trn1 \t0\().4s, \c2\().4s, \c2\().4s // wim7755 +.endif + fmul \t0\().4s, \t0\().4s, v31.4s + + fmul \o1\().4s, \o1\().4s, \t0\().4s // E m3[0,1].imre*t1[4,6] + fmul \o3\().4s, \o3\().4s, \t0\().4s + +.if \part == 0 + trn1 \t0\().4s, \c1\().4s, \c1\().4s // cos1133 +.else + trn2 \t0\().4s, \c1\().4s, \c1\().4s // cos1133 +.endif + fmul \o2\().4s, \o2\().4s, \t0\().4s // E m3[2,3].imre*t1[4,6] + fmla \o1\().4s, \t2\().4s, \t0\().4s // E w4567 + fsub \o2\().4s, \o2\().4s, \o3\().4s // E j4567 + + fsub \t0\().4s, \t1\().4s, \o0\().4s + fadd \o0\().4s, \t1\().4s, \o0\().4s + fadd \t2\().4s, \o2\().4s, \o1\().4s + fsub \t1\().4s, \o2\().4s, \o1\().4s + + fmul \t0\().4s, \t0\().4s, v31.4s + fmul \t1\().4s, \t1\().4s, v31.4s + + rev64 \t2\().4s, \t2\().4s + rev64 \o0\().4s, \o0\().4s + +.if \part == 0 + fsub \o1\().4s, \e1\().4s, \t2\().4s + fsub \o2\().4s, \e2\().4s, \t0\().4s + fsub \o3\().4s, \e3\().4s, \t1\().4s +.else + fadd \o2\().4s, \e1\().4s, \t0\().4s + fsub \o1\().4s, \e2\().4s, \t2\().4s + fadd \o3\().4s, \e3\().4s, \t1\().4s +.endif + +.if \part == 0 + fadd \e1\().4s, \e1\().4s, \t2\().4s + fadd \e2\().4s, \e2\().4s, \t0\().4s + fadd \e3\().4s, \e3\().4s, \t1\().4s +.else + fsub \e1\().4s, \e1\().4s, \t0\().4s // swapped + fadd \e2\().4s, \e2\().4s, \t2\().4s // swapped + fsub \e3\().4s, \e3\().4s, \t1\().4s +.endif + + mov \t1\().16b, \o0\().16b + + fsub \o0\().4s, \e0\().4s, \t1\().4s + fadd \e0\().4s, \e0\().4s, \t1\().4s +.endm + +.macro SR_COMBINE_4 len, part, off + add x10, x1, x21 + add x11, x1, x21, lsl #1 + add x12, x1, x22 + + ldp q0, q1, [x1, #((0 + \part)*32 + \off)] + ldp q4, q5, [x1, #((2 + \part)*32 + \off)] + ldp q2, q3, [x10, #((0 + \part)*32 + \off)] + ldp q6, q7, [x10, #((2 + \part)*32 + \off)] + + ldp q8, q9, [x11, #((0 + \part)*32 + \off)] + ldp q10, q11, [x11, #((2 + \part)*32 + \off)] + ldp q12, q13, [x12, #((0 + \part)*32 + \off)] + ldp q14, q15, [x12, #((2 + \part)*32 + \off)] + + SR_COMBINE v0, v1, v2, v3, v4, v6, v5, v7, \ + v8, v9, v10, v11, v12, v13, v14, v15, \ + x7, x8, x9, 0 + + stp q0, q1, [x1, #((0 + \part)*32 + \off)] + stp q4, q5, [x1, #((2 + \part)*32 + \off)] + stp q2, q3, [x10, #((0 + \part)*32 + \off)] + stp q6, q7, [x10, #((2 + \part)*32 + \off)] + + stp q8, q9, [x11, #((0 + \part)*32 + \off)] + stp q12, q13, [x11, #((2 + \part)*32 + \off)] + stp q10, q11, [x12, #((0 + \part)*32 + \off)] + stp q14, q15, [x12, #((2 + \part)*32 + \off)] +.endm + +.macro SR_COMBINE_FULL len, off=0 + add x10, x1, x21 + add x11, x1, x21, lsl #1 + add x12, x1, x22 + + SR_COMBINE_4 \len, 0, \off + SR_COMBINE_4 \len, 1, \off + SR_COMBINE_4 \len, 
4, \off + SR_COMBINE_4 \len, 5, \off +.endm + +.macro SR_COMBINE_D2 part, off + add x10, x1, #((\part)*32 + \off) + add x11, x14, #((\part)*32 + \off) + add x12, x15, #((\part)*32 + \off) + add x13, x16, #((\part)*32 + \off) + + ldp q0, q1, [x10] + ldp q4, q5, [x10, #(2*32)] + ldp q2, q3, [x11] + ldp q6, q7, [x11, #(2*32)] + + ldp q8, q9, [x12] + ldp q10, q11, [x12, #(2*32)] + ldp q12, q13, [x13] + ldp q14, q15, [x13, #(2*32)] + + SR_COMBINE v0, v1, v2, v3, v4, v6, v5, v7, \ + v8, v9, v10, v11, v12, v13, v14, v15, \ + x7, x8, x9, 0, \ + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27 + + zip1 v16.2d, v0.2d, v4.2d + zip2 v17.2d, v0.2d, v4.2d + zip1 v18.2d, v1.2d, v5.2d + zip2 v19.2d, v1.2d, v5.2d + + zip1 v20.2d, v2.2d, v6.2d + zip2 v21.2d, v2.2d, v6.2d + zip1 v22.2d, v3.2d, v7.2d + zip2 v23.2d, v3.2d, v7.2d + + ldp q0, q1, [x10, #(1*32)] + ldp q4, q5, [x10, #(3*32)] + ldp q2, q3, [x11, #(1*32)] + ldp q6, q7, [x11, #(3*32)] + + st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x10], #64 + st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x11], #64 + + zip1 v20.2d, v8.2d, v12.2d + zip2 v21.2d, v8.2d, v12.2d + zip1 v22.2d, v9.2d, v13.2d + zip2 v23.2d, v9.2d, v13.2d + zip1 v24.2d, v10.2d, v14.2d + zip2 v25.2d, v10.2d, v14.2d + zip1 v26.2d, v11.2d, v15.2d + zip2 v27.2d, v11.2d, v15.2d + + ldp q8, q9, [x12, #(1*32)] + ldp q10, q11, [x12, #(3*32)] + ldp q12, q13, [x13, #(1*32)] + ldp q14, q15, [x13, #(3*32)] + + st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x12], #64 + st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x13], #64 + + SR_COMBINE v0, v1, v2, v3, v4, v6, v5, v7, \ + v8, v9, v10, v11, v12, v13, v14, v15, \ + x7, x8, x9, 0, \ + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27 + + zip1 v16.2d, v0.2d, v4.2d + zip2 v17.2d, v0.2d, v4.2d + zip1 v18.2d, v1.2d, v5.2d + zip2 v19.2d, v1.2d, v5.2d + st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x10] + + zip1 v16.2d, v2.2d, v6.2d + zip2 v17.2d, v2.2d, v6.2d + zip1 v18.2d, v3.2d, v7.2d + zip2 v19.2d, v3.2d, v7.2d + st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x11] + + zip1 v20.2d, v8.2d, v12.2d + zip2 v21.2d, v8.2d, v12.2d + zip1 v22.2d, v9.2d, v13.2d + zip2 v23.2d, v9.2d, v13.2d + st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x12] + + zip1 v24.2d, v10.2d, v14.2d + zip2 v25.2d, v10.2d, v14.2d + zip1 v26.2d, v11.2d, v15.2d + zip2 v27.2d, v11.2d, v15.2d + st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x13] +.endm + +.macro SR_COMBINE_DINT off=0 + add x14, x1, x21 + add x15, x1, x21, lsl #1 + add x16, x1, x22 + + SR_COMBINE_D2 0, \off + SR_COMBINE_D2 4, \off +.endm + +.macro FFT32_FN name, no_perm +function ff_tx_fft32_\name\()_neon, export=1 + stp d14, d15, [sp, #-16*4]! 
+ stp d8, d9, [sp, #16*3] + stp d10, d11, [sp, #16*2] + stp d12, d13, [sp, #16] + + LOAD_SUBADD + SETUP_SR_RECOMB 32, x7, x8, x9 + + SETUP_LUT \no_perm + LOAD_INPUT 0, 1, 2, 3, x2, \no_perm + LOAD_INPUT 4, 5, 6, 7, x2, \no_perm + LOAD_INPUT 8, 9, 10, 11, x2, \no_perm + LOAD_INPUT 12, 13, 14, 15, x2, \no_perm + + FFT8_X2 v8, v9, v10, v11, v12, v13, v14, v15 + FFT16 v0, v1, v2, v3, v4, v5, v6, v7 + + SR_COMBINE v0, v1, v2, v3, v4, v5, v6, v7, \ + v8, v9, v10, v11, v12, v13, v14, v15, \ + x7, x8, x9, 0 + + zip1 v16.2d, v0.2d, v4.2d + zip2 v17.2d, v0.2d, v4.2d + zip1 v18.2d, v1.2d, v6.2d + zip2 v19.2d, v1.2d, v6.2d + st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x1], #64 + + zip1 v20.2d, v2.2d, v5.2d + zip2 v21.2d, v2.2d, v5.2d + zip1 v22.2d, v3.2d, v7.2d + zip2 v23.2d, v3.2d, v7.2d + st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x1], #64 + + zip1 v24.2d, v8.2d, v12.2d + zip2 v25.2d, v8.2d, v12.2d + zip1 v26.2d, v9.2d, v13.2d + zip2 v27.2d, v9.2d, v13.2d + st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x1], #64 + + zip1 v28.2d, v10.2d, v14.2d + zip2 v29.2d, v10.2d, v14.2d + zip1 v30.2d, v11.2d, v15.2d + zip2 v31.2d, v11.2d, v15.2d + st1 { v28.4s, v29.4s, v30.4s, v31.4s }, [x1] + + ldp d12, d13, [sp, #16] + ldp d10, d11, [sp, #16*2] + ldp d8, d9, [sp, #16*3] + ldp d14, d15, [sp], #16*4 + + ret +endfunc +.endm + +FFT32_FN float, 0 +FFT32_FN ns_float, 1 + +.macro cmp_imm reg, imm +.if \imm >= 4096 + cmp \reg, #((\imm)/4096), lsl #12 +.else + cmp \reg, #(\imm) +.endif +.endm + +.macro SR_TRANSFORM_DEF len, next=0 +\len: + stp x20, x30, [sp, #-16]! + mov w20, #(\len/4) + mov x5, #((\len*4) - (\len/1)) + add x1, x1, x5 + bl 32b + mov x5, #((\len*2) - (\len/2)) + add x1, x1, x5 + bl 32b + ldp x20, x30, [sp], #16 + ldr w5, =(\len*6 + \len/2) + sub x1, x1, x5 + + SETUP_SR_RECOMB \len, x7, x8, x9 + +.if \next\() != 0 + cmp_imm w19, \len + b.eq 0f + + mov w5, #(\len/128) +\len\()5: + SR_COMBINE_FULL \len + add x1, x1, 8*32 + subs w5, w5, 1 + b.gt \len\()5b + + cmp_imm w20, \len + b.gt \next\()f + ret +.endif +.endm + +.macro FFT_SPLIT_RADIX_FN name, no_perm +function ff_tx_fft_sr_\name\()_neon, export=1 + stp x21, x22, [sp, #-16*6]! 
+ stp d8, d9, [sp, #16*5] + stp d10, d11, [sp, #16*4] + stp d12, d13, [sp, #16*3] + stp d14, d15, [sp, #16*2] + stp x19, x20, [sp, #16] + + ldr w19, [x0, #0] // global target + mov w20, w19 // local length + + LOAD_SUBADD + SETUP_LUT \no_perm + +32: + SETUP_SR_RECOMB 32, x7, x8, x9 + + LOAD_INPUT 0, 1, 2, 3, x2, \no_perm + LOAD_INPUT 4, 6, 5, 7, x2, \no_perm, 1 + LOAD_INPUT 8, 9, 10, 11, x2, \no_perm + LOAD_INPUT 12, 13, 14, 15, x2, \no_perm + + FFT8_X2 v8, v9, v10, v11, v12, v13, v14, v15 + FFT16 v0, v1, v2, v3, v4, v6, v5, v7 + + SR_COMBINE v0, v1, v2, v3, v4, v6, v5, v7, \ + v8, v9, v10, v11, v12, v13, v14, v15, \ + x7, x8, x9, 0 + + stp q2, q3, [x1, #32*1] + stp q6, q7, [x1, #32*3] + stp q10, q11, [x1, #32*5] + stp q14, q15, [x1, #32*7] + + cmp w20, #32 + b.gt 64f + + stp q0, q1, [x1, #32*0] + stp q4, q5, [x1, #32*2] + stp q8, q9, [x1, #32*4] + stp q12, q13, [x1, #32*6] + + ret +64: + SETUP_SR_RECOMB 64, x7, x8, x9 + + LOAD_INPUT 2, 3, 10, 11, x2, \no_perm, 1 + LOAD_INPUT 6, 14, 7, 15, x2, \no_perm, 1 + + FFT16 v2, v3, v10, v11, v6, v14, v7, v15 + + LOAD_INPUT 16, 17, 18, 19, x2, \no_perm + LOAD_INPUT 20, 22, 21, 23, x2, \no_perm, 1 + + FFT16 v16, v17, v18, v19, v20, v22, v21, v23, \ + v24, v25, v26, v27, v28, v29, v30 + + ld1 { v26.4s, v27.4s }, [x8], x9 + ldp q24, q25, [x7], #32 + + ext v26.16b, v26.16b, v26.16b, #8 + ext v27.16b, v27.16b, v27.16b, #8 + + cmp w19, #64 + b.eq 2f // custom deinterleave + + // TODO: investigate doing the 2 combines like in deinterleave + // TODO: experiment with spilling to gprs and converting to HALF or full + SR_COMBINE_LITE v0, v1, v8, v9, \ + v2, v3, v16, v17, \ + v24, v25, v26, v27, \ + v28, v29, v30, 0 + + stp q0, q1, [x1, #32* 0] + stp q8, q9, [x1, #32* 4] + stp q2, q3, [x1, #32* 8] + stp q16, q17, [x1, #32*12] + + SR_COMBINE_HALF v4, v5, v12, v13, \ + v6, v7, v20, v21, \ + v24, v25, v26, v27, \ + v28, v29, v30, v0, v1, v8, 1 + + stp q4, q20, [x1, #32* 2] + stp q12, q21, [x1, #32* 6] + stp q6, q5, [x1, #32*10] + stp q7, q13, [x1, #32*14] + + ldp q2, q3, [x1, #32*1] + ldp q6, q7, [x1, #32*3] + ldp q12, q13, [x1, #32*5] + ldp q16, q17, [x1, #32*7] + + SR_COMBINE v2, v3, v12, v13, v6, v16, v7, v17, \ + v10, v11, v14, v15, v18, v19, v22, v23, \ + x7, x8, x9, 0, \ + v24, v25, v26, v27, v28, v29, v30, v8, v0, v1, v4, v5 + + stp q2, q3, [x1, #32* 1] + stp q6, q7, [x1, #32* 3] + stp q12, q13, [x1, #32* 5] + stp q16, q17, [x1, #32* 7] + + stp q10, q11, [x1, #32* 9] + stp q18, q19, [x1, #32*11] + stp q14, q15, [x1, #32*13] + stp q22, q23, [x1, #32*15] + + cmp w20, #64 + b.gt 128f + ret +128: + stp x20, x30, [sp, #-16]! + mov w20, #32 + add x1, x1, #16*32 + bl 32b + add x1, x1, #8*32 + bl 32b + ldp x20, x30, [sp], #16 + sub x1, x1, #24*32 + + SETUP_SR_RECOMB 128, x7, x8, x9 + + cmp w19, #128 + b.eq 0f + + SR_COMBINE_FULL 128 + + cmp w20, #128 + b.gt 256f + ret +256: + stp x20, x30, [sp, #-16]! + mov w20, #64 + add x1, x1, #32*32 + bl 32b + add x1, x1, #16*32 + bl 32b + ldp x20, x30, [sp], #16 + sub x1, x1, #48*32 + + SETUP_SR_RECOMB 256, x7, x8, x9 + + cmp w19, #256 + b.eq 0f + + SR_COMBINE_FULL 256 + SR_COMBINE_FULL 256, 8*32 + + cmp w20, #256 + b.gt 512f + ret +512: + stp x20, x30, [sp, #-16]! + mov w20, #128 + add x1, x1, #64*32 + bl 32b + add x1, x1, #32*32 + bl 32b + ldp x20, x30, [sp], #16 + sub x1, x1, #96*32 + + SETUP_SR_RECOMB 512, x7, x8, x9 + + cmp w19, #512 + b.eq 0f + + mov x5, 4 +5125: + SR_COMBINE_FULL 512 + add x1, x1, 8*32 + subs w5, w5, 1 + b.gt 5125b + + cmp w20, #512 + b.gt 1024f + + ret +1024: + stp x20, x30, [sp, #-16]! 
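+ // 1024-point layer: recurse twice into the 32-point entry to produce the two 256-point (len/4) sub-transforms in the upper half, then recombine below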
+ mov w20, #256 + add x1, x1, #96*32 + bl 32b + add x1, x1, #64*32 + bl 32b + ldp x20, x30, [sp], #16 + mov x5, #192*32 + sub x1, x1, x5 + + SETUP_SR_RECOMB 1024, x7, x8, x9 + + cmp w19, #1024 + b.eq 0f + + mov w5, 8 +10245: + SR_COMBINE_FULL 1024 + add x1, x1, 8*32 + subs w5, w5, 1 + b.gt 10245b + + cmp w20, #1024 + b.gt 2048f + + ret + +SR_TRANSFORM_DEF 2048, 4096 +SR_TRANSFORM_DEF 4096, 8192 +SR_TRANSFORM_DEF 8192, 16384 +SR_TRANSFORM_DEF 16384, 32768 +SR_TRANSFORM_DEF 32768, 65536 +SR_TRANSFORM_DEF 65536, 131072 +SR_TRANSFORM_DEF 131072 + +0: // general deinterleave loop + SR_COMBINE_DINT + add x1, x1, #32*8 + subs w19, w19, #32*4 + b.gt 0b + + ldp x19, x20, [sp, #16] + ldp d14, d15, [sp, #16*2] + ldp d12, d13, [sp, #16*3] + ldp d10, d11, [sp, #16*4] + ldp d8, d9, [sp, #16*5] + ldp x21, x22, [sp], #16*6 + + ret + +2: // special case for 64 point deinterleave + mov x10, v23.d[0] + mov x11, v23.d[1] + + SR_COMBINE_LITE v0, v1, v8, v9, \ + v2, v3, v16, v17, \ + v24, v25, v26, v27, \ + v28, v29, v30, 0 + + SR_COMBINE_HALF v4, v5, v12, v13, \ + v6, v7, v20, v21, \ + v24, v25, v26, v27, \ + v28, v29, v30, v23, v24, v26, 1 + + zip1 v23.2d, v0.2d, v4.2d + zip2 v24.2d, v0.2d, v4.2d + zip1 v25.2d, v1.2d, v20.2d + zip2 v26.2d, v1.2d, v20.2d + + zip1 v27.2d, v8.2d, v12.2d + zip2 v28.2d, v8.2d, v12.2d + zip1 v29.2d, v9.2d, v21.2d + zip2 v30.2d, v9.2d, v21.2d + + mov v20.16b, v5.16b + mov v21.16b, v7.16b + mov x12, x1 + add x13, x1, #32* 4 + add x14, x1, #32* 8 + add x15, x1, #32*12 + + zip1 v4.2d, v2.2d, v6.2d + zip2 v5.2d, v2.2d, v6.2d + zip1 v6.2d, v3.2d, v20.2d + zip2 v7.2d, v3.2d, v20.2d + + zip1 v0.2d, v16.2d, v21.2d + zip2 v1.2d, v16.2d, v21.2d + zip1 v2.2d, v17.2d, v13.2d + zip2 v3.2d, v17.2d, v13.2d + + // stp is faster by a little on A53, but this is faster on M1s (theory) + ldp q8, q9, [x1, #32*1] + ldp q12, q13, [x1, #32*5] + + st1 { v23.4s, v24.4s, v25.4s, v26.4s }, [x12], #64 // 32* 0...1 + st1 { v27.4s, v28.4s, v29.4s, v30.4s }, [x13], #64 // 32* 4...5 + st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x14], #64 // 32* 8...9 + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x15], #64 // 32*12..13 + + mov v23.d[0], x10 + mov v23.d[1], x11 + + ldp q6, q7, [x1, #32*3] + ldp q16, q17, [x1, #32*7] + + SR_COMBINE v8, v9, v12, v13, v6, v16, v7, v17, \ + v10, v11, v14, v15, v18, v19, v22, v23, \ + x7, x8, x9, 0, \ + v24, v25, v26, v27, v28, v29, v30, v4, v0, v1, v5, v20 + + zip1 v0.2d, v8.2d, v6.2d + zip2 v1.2d, v8.2d, v6.2d + zip1 v2.2d, v9.2d, v7.2d + zip2 v3.2d, v9.2d, v7.2d + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x12] + + zip1 v4.2d, v12.2d, v16.2d + zip2 v5.2d, v12.2d, v16.2d + zip1 v6.2d, v13.2d, v17.2d + zip2 v7.2d, v13.2d, v17.2d + st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13] + + zip1 v0.2d, v10.2d, v18.2d + zip2 v1.2d, v10.2d, v18.2d + zip1 v2.2d, v11.2d, v19.2d + zip2 v3.2d, v11.2d, v19.2d + st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x14] + + zip1 v4.2d, v14.2d, v22.2d + zip2 v5.2d, v14.2d, v22.2d + zip1 v6.2d, v15.2d, v23.2d + zip2 v7.2d, v15.2d, v23.2d + st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x15] + + ldp x19, x20, [sp, #16] + ldp d14, d15, [sp, #16*2] + ldp d12, d13, [sp, #16*3] + ldp d10, d11, [sp, #16*4] + ldp d8, d9, [sp, #16*5] + ldp x21, x22, [sp], #16*6 + + ret +endfunc +.endm + +FFT_SPLIT_RADIX_FN float, 0 +FFT_SPLIT_RADIX_FN ns_float, 1 |
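The entry points assembled above (ff_tx_fft32_float_neon, ff_tx_fft32_ns_float_neon, ff_tx_fft_sr_float_neon, ff_tx_fft_sr_ns_float_neon) are not called directly by users; they implement single-precision FFT codelets that libavutil's av_tx dispatcher can select at runtime on aarch64 builds with NEON enabled. Below is a minimal sketch of how such a transform is reached through the public av_tx API. The wrapper name, the 1024-point length and the buffer handling are illustrative choices only, and whether the NEON split-radix path actually runs depends on the detected CPU flags and the build configuration.

    #include <libavutil/tx.h>
    #include <libavutil/mem.h>
    #include <libavutil/error.h>

    int run_fft_example(void)
    {
        const int len = 1024;              /* power-of-two size covered by the split-radix kernels */
        AVTXContext *ctx = NULL;
        av_tx_fn fft = NULL;
        float scale = 1.0f;
        AVComplexFloat *in, *out;
        int ret;

        /* Forward single-precision complex FFT; the library picks an
         * implementation (C, NEON, ...) internally. */
        ret = av_tx_init(&ctx, &fft, AV_TX_FLOAT_FFT, 0, len, &scale, 0);
        if (ret < 0)
            return ret;

        in  = av_malloc(len * sizeof(*in));
        out = av_malloc(len * sizeof(*out));
        if (!in || !out) {
            av_free(in);
            av_free(out);
            av_tx_uninit(&ctx);
            return AVERROR(ENOMEM);
        }

        /* ... fill in[0..len-1] with complex samples ... */

        fft(ctx, out, in, sizeof(AVComplexFloat));   /* stride between elements, in bytes */

        av_free(in);
        av_free(out);
        av_tx_uninit(&ctx);
        return 0;
    }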