Diffstat (limited to 'media/ffvpx/libavutil/aarch64/tx_float_neon.S')
 media/ffvpx/libavutil/aarch64/tx_float_neon.S | 1294 ++++++++++++++++++++++++
 1 file changed, 1294 insertions(+), 0 deletions(-)
diff --git a/media/ffvpx/libavutil/aarch64/tx_float_neon.S b/media/ffvpx/libavutil/aarch64/tx_float_neon.S
new file mode 100644
index 0000000000..78e4876d6c
--- /dev/null
+++ b/media/ffvpx/libavutil/aarch64/tx_float_neon.S
@@ -0,0 +1,1294 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+/* Open `doc/transforms.md` to see the code upon which the transforms here
+ * were based.
+ *
+ * File conventions:
+ * GPRs: x0-x3 - arguments, untouched
+ * x4 - Lookup table base pointer
+ * x5-x6 - macro ld1 temps/function scratch
+ * x7-x9 - FFT table state
+ * x10-x17 - lookup table/macro scratch
+ * w19-w20 - current/target length when needed
+ * x21-x22 - len*2, len*6
+ *
+ * Vectors: v0-v7 - coefficients
+ * v8-v15 - coefficients when needed, otherwise untouched
+ * v16-v30 - used as needed
+ * v31 - -1.0, +1.0, -1.0, +1.0. Never touched after loading.
+ *
+ * Stack: backup for v8-v15 and x19-x22 when needed, and transform lengths
+ */
+
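+/* All entry points below follow FFmpeg's av_tx_fn calling convention (an
+ * assumption from the ff_tx_* naming): x0 = AVTXContext *s, x1 = output,
+ * x2 = input, x3 = stride. */
+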
+#define M_SQRT1_2 0.707106781186547524401
+#define COS16_1 0.92387950420379638671875
+#define COS16_3 0.3826834261417388916015625
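+/* The two values above are the single-precision roundings of cos(pi/8) and
+ * cos(3*pi/8), the extra twiddle factors needed by the 16-point transform. */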
+
+/* We only ever load this once at the start, and then live with losing an
+ * entire register, as we need to lug it around everywhere.
+ * Clearly this should be integrated into an fsadd and an fmlsa, but "muh RISC!". */
+const subadd, align=4
+ .float -1.0, 1.0, -1.0, 1.0
+endconst
+
+.macro LOAD_SUBADD
+ movrel x5, subadd
+ ld1 { v31.4s }, [x5]
+.endm
+
+.macro SETUP_LUT no_lut=0
+.if \no_lut == 0
+ ldr x4, [x0, #8]
+.endif
+.endm
+
+.macro LOAD_INPUT dst1, dst2, dst3, dst4, src, no_lut=0, discont=0
+.if \no_lut == 1
+.if \discont == 1
+ ldp q\dst1\(), q\dst2\(), [\src\()]
+ ldp q\dst3\(), q\dst4\(), [\src\(), #32]
+ add \src\(), \src\(), #64
+.else
+ ld1 { v\dst1\().4s, v\dst2\().4s, v\dst3\().4s, v\dst4\().4s }, [\src], #64
+.endif
+.else
+ ldp w10, w11, [x4, #0 ]
+ ldp w12, w13, [x4, #8 ]
+ ldp w14, w15, [x4, #16]
+ ldp w16, w17, [x4, #24]
+
+ add x4, x4, #32
+
+ ldr d\dst1, [\src, x10, lsl #3]
+ add x11, \src, x11, lsl #3
+ ldr d\dst2, [\src, x12, lsl #3]
+ add x13, \src, x13, lsl #3
+ ldr d\dst3, [\src, x14, lsl #3]
+ add x15, \src, x15, lsl #3
+ ldr d\dst4, [\src, x16, lsl #3]
+ add x17, \src, x17, lsl #3
+
+ ld1 { v\dst1\().d }[1], [x11]
+ ld1 { v\dst2\().d }[1], [x13]
+ ld1 { v\dst3\().d }[1], [x15]
+ ld1 { v\dst4\().d }[1], [x17]
+.endif
+.endm
+
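+/* Reference maths for the 4-point butterflies below (textbook radix-2 form,
+ * a guide only - the macro keeps z0,z1 in \e0 and z2,z3 in \o0 and realizes
+ * these steps through lane shuffles):
+ *   r1 = z0 + z2    r2 = z1 + z3    t1 = z0 - z2    t2 = z1 - z3
+ *   X0 = r1 + r2    X2 = r1 - r2    X1 = t1 - i*t2  X3 = t1 + i*t2 */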
+.macro FFT4 e0, o0, standalone
+ fadd v16.4s, \e0\().4s, \o0\().4s // r1..4
+ fsub \e0\().4s, \e0\().4s, \o0\().4s // t1..4
+
+ rev64 v18.4s, \e0\().4s
+
+ zip2 \o0\().2d, v16.2d, \e0\().2d
+ zip1 v17.2d, v16.2d, \e0\().2d
+
+ mov \o0\().d[1], v18.d[1]
+
+ fadd \e0\().4s, v17.4s, \o0\().4s // a1,2 b1,4
+ fsub v16.4s, v17.4s, \o0\().4s // a3,4 b3,2
+
+ mov \o0\().16b, v16.16b // Swap once again...
+ mov \o0\().s[3], \e0\().s[3]
+ mov \e0\().s[3], v16.s[3]
+
+.if \standalone == 0
+ uzp2 \o0\().2d, \e0\().2d, \o0\().2d
+ uzp1 \e0\().2d, \e0\().2d, v16.2d
+.endif
+.endm
+
+const shuf_4pt_x2, align=4
+ .byte 24, 25, 26, 27 // reg2, 3
+ .byte 12, 13, 14, 15 // reg1, 4
+ .byte 8, 9, 10, 11 // reg1, 3
+ .byte 28, 29, 30, 31 // reg2, 4
+endconst
+
+// Identical to FFT4, but does 2 transforms in parallel, with no deinterleaving
+.macro FFT4_X2 e0, o0, e1, o1, \
+ t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22
+
+ fadd \t0\().4s, \e0\().4s, \o0\().4s // r1234
+ fadd \t2\().4s, \e1\().4s, \o1\().4s // r1234
+ fsub \e0\().4s, \e0\().4s, \o0\().4s // t1234
+ fsub \e1\().4s, \e1\().4s, \o1\().4s // t1234
+
+ movrel x5, shuf_4pt_x2
+
+ rev64 \t4\().4s, \e0\().4s
+ rev64 \t5\().4s, \e1\().4s
+
+ zip2 \o0\().2d, \t0\().2d, \e0\().2d // t3,4 r3,4
+ zip2 \o1\().2d, \t2\().2d, \e1\().2d // t3,4 r3,4
+
+ ld1 { \t6\().16b }, [x5]
+
+ mov \o0\().d[1], \t4\().d[1]
+ mov \o1\().d[1], \t5\().d[1]
+
+ zip1 \t1\().2d, \t0\().2d, \e0\().2d // t1,2 r1,2
+ zip1 \t3\().2d, \t2\().2d, \e1\().2d // t1,2 r1,2
+
+ fsub \t4\().4s, \t1\().4s, \o0\().4s // a34 b32
+ fadd \t5\().4s, \t1\().4s, \o0\().4s // a12 b14
+ fsub \t2\().4s, \t3\().4s, \o1\().4s // a34 b32
+ fadd \t3\().4s, \t3\().4s, \o1\().4s // a12 b14
+
+ // TODO: experiment with movs instead of tables here
+ tbl \o0\().16b, { \t4\().16b, \t5\().16b }, \t6\().16b // b1234
+ tbl \o1\().16b, { \t2\().16b, \t3\().16b }, \t6\().16b // b1234
+
+ zip1 \e0\().2d, \t5\().2d, \t4\().2d // a1234
+// zip2 \o0\().2d, \t5\().2d, \t4\().2d // b1432
+ zip1 \e1\().2d, \t3\().2d, \t2\().2d // a1234
+// zip2 \o1\().2d, \t3\().2d, \t2\().2d // b1432
+// rev64 \o0\().4s, \o0\().4s // b4123
+// rev64 \o1\().4s, \o1\().4s // b4123
+// ext \o0\().16b, \o0\().16b, \o0\().16b, #4 // b1234
+// ext \o1\().16b, \o1\().16b, \o1\().16b, #4 // b1234
+.endm
+
+const tab_8pt, align=4
+ .float M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2
+endconst
+
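+/* Reference for the 8-point recombination: with E[k] the 4-point FFT of the
+ * even samples and O[k] that of the odd samples (plain radix-2 form, a guide
+ * only - the code fuses and reorders these steps):
+ *   X[k]   = E[k] + W8^k * O[k]
+ *   X[k+4] = E[k] - W8^k * O[k],    W8 = e^(-2*pi*i/8),
+ * where W8^1 = (1-i)*sqrt(1/2), W8^2 = -i and W8^3 = -(1+i)*sqrt(1/2) supply
+ * the +/-M_SQRT1_2 factors in tab_8pt above. */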
+.macro FFT8 e0, e1, o0, o1, \
+ t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22
+
+ movrel x5, tab_8pt
+
+ fsub \t1\().4s, \e1\().4s, \o1\().4s // j1234
+ fadd \o1\().4s, \e1\().4s, \o1\().4s // k1234
+ fsub \t0\().4s, \e0\().4s, \o0\().4s // r1234
+ fadd \o0\().4s, \e0\().4s, \o0\().4s // q1234
+
+ ld1 { \t5\().4s }, [x5]
+
+ ext \t4\().16b, \o1\().16b, \o1\().16b, #12
+ rev64 \t4\().4s, \t4\().4s
+
+ ext \t2\().16b, \o0\().16b, \t4\().16b, #8 // o0[0,1], o1[3,2]
+ mov \o0\().d[1], \t4\().d[1] // o0[3, 4]; o1[1, 4]
+
+ fsub \e1\().4s, \o0\().4s, \t2\().4s // s34, g43
+ fadd \t2\().4s, \o0\().4s, \t2\().4s // s12, g12
+
+ rev64 \t6\().4s, v31.4s // 1, -1, 1, -1
+ dup \o0\().2d, \t0\().d[0] // r1212
+ dup \o1\().2d, \t0\().d[1] // r3434
+
+ rev64 \t4\().4s, \e1\().4s // xxg34
+ rev64 \o1\().4s, \o1\().4s // r4343
+
+ ext \t6\().16b, v31.16b, \t6\().16b, #8 // -1, 1, 1, -1
+ zip1 \t3\().2d, \t2\().2d, \e1\().2d // s1234
+ zip2 \t2\().2d, \t2\().2d, \t4\().2d // g1234
+
+ fadd \e0\().4s, \t3\().4s, \t2\().4s // out_e1
+ fsub \e1\().4s, \t3\().4s, \t2\().4s // out_e2
+
+ fmul \t1\().4s, \t1\().4s, \t5\().4s // j * +--+M_SQRT1_2
+ fmls \o0\().4s, \o1\().4s, \t6\().4s // z1234
+
+ rev64 \t4\().4s, \t1\().4s // j2143
+ fmla \t1\().4s, \t4\().4s, v31.4s // l2143
+
+ rev64 \t4\().4s, \t1\().4s // l1234
+ ext \t4\().16b, \t4\().16b, \t4\().16b, #8 // l3412
+
+ fmla \t4\().4s, \t1\().4s, v31.4s // t1234
+
+ fadd \o1\().4s, \o0\().4s, \t4\().4s // out_o2
+ fsub \o0\().4s, \o0\().4s, \t4\().4s // out_o1
+.endm
+
+// Identical to FFT8, but does 2 transforms in parallel
+.macro FFT8_X2 e0, e1, o0, o1, e2, e3, o2, o3
+
+ movrel x5, tab_8pt
+
+ fadd v19.4s, \e3\().4s, \o3\().4s // k1234
+ fadd v17.4s, \e1\().4s, \o1\().4s // k1234
+ fadd v18.4s, \e2\().4s, \o2\().4s // q1234
+ fadd v16.4s, \e0\().4s, \o0\().4s // q1234
+
+ ld1 { v23.4s }, [x5]
+
+ ext v22.16b, v19.16b, v19.16b, #12
+ ext v21.16b, v17.16b, v17.16b, #12
+
+ rev64 v22.4s, v22.4s
+ rev64 v21.4s, v21.4s
+
+ ext v19.16b, v18.16b, v22.16b, #8
+ ext v17.16b, v16.16b, v21.16b, #8
+
+ mov v18.d[1], v22.d[1]
+ mov v21.d[0], v16.d[0]
+
+ fadd v22.4s, v18.4s, v19.4s // s12, g12
+ fsub v19.4s, v18.4s, v19.4s // s34, g43
+ fsub v18.4s, v21.4s, v17.4s // s34, g43
+ fadd v16.4s, v21.4s, v17.4s // s12, g12
+
+ fsub \e0\().4s, \e0\().4s, \o0\().4s // r1234
+ fsub v20.4s, \e1\().4s, \o1\().4s // j1234
+ fsub \e2\().4s, \e2\().4s, \o2\().4s // r1234
+ fsub v21.4s, \e3\().4s, \o3\().4s // j1234
+
+ rev64 v24.4s, v31.4s // 1, -1, 1, -1
+ zip1 v17.2d, v16.2d, v18.2d // s1234
+ zip1 \e1\().2d, v22.2d, v19.2d // s1234
+
+ rev64 v18.4s, v18.4s // xxg34
+ rev64 v19.4s, v19.4s // xxg34
+
+ zip2 v16.2d, v16.2d, v18.2d // g1234
+ zip2 \e3\().2d, v22.2d, v19.2d // g1234
+
+ dup \o0\().2d, \e0\().d[0] // r1212
+ dup \o1\().2d, \e0\().d[1] // r3434
+ dup \o2\().2d, \e2\().d[0] // r1212
+ dup \o3\().2d, \e2\().d[1] // r3434
+
+ fadd \e2\().4s, \e1\().4s, \e3\().4s // out_e1
+ fsub \e3\().4s, \e1\().4s, \e3\().4s // out_e2
+ fadd \e0\().4s, v17.4s, v16.4s // out_e1
+ fsub \e1\().4s, v17.4s, v16.4s // out_e2
+
+ ext v24.16b, v31.16b, v24.16b, #8 // -1, 1, 1, -1
+ rev64 \o1\().4s, \o1\().4s // r4343
+ rev64 \o3\().4s, \o3\().4s // r4343
+
+ fmul v19.4s, v20.4s, v23.4s // j * +--+M_SQRT1_2
+ fmul v21.4s, v21.4s, v23.4s // j * +--+M_SQRT1_2
+
+ rev64 v20.4s, v19.4s // j2143
+ rev64 v18.4s, v21.4s // j2143
+
+ fmls \o0\().4s, \o1\().4s, v24.4s // z1234
+ fmls \o2\().4s, \o3\().4s, v24.4s // z1234
+
+ fmla v19.4s, v20.4s, v31.4s // l2143
+ fmla v21.4s, v18.4s, v31.4s // l2143
+
+ rev64 v20.4s, v19.4s // l1234
+ rev64 v18.4s, v21.4s // l1234
+ ext v20.16b, v20.16b, v20.16b, #8 // l3412
+ ext v18.16b, v18.16b, v18.16b, #8 // l3412
+
+ fmla v20.4s, v19.4s, v31.4s // t1234
+ fmla v18.4s, v21.4s, v31.4s // t1234
+
+ fadd \o1\().4s, \o0\().4s, v20.4s // out_o2
+ fadd \o3\().4s, \o2\().4s, v18.4s // out_o2
+ fsub \o0\().4s, \o0\().4s, v20.4s // out_o1
+ fsub \o2\().4s, \o2\().4s, v18.4s // out_o1
+.endm
+
+const tab_16pt, align=4
+ .float -COS16_1, COS16_1, -COS16_3, COS16_3 // Could be +-+- too
+ .float COS16_3, COS16_3, COS16_1, COS16_1
+ .float 1.0, 1.0, M_SQRT1_2, M_SQRT1_2
+endconst
+
+// 16-point FFT
+// t3, t4, t5, t6 must be sequential
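+// Split-radix step: FFT8 over the even half plus two parallel FFT4s over
+// the odd quarters, recombined using the tab_16pt twiddles above.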
+.macro FFT16 e0, e1, e2, e3, o0, o1, o2, o3, \
+ t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22
+
+ FFT8 \e0, \e1, \e2, \e3, \t0, \t1, \t2, \t3, \t4, \t5, \t6
+ FFT4_X2 \o0, \o1, \o2, \o3, \t0, \t1, \t2, \t3, \t4, \t5, \t6
+
+ movrel x5, tab_16pt
+
+ rev64 \t0\().4s, \o0\().4s // z[ 8, 9].imre
+ rev64 \t1\().4s, \o2\().4s // z[10,11].imre
+
+ ins \t0\().d[0], xzr
+ ins \t1\().d[0], xzr
+
+ ld1 { \t4\().4s, \t5\().4s, \t6\().4s }, [x5]
+ // TODO: we could derive either \t4\() or \t5\() from the other, but it seems cheaper to load
+
+ fmla \o2\().4s, \t1\().4s, v31.4s // s[4567]
+ fmls \o0\().4s, \t0\().4s, v31.4s // s[0123]
+
+ fmul \t2\().4s, \o1\().4s, \t4\().4s
+ fmul \t3\().4s, \o3\().4s, \t4\().4s
+
+ rev64 \o3\().4s, \o3\().4s
+ rev64 \o1\().4s, \o1\().4s
+
+ fmla \t3\().4s, \o3\().4s, \t5\().4s // s[12, 13, 14, 15]
+ fmls \t2\().4s, \o1\().4s, \t5\().4s // s[ 8, 9, 10, 11]
+
+ fmul \t1\().4s, \o2\().4s, \t6\().4s // s[4567] * mult
+ fmul \t0\().4s, \o0\().4s, \t6\().4s // s[0123] * mult
+
+ mov \o1\().16b, \t3\().16b
+ mov \o2\().16b, \t1\().16b
+
+ fsub \t3\().4s, \t3\().4s, \t2\().4s // y34, u34
+ fsub \t1\().4s, \t1\().4s, \t0\().4s // w34, x34
+
+ fadd \t2\().4s, \t2\().4s, \o1\().4s // y56, u56
+ rev64 \t3\().4s, \t3\().4s
+ fadd \t0\().4s, \t0\().4s, \o2\().4s // w56, x56
+ rev64 \t1\().4s, \t1\().4s
+
+ fmul \t2\().4s, \t2\().4s, v31.4s
+ fmul \t1\().4s, \t1\().4s, v31.4s
+
+ fadd \o3\().4s, \e3\().4s, \t3\().4s
+ fsub \o2\().4s, \e3\().4s, \t3\().4s
+ fsub \o1\().4s, \e2\().4s, \t2\().4s
+ fadd \o0\().4s, \e2\().4s, \t2\().4s
+
+ fsub \e2\().4s, \e0\().4s, \t0\().4s
+ fadd \e0\().4s, \e0\().4s, \t0\().4s
+ fsub \e3\().4s, \e1\().4s, \t1\().4s
+ fadd \e1\().4s, \e1\().4s, \t1\().4s
+.endm
+
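+/* 2-point transform: X0 = z0 + z1, X1 = z0 - z1. ld2r splats z0 into v0 and
+ * z1 into v1; v2 is assembled as { -z1, z1 }, so a single fsub yields
+ * { z0 + z1, z0 - z1 }. */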
+function ff_tx_fft2_float_neon, export=1
+ ld2r { v0.2d, v1.2d }, [x2]
+
+ fneg v2.2s, v1.2s
+ mov v2.d[1], v1.d[0]
+
+ fsub v2.4s, v0.4s, v2.4s
+
+ st1 { v2.4s }, [x1]
+ ret
+endfunc
+
+.macro FFT4_FN name, inv
+function ff_tx_fft4_\name\()_float_neon, export=1
+ ld1 {v0.4s, v1.4s}, [x2]
+
+.if \inv == 1
+ mov v2.d[0], v0.d[1]
+ mov v0.d[1], v1.d[1]
+ mov v1.d[1], v2.d[0]
+.endif
+
+ FFT4 v0, v1, 1
+
+ st1 { v0.4s, v1.4s }, [x1]
+ ret
+endfunc
+.endm
+
+FFT4_FN fwd, 0
+FFT4_FN inv, 1
+
+.macro FFT8_FN name, no_perm
+function ff_tx_fft8_\name\()_neon, export=1
+ SETUP_LUT \no_perm
+ LOAD_INPUT 0, 1, 2, 3, x2, \no_perm
+
+ LOAD_SUBADD
+ FFT8 v0, v1, v2, v3
+
+ zip1 v16.2d, v0.2d, v2.2d
+ zip2 v17.2d, v0.2d, v2.2d
+ zip1 v18.2d, v1.2d, v3.2d
+ zip2 v19.2d, v1.2d, v3.2d
+ st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x1]
+
+ ret
+endfunc
+.endm
+
+FFT8_FN float, 0
+FFT8_FN ns_float, 1
+
+.macro FFT16_FN name, no_perm
+function ff_tx_fft16_\name\()_neon, export=1
+ SETUP_LUT \no_perm
+ LOAD_INPUT 0, 1, 2, 3, x2, \no_perm
+ LOAD_INPUT 4, 5, 6, 7, x2, \no_perm
+
+ LOAD_SUBADD
+ FFT16 v0, v1, v2, v3, v4, v5, v6, v7
+
+ zip1 v20.2d, v0.2d, v4.2d
+ zip2 v21.2d, v0.2d, v4.2d
+ zip1 v22.2d, v1.2d, v6.2d
+ zip2 v23.2d, v1.2d, v6.2d
+ st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x1], #64
+
+ zip1 v24.2d, v2.2d, v5.2d
+ zip2 v25.2d, v2.2d, v5.2d
+ zip1 v26.2d, v3.2d, v7.2d
+ zip2 v27.2d, v3.2d, v7.2d
+ st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x1]
+
+ ret
+endfunc
+.endm
+
+FFT16_FN float, 0
+FFT16_FN ns_float, 1
+
+.macro SETUP_SR_RECOMB len, re, im, dec
+ ldr w5, =(\len - 4*7)
+ movrel \re, X(ff_tx_tab_\len\()_float)
+ add \im, \re, x5
+ mov \dec, #-32
+
+.if \len > 32
+ mov x21, #2*\len
+ add x22, x21, x21, lsl #1
+.endif
+.endm
+
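+/* Split-radix recombination reference: U is the N/2-point FFT of the even
+ * samples, Z and Z' are the N/4-point FFTs of samples 4m+1 and 4m+3, and
+ * w = e^(-2*pi*i/N). Textbook form, a guide only - the macros below batch
+ * four k values per register and use lane reversals in place of the
+ * multiplications by i:
+ *   X[k]      = U[k]     + (w^k*Z[k] + w^(3k)*Z'[k])
+ *   X[k+N/2]  = U[k]     - (w^k*Z[k] + w^(3k)*Z'[k])
+ *   X[k+N/4]  = U[k+N/4] - i*(w^k*Z[k] - w^(3k)*Z'[k])
+ *   X[k+3N/4] = U[k+N/4] + i*(w^k*Z[k] - w^(3k)*Z'[k]) */
+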
+.macro SR_COMBINE e0, e1, e2, e3, e4, e5, e6, e7, \
+ o0, o1, o2, o3, o4, o5, o6, o7, \
+ re, im, dec, swap_im, \
+ t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, \
+ t6=v22, t7=v23, t8=v24, t9=v25, ta=v26, tb=v27
+
+ ld1 { \t8\().4s, \t9\().4s }, [\im], \dec
+ ld1 { \t0\().4s, \t1\().4s }, [\re], #32
+
+.if \swap_im == 1
+ ext \t2\().16b, \t9\().16b, \t9\().16b, #8
+ ext \t3\().16b, \t8\().16b, \t8\().16b, #8
+.else
+ ext \t2\().16b, \t8\().16b, \t8\().16b, #8
+ ext \t3\().16b, \t9\().16b, \t9\().16b, #8
+.endif
+
+ trn1 \t4\().4s, \t0\().4s, \t0\().4s // cos0022
+ trn2 \t0\().4s, \t0\().4s, \t0\().4s // cos4466
+ trn1 \t5\().4s, \t1\().4s, \t1\().4s // cos1133
+ trn2 \t1\().4s, \t1\().4s, \t1\().4s // cos5577
+
+ rev64 \t6\().4s, \o0\().4s // E m2[0,1].imre
+ rev64 \t7\().4s, \o2\().4s // O m2[0,1].imre
+ rev64 \t8\().4s, \o4\().4s // E m2[2,3].imre
+ rev64 \t9\().4s, \o6\().4s // O m2[2,3].imre
+
+ fmul \t6\().4s, \t6\().4s, \t4\().4s // E m2[0,1].imre*t1[0,2]
+ fmul \t7\().4s, \t7\().4s, \t0\().4s // O m2[0,1].imre*t1[0,2]
+ fmul \t8\().4s, \t8\().4s, \t4\().4s // E m2[2,3].imre*t1[0,2]
+ fmul \t9\().4s, \t9\().4s, \t0\().4s // O m2[2,3].imre*t1[0,2]
+
+ rev64 \ta\().4s, \o1\().4s // E m3[0,1].imre
+ rev64 \tb\().4s, \o3\().4s // O m3[0,1].imre
+ rev64 \t4\().4s, \o5\().4s // E m3[2,3].imre
+ rev64 \t0\().4s, \o7\().4s // O m3[2,3].imre
+
+ fmul \ta\().4s, \ta\().4s, \t5\().4s // E m3[0,1].imre*t1[4,6]
+ fmul \tb\().4s, \tb\().4s, \t1\().4s // O m3[0,1].imre*t1[4,6]
+ fmul \t4\().4s, \t4\().4s, \t5\().4s // E m3[2,3].imre*t1[4,6]
+ fmul \t0\().4s, \t0\().4s, \t1\().4s // O m3[2,3].imre*t1[4,6]
+
+ trn1 \t5\().4s, \t3\().4s, \t3\().4s // wim2200
+ trn2 \t3\().4s, \t3\().4s, \t3\().4s // wim3311
+ trn1 \t1\().4s, \t2\().4s, \t2\().4s // wim6644
+ trn2 \t2\().4s, \t2\().4s, \t2\().4s // wim7755
+
+ fmul \t5\().4s, \t5\().4s, v31.4s
+ fmul \t3\().4s, \t3\().4s, v31.4s
+ fmul \t1\().4s, \t1\().4s, v31.4s
+ fmul \t2\().4s, \t2\().4s, v31.4s
+
+ fmla \t7\().4s, \o2\().4s, \t5\().4s // O w0123
+ fmls \t9\().4s, \o6\().4s, \t5\().4s // O j0123
+ fmla \t6\().4s, \o0\().4s, \t3\().4s // E w0123
+ fmls \t8\().4s, \o4\().4s, \t3\().4s // E j0123
+
+ fmla \ta\().4s, \o1\().4s, \t2\().4s // E w4567
+ fmla \tb\().4s, \o3\().4s, \t1\().4s // O w4567
+ fmls \t4\().4s, \o5\().4s, \t2\().4s // E j4567
+ fmls \t0\().4s, \o7\().4s, \t1\().4s // O j4567
+
+ fsub \t2\().4s, \t7\().4s, \t9\().4s
+ fsub \t1\().4s, \t8\().4s, \t6\().4s
+ fsub \t3\().4s, \t4\().4s, \ta\().4s
+ fsub \t5\().4s, \t0\().4s, \tb\().4s
+
+ fadd \t6\().4s, \t8\().4s, \t6\().4s
+ fadd \t7\().4s, \t9\().4s, \t7\().4s
+ fadd \t8\().4s, \t4\().4s, \ta\().4s
+ fadd \t9\().4s, \t0\().4s, \tb\().4s
+
+ fmul \t1\().4s, \t1\().4s, v31.4s
+ fmul \t2\().4s, \t2\().4s, v31.4s
+ fmul \t3\().4s, \t3\().4s, v31.4s
+ fmul \t5\().4s, \t5\().4s, v31.4s
+
+ rev64 \t6\().4s, \t6\().4s
+ rev64 \t8\().4s, \t8\().4s
+ rev64 \t7\().4s, \t7\().4s
+ rev64 \t9\().4s, \t9\().4s
+
+ fsub \o0\().4s, \e0\().4s, \t6\().4s
+ fsub \o1\().4s, \e1\().4s, \t8\().4s
+ fsub \o2\().4s, \e2\().4s, \t1\().4s
+ fsub \o3\().4s, \e3\().4s, \t3\().4s
+
+ fsub \o4\().4s, \e4\().4s, \t7\().4s
+ fsub \o5\().4s, \e6\().4s, \t9\().4s
+ fadd \o6\().4s, \e5\().4s, \t2\().4s
+ fsub \o7\().4s, \e7\().4s, \t5\().4s
+
+ fadd \e0\().4s, \e0\().4s, \t6\().4s
+ fadd \e1\().4s, \e1\().4s, \t8\().4s
+ fadd \e2\().4s, \e2\().4s, \t1\().4s
+ fadd \e3\().4s, \e3\().4s, \t3\().4s
+
+ fadd \e4\().4s, \e4\().4s, \t7\().4s
+ fsub \e5\().4s, \e5\().4s, \t2\().4s // swapped
+ fadd \e6\().4s, \e6\().4s, \t9\().4s // swapped
+ fadd \e7\().4s, \e7\().4s, \t5\().4s
+.endm
+
+.macro SR_COMBINE_HALF e0, e1, e2, e3, \
+ o0, o1, o2, o3, \
+ c0, c1, c2, c3, \
+ t0, t1, t2, t3, t4, t5, part
+
+.if \part == 0
+ trn1 \t4\().4s, \c0\().4s, \c0\().4s // cos0022
+ trn1 \c1\().4s, \c1\().4s, \c1\().4s // cos1133
+.else
+ trn2 \t4\().4s, \c0\().4s, \c0\().4s // cos0022
+ trn2 \c1\().4s, \c1\().4s, \c1\().4s // cos1133
+.endif
+.if \part == 0
+ trn2 \t5\().4s, \c2\().4s, \c2\().4s // wim7755
+ trn2 \c3\().4s, \c3\().4s, \c3\().4s // wim3311
+.else
+ trn1 \t5\().4s, \c2\().4s, \c2\().4s // wim7755
+ trn1 \c3\().4s, \c3\().4s, \c3\().4s // wim3311
+.endif
+
+ fmul \t5\().4s, \t5\().4s, v31.4s
+ fmul \c3\().4s, \c3\().4s, v31.4s
+
+ rev64 \t0\().4s, \o0\().4s // E m2[0,1].imre
+ rev64 \t1\().4s, \o2\().4s // E m2[2,3].imre
+ rev64 \t2\().4s, \o1\().4s // E m3[0,1].imre
+ rev64 \t3\().4s, \o3\().4s // E m3[2,3].imre
+
+ fmul \o0\().4s, \o0\().4s, \c3\().4s // E m2[0,1].imre*t1[0,2]
+ fmul \o1\().4s, \o1\().4s, \t5\().4s // E m3[0,1].imre*t1[4,6]
+ fmla \o0\().4s, \t0\().4s, \t4\().4s // E w0123
+ fmla \o1\().4s, \t2\().4s, \c1\().4s // E w4567
+
+ fmul \t1\().4s, \t1\().4s, \t4\().4s // E m2[2,3].imre*t1[0,2]
+ fmul \t3\().4s, \t3\().4s, \c1\().4s // E m3[2,3].imre*t1[4,6]
+ fmls \t1\().4s, \o2\().4s, \c3\().4s // E j0123
+ fmls \t3\().4s, \o3\().4s, \t5\().4s // E j4567
+
+ fsub \t0\().4s, \t1\().4s, \o0\().4s
+ fadd \t1\().4s, \t1\().4s, \o0\().4s
+ fadd \t2\().4s, \t3\().4s, \o1\().4s
+ fsub \t3\().4s, \t3\().4s, \o1\().4s
+
+ fmul \t0\().4s, \t0\().4s, v31.4s
+ fmul \t3\().4s, \t3\().4s, v31.4s
+
+ rev64 \t1\().4s, \t1\().4s
+ rev64 \t2\().4s, \t2\().4s
+
+.if \part == 0
+ fsub \o0\().4s, \e0\().4s, \t1\().4s
+ fsub \o1\().4s, \e1\().4s, \t2\().4s
+ fsub \o2\().4s, \e2\().4s, \t0\().4s
+ fsub \o3\().4s, \e3\().4s, \t3\().4s
+.else
+ fsub \o0\().4s, \e0\().4s, \t1\().4s
+ fadd \o2\().4s, \e1\().4s, \t2\().4s
+ fsub \o1\().4s, \e2\().4s, \t0\().4s
+ fadd \o3\().4s, \e3\().4s, \t3\().4s
+.endif
+
+.if \part == 0
+ fadd \e0\().4s, \e0\().4s, \t1\().4s
+ fadd \e1\().4s, \e1\().4s, \t2\().4s
+ fadd \e2\().4s, \e2\().4s, \t0\().4s
+ fadd \e3\().4s, \e3\().4s, \t3\().4s
+.else
+ fadd \e0\().4s, \e0\().4s, \t1\().4s
+ fsub \e1\().4s, \e1\().4s, \t2\().4s // swapped
+ fadd \e2\().4s, \e2\().4s, \t0\().4s // swapped
+ fsub \e3\().4s, \e3\().4s, \t3\().4s
+.endif
+.endm
+
+/* Same as SR_COMBINE_HALF, but heroically tries to use 3 temporary registers
+ * without touching the tables. */
+.macro SR_COMBINE_LITE e0, e1, e2, e3, \
+ o0, o1, o2, o3, \
+ c0, c1, c2, c3, \
+ t0, t1, t2, part
+
+ rev64 \t0\().4s, \o0\().4s // E m2[0,1].imre
+ rev64 \t1\().4s, \o2\().4s // E m2[2,3].imre
+.if \part == 0
+ trn2 \t2\().4s, \c3\().4s, \c3\().4s // wim3311
+.else
+ trn1 \t2\().4s, \c3\().4s, \c3\().4s // wim3311
+.endif
+ fmul \t2\().4s, \t2\().4s, v31.4s
+ fmul \o2\().4s, \o2\().4s, \t2\().4s
+ fmul \o0\().4s, \o0\().4s, \t2\().4s // E m2[0,1].imre*t1[0,2]
+.if \part == 0
+ trn1 \t2\().4s, \c0\().4s, \c0\().4s // cos0022
+.else
+ trn2 \t2\().4s, \c0\().4s, \c0\().4s // cos0022
+.endif
+ fmul \t1\().4s, \t1\().4s, \t2\().4s // E m2[2,3].imre*t1[0,2]
+ fmla \o0\().4s, \t0\().4s, \t2\().4s // E w0123
+ fsub \t1\().4s, \t1\().4s, \o2\().4s // E j0123
+
+ rev64 \t2\().4s, \o1\().4s // E m3[0,1].imre
+ rev64 \o2\().4s, \o3\().4s // E m3[2,3].imre
+
+.if \part == 0
+ trn2 \t0\().4s, \c2\().4s, \c2\().4s // wim7755
+.else
+ trn1 \t0\().4s, \c2\().4s, \c2\().4s // wim7755
+.endif
+ fmul \t0\().4s, \t0\().4s, v31.4s
+
+ fmul \o1\().4s, \o1\().4s, \t0\().4s // E m3[0,1].imre*t1[4,6]
+ fmul \o3\().4s, \o3\().4s, \t0\().4s
+
+.if \part == 0
+ trn1 \t0\().4s, \c1\().4s, \c1\().4s // cos1133
+.else
+ trn2 \t0\().4s, \c1\().4s, \c1\().4s // cos1133
+.endif
+ fmul \o2\().4s, \o2\().4s, \t0\().4s // E m3[2,3].imre*t1[4,6]
+ fmla \o1\().4s, \t2\().4s, \t0\().4s // E w4567
+ fsub \o2\().4s, \o2\().4s, \o3\().4s // E j4567
+
+ fsub \t0\().4s, \t1\().4s, \o0\().4s
+ fadd \o0\().4s, \t1\().4s, \o0\().4s
+ fadd \t2\().4s, \o2\().4s, \o1\().4s
+ fsub \t1\().4s, \o2\().4s, \o1\().4s
+
+ fmul \t0\().4s, \t0\().4s, v31.4s
+ fmul \t1\().4s, \t1\().4s, v31.4s
+
+ rev64 \t2\().4s, \t2\().4s
+ rev64 \o0\().4s, \o0\().4s
+
+.if \part == 0
+ fsub \o1\().4s, \e1\().4s, \t2\().4s
+ fsub \o2\().4s, \e2\().4s, \t0\().4s
+ fsub \o3\().4s, \e3\().4s, \t1\().4s
+.else
+ fadd \o2\().4s, \e1\().4s, \t0\().4s
+ fsub \o1\().4s, \e2\().4s, \t2\().4s
+ fadd \o3\().4s, \e3\().4s, \t1\().4s
+.endif
+
+.if \part == 0
+ fadd \e1\().4s, \e1\().4s, \t2\().4s
+ fadd \e2\().4s, \e2\().4s, \t0\().4s
+ fadd \e3\().4s, \e3\().4s, \t1\().4s
+.else
+ fsub \e1\().4s, \e1\().4s, \t0\().4s // swapped
+ fadd \e2\().4s, \e2\().4s, \t2\().4s // swapped
+ fsub \e3\().4s, \e3\().4s, \t1\().4s
+.endif
+
+ mov \t1\().16b, \o0\().16b
+
+ fsub \o0\().4s, \e0\().4s, \t1\().4s
+ fadd \e0\().4s, \e0\().4s, \t1\().4s
+.endm
+
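+/* One full-width combine pass over the four quarters of the output buffer;
+ * x21 and x22 hold len*2 and len*6 bytes (set up in SETUP_SR_RECOMB), i.e.
+ * the offsets of the second and fourth quarter, with the third at x21 << 1. */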
+.macro SR_COMBINE_4 len, part, off
+ add x10, x1, x21
+ add x11, x1, x21, lsl #1
+ add x12, x1, x22
+
+ ldp q0, q1, [x1, #((0 + \part)*32 + \off)]
+ ldp q4, q5, [x1, #((2 + \part)*32 + \off)]
+ ldp q2, q3, [x10, #((0 + \part)*32 + \off)]
+ ldp q6, q7, [x10, #((2 + \part)*32 + \off)]
+
+ ldp q8, q9, [x11, #((0 + \part)*32 + \off)]
+ ldp q10, q11, [x11, #((2 + \part)*32 + \off)]
+ ldp q12, q13, [x12, #((0 + \part)*32 + \off)]
+ ldp q14, q15, [x12, #((2 + \part)*32 + \off)]
+
+ SR_COMBINE v0, v1, v2, v3, v4, v6, v5, v7, \
+ v8, v9, v10, v11, v12, v13, v14, v15, \
+ x7, x8, x9, 0
+
+ stp q0, q1, [x1, #((0 + \part)*32 + \off)]
+ stp q4, q5, [x1, #((2 + \part)*32 + \off)]
+ stp q2, q3, [x10, #((0 + \part)*32 + \off)]
+ stp q6, q7, [x10, #((2 + \part)*32 + \off)]
+
+ stp q8, q9, [x11, #((0 + \part)*32 + \off)]
+ stp q12, q13, [x11, #((2 + \part)*32 + \off)]
+ stp q10, q11, [x12, #((0 + \part)*32 + \off)]
+ stp q14, q15, [x12, #((2 + \part)*32 + \off)]
+.endm
+
+.macro SR_COMBINE_FULL len, off=0
+ add x10, x1, x21
+ add x11, x1, x21, lsl #1
+ add x12, x1, x22
+
+ SR_COMBINE_4 \len, 0, \off
+ SR_COMBINE_4 \len, 1, \off
+ SR_COMBINE_4 \len, 4, \off
+ SR_COMBINE_4 \len, 5, \off
+.endm
+
+.macro SR_COMBINE_D2 part, off
+ add x10, x1, #((\part)*32 + \off)
+ add x11, x14, #((\part)*32 + \off)
+ add x12, x15, #((\part)*32 + \off)
+ add x13, x16, #((\part)*32 + \off)
+
+ ldp q0, q1, [x10]
+ ldp q4, q5, [x10, #(2*32)]
+ ldp q2, q3, [x11]
+ ldp q6, q7, [x11, #(2*32)]
+
+ ldp q8, q9, [x12]
+ ldp q10, q11, [x12, #(2*32)]
+ ldp q12, q13, [x13]
+ ldp q14, q15, [x13, #(2*32)]
+
+ SR_COMBINE v0, v1, v2, v3, v4, v6, v5, v7, \
+ v8, v9, v10, v11, v12, v13, v14, v15, \
+ x7, x8, x9, 0, \
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27
+
+ zip1 v16.2d, v0.2d, v4.2d
+ zip2 v17.2d, v0.2d, v4.2d
+ zip1 v18.2d, v1.2d, v5.2d
+ zip2 v19.2d, v1.2d, v5.2d
+
+ zip1 v20.2d, v2.2d, v6.2d
+ zip2 v21.2d, v2.2d, v6.2d
+ zip1 v22.2d, v3.2d, v7.2d
+ zip2 v23.2d, v3.2d, v7.2d
+
+ ldp q0, q1, [x10, #(1*32)]
+ ldp q4, q5, [x10, #(3*32)]
+ ldp q2, q3, [x11, #(1*32)]
+ ldp q6, q7, [x11, #(3*32)]
+
+ st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x10], #64
+ st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x11], #64
+
+ zip1 v20.2d, v8.2d, v12.2d
+ zip2 v21.2d, v8.2d, v12.2d
+ zip1 v22.2d, v9.2d, v13.2d
+ zip2 v23.2d, v9.2d, v13.2d
+ zip1 v24.2d, v10.2d, v14.2d
+ zip2 v25.2d, v10.2d, v14.2d
+ zip1 v26.2d, v11.2d, v15.2d
+ zip2 v27.2d, v11.2d, v15.2d
+
+ ldp q8, q9, [x12, #(1*32)]
+ ldp q10, q11, [x12, #(3*32)]
+ ldp q12, q13, [x13, #(1*32)]
+ ldp q14, q15, [x13, #(3*32)]
+
+ st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x12], #64
+ st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x13], #64
+
+ SR_COMBINE v0, v1, v2, v3, v4, v6, v5, v7, \
+ v8, v9, v10, v11, v12, v13, v14, v15, \
+ x7, x8, x9, 0, \
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27
+
+ zip1 v16.2d, v0.2d, v4.2d
+ zip2 v17.2d, v0.2d, v4.2d
+ zip1 v18.2d, v1.2d, v5.2d
+ zip2 v19.2d, v1.2d, v5.2d
+ st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x10]
+
+ zip1 v16.2d, v2.2d, v6.2d
+ zip2 v17.2d, v2.2d, v6.2d
+ zip1 v18.2d, v3.2d, v7.2d
+ zip2 v19.2d, v3.2d, v7.2d
+ st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x11]
+
+ zip1 v20.2d, v8.2d, v12.2d
+ zip2 v21.2d, v8.2d, v12.2d
+ zip1 v22.2d, v9.2d, v13.2d
+ zip2 v23.2d, v9.2d, v13.2d
+ st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x12]
+
+ zip1 v24.2d, v10.2d, v14.2d
+ zip2 v25.2d, v10.2d, v14.2d
+ zip1 v26.2d, v11.2d, v15.2d
+ zip2 v27.2d, v11.2d, v15.2d
+ st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x13]
+.endm
+
+.macro SR_COMBINE_DINT off=0
+ add x14, x1, x21
+ add x15, x1, x21, lsl #1
+ add x16, x1, x22
+
+ SR_COMBINE_D2 0, \off
+ SR_COMBINE_D2 4, \off
+.endm
+
+.macro FFT32_FN name, no_perm
+function ff_tx_fft32_\name\()_neon, export=1
+ stp d14, d15, [sp, #-16*4]!
+ stp d8, d9, [sp, #16*3]
+ stp d10, d11, [sp, #16*2]
+ stp d12, d13, [sp, #16]
+
+ LOAD_SUBADD
+ SETUP_SR_RECOMB 32, x7, x8, x9
+
+ SETUP_LUT \no_perm
+ LOAD_INPUT 0, 1, 2, 3, x2, \no_perm
+ LOAD_INPUT 4, 5, 6, 7, x2, \no_perm
+ LOAD_INPUT 8, 9, 10, 11, x2, \no_perm
+ LOAD_INPUT 12, 13, 14, 15, x2, \no_perm
+
+ FFT8_X2 v8, v9, v10, v11, v12, v13, v14, v15
+ FFT16 v0, v1, v2, v3, v4, v5, v6, v7
+
+ SR_COMBINE v0, v1, v2, v3, v4, v5, v6, v7, \
+ v8, v9, v10, v11, v12, v13, v14, v15, \
+ x7, x8, x9, 0
+
+ zip1 v16.2d, v0.2d, v4.2d
+ zip2 v17.2d, v0.2d, v4.2d
+ zip1 v18.2d, v1.2d, v6.2d
+ zip2 v19.2d, v1.2d, v6.2d
+ st1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x1], #64
+
+ zip1 v20.2d, v2.2d, v5.2d
+ zip2 v21.2d, v2.2d, v5.2d
+ zip1 v22.2d, v3.2d, v7.2d
+ zip2 v23.2d, v3.2d, v7.2d
+ st1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x1], #64
+
+ zip1 v24.2d, v8.2d, v12.2d
+ zip2 v25.2d, v8.2d, v12.2d
+ zip1 v26.2d, v9.2d, v13.2d
+ zip2 v27.2d, v9.2d, v13.2d
+ st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x1], #64
+
+ zip1 v28.2d, v10.2d, v14.2d
+ zip2 v29.2d, v10.2d, v14.2d
+ zip1 v30.2d, v11.2d, v15.2d
+ zip2 v31.2d, v11.2d, v15.2d
+ st1 { v28.4s, v29.4s, v30.4s, v31.4s }, [x1]
+
+ ldp d12, d13, [sp, #16]
+ ldp d10, d11, [sp, #16*2]
+ ldp d8, d9, [sp, #16*3]
+ ldp d14, d15, [sp], #16*4
+
+ ret
+endfunc
+.endm
+
+FFT32_FN float, 0
+FFT32_FN ns_float, 1
+
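+/* cmp on AArch64 only encodes a 12-bit immediate, optionally shifted left
+ * by 12 bits, hence the shifted form for lengths of 4096 and up. */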
+.macro cmp_imm reg, imm
+.if \imm >= 4096
+ cmp \reg, #((\imm)/4096), lsl #12
+.else
+ cmp \reg, #(\imm)
+.endif
+.endm
+
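+/* Emits the body for one transform length: recurse twice into the chain
+ * rooted at label 32, run the split-radix recombination over the result,
+ * then return or fall through towards \next if the target is larger still. */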
+.macro SR_TRANSFORM_DEF len, next=0
+\len:
+ stp x20, x30, [sp, #-16]!
+ mov w20, #(\len/4)
+ mov x5, #((\len*4) - (\len/1))
+ add x1, x1, x5
+ bl 32b
+ mov x5, #((\len*2) - (\len/2))
+ add x1, x1, x5
+ bl 32b
+ ldp x20, x30, [sp], #16
+ ldr w5, =(\len*6 + \len/2)
+ sub x1, x1, x5
+
+ SETUP_SR_RECOMB \len, x7, x8, x9
+
+.if \next\() != 0
+ cmp_imm w19, \len
+ b.eq 0f
+
+ mov w5, #(\len/128)
+\len\()5:
+ SR_COMBINE_FULL \len
+ add x1, x1, 8*32
+ subs w5, w5, 1
+ b.gt \len\()5b
+
+ cmp_imm w20, \len
+ b.gt \next\()f
+ ret
+.endif
+.endm
+
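+/* Monolithic split-radix FFT for lengths of 32 up to 131072. Lengths up to
+ * 1024 are unrolled inline; larger ones come from SR_TRANSFORM_DEF. Label 0
+ * is the generic deinterleave loop, label 2 a 64-point special case. */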
+.macro FFT_SPLIT_RADIX_FN name, no_perm
+function ff_tx_fft_sr_\name\()_neon, export=1
+ stp x21, x22, [sp, #-16*6]!
+ stp d8, d9, [sp, #16*5]
+ stp d10, d11, [sp, #16*4]
+ stp d12, d13, [sp, #16*3]
+ stp d14, d15, [sp, #16*2]
+ stp x19, x20, [sp, #16]
+
+ ldr w19, [x0, #0] // global target
+ mov w20, w19 // local length
+
+ LOAD_SUBADD
+ SETUP_LUT \no_perm
+
+32:
+ SETUP_SR_RECOMB 32, x7, x8, x9
+
+ LOAD_INPUT 0, 1, 2, 3, x2, \no_perm
+ LOAD_INPUT 4, 6, 5, 7, x2, \no_perm, 1
+ LOAD_INPUT 8, 9, 10, 11, x2, \no_perm
+ LOAD_INPUT 12, 13, 14, 15, x2, \no_perm
+
+ FFT8_X2 v8, v9, v10, v11, v12, v13, v14, v15
+ FFT16 v0, v1, v2, v3, v4, v6, v5, v7
+
+ SR_COMBINE v0, v1, v2, v3, v4, v6, v5, v7, \
+ v8, v9, v10, v11, v12, v13, v14, v15, \
+ x7, x8, x9, 0
+
+ stp q2, q3, [x1, #32*1]
+ stp q6, q7, [x1, #32*3]
+ stp q10, q11, [x1, #32*5]
+ stp q14, q15, [x1, #32*7]
+
+ cmp w20, #32
+ b.gt 64f
+
+ stp q0, q1, [x1, #32*0]
+ stp q4, q5, [x1, #32*2]
+ stp q8, q9, [x1, #32*4]
+ stp q12, q13, [x1, #32*6]
+
+ ret
+64:
+ SETUP_SR_RECOMB 64, x7, x8, x9
+
+ LOAD_INPUT 2, 3, 10, 11, x2, \no_perm, 1
+ LOAD_INPUT 6, 14, 7, 15, x2, \no_perm, 1
+
+ FFT16 v2, v3, v10, v11, v6, v14, v7, v15
+
+ LOAD_INPUT 16, 17, 18, 19, x2, \no_perm
+ LOAD_INPUT 20, 22, 21, 23, x2, \no_perm, 1
+
+ FFT16 v16, v17, v18, v19, v20, v22, v21, v23, \
+ v24, v25, v26, v27, v28, v29, v30
+
+ ld1 { v26.4s, v27.4s }, [x8], x9
+ ldp q24, q25, [x7], #32
+
+ ext v26.16b, v26.16b, v26.16b, #8
+ ext v27.16b, v27.16b, v27.16b, #8
+
+ cmp w19, #64
+ b.eq 2f // custom deinterleave
+
+ // TODO: investigate doing the 2 combines like in deinterleave
+ // TODO: experiment with spilling to gprs and converting to HALF or full
+ SR_COMBINE_LITE v0, v1, v8, v9, \
+ v2, v3, v16, v17, \
+ v24, v25, v26, v27, \
+ v28, v29, v30, 0
+
+ stp q0, q1, [x1, #32* 0]
+ stp q8, q9, [x1, #32* 4]
+ stp q2, q3, [x1, #32* 8]
+ stp q16, q17, [x1, #32*12]
+
+ SR_COMBINE_HALF v4, v5, v12, v13, \
+ v6, v7, v20, v21, \
+ v24, v25, v26, v27, \
+ v28, v29, v30, v0, v1, v8, 1
+
+ stp q4, q20, [x1, #32* 2]
+ stp q12, q21, [x1, #32* 6]
+ stp q6, q5, [x1, #32*10]
+ stp q7, q13, [x1, #32*14]
+
+ ldp q2, q3, [x1, #32*1]
+ ldp q6, q7, [x1, #32*3]
+ ldp q12, q13, [x1, #32*5]
+ ldp q16, q17, [x1, #32*7]
+
+ SR_COMBINE v2, v3, v12, v13, v6, v16, v7, v17, \
+ v10, v11, v14, v15, v18, v19, v22, v23, \
+ x7, x8, x9, 0, \
+ v24, v25, v26, v27, v28, v29, v30, v8, v0, v1, v4, v5
+
+ stp q2, q3, [x1, #32* 1]
+ stp q6, q7, [x1, #32* 3]
+ stp q12, q13, [x1, #32* 5]
+ stp q16, q17, [x1, #32* 7]
+
+ stp q10, q11, [x1, #32* 9]
+ stp q18, q19, [x1, #32*11]
+ stp q14, q15, [x1, #32*13]
+ stp q22, q23, [x1, #32*15]
+
+ cmp w20, #64
+ b.gt 128f
+ ret
+128:
+ stp x20, x30, [sp, #-16]!
+ mov w20, #32
+ add x1, x1, #16*32
+ bl 32b
+ add x1, x1, #8*32
+ bl 32b
+ ldp x20, x30, [sp], #16
+ sub x1, x1, #24*32
+
+ SETUP_SR_RECOMB 128, x7, x8, x9
+
+ cmp w19, #128
+ b.eq 0f
+
+ SR_COMBINE_FULL 128
+
+ cmp w20, #128
+ b.gt 256f
+ ret
+256:
+ stp x20, x30, [sp, #-16]!
+ mov w20, #64
+ add x1, x1, #32*32
+ bl 32b
+ add x1, x1, #16*32
+ bl 32b
+ ldp x20, x30, [sp], #16
+ sub x1, x1, #48*32
+
+ SETUP_SR_RECOMB 256, x7, x8, x9
+
+ cmp w19, #256
+ b.eq 0f
+
+ SR_COMBINE_FULL 256
+ SR_COMBINE_FULL 256, 8*32
+
+ cmp w20, #256
+ b.gt 512f
+ ret
+512:
+ stp x20, x30, [sp, #-16]!
+ mov w20, #128
+ add x1, x1, #64*32
+ bl 32b
+ add x1, x1, #32*32
+ bl 32b
+ ldp x20, x30, [sp], #16
+ sub x1, x1, #96*32
+
+ SETUP_SR_RECOMB 512, x7, x8, x9
+
+ cmp w19, #512
+ b.eq 0f
+
+ mov x5, 4
+5125:
+ SR_COMBINE_FULL 512
+ add x1, x1, 8*32
+ subs w5, w5, 1
+ b.gt 5125b
+
+ cmp w20, #512
+ b.gt 1024f
+
+ ret
+1024:
+ stp x20, x30, [sp, #-16]!
+ mov w20, #256
+ add x1, x1, #96*32
+ bl 32b
+ add x1, x1, #64*32
+ bl 32b
+ ldp x20, x30, [sp], #16
+ mov x5, #192*32
+ sub x1, x1, x5
+
+ SETUP_SR_RECOMB 1024, x7, x8, x9
+
+ cmp w19, #1024
+ b.eq 0f
+
+ mov w5, 8
+10245:
+ SR_COMBINE_FULL 1024
+ add x1, x1, 8*32
+ subs w5, w5, 1
+ b.gt 10245b
+
+ cmp w20, #1024
+ b.gt 2048f
+
+ ret
+
+SR_TRANSFORM_DEF 2048, 4096
+SR_TRANSFORM_DEF 4096, 8192
+SR_TRANSFORM_DEF 8192, 16384
+SR_TRANSFORM_DEF 16384, 32768
+SR_TRANSFORM_DEF 32768, 65536
+SR_TRANSFORM_DEF 65536, 131072
+SR_TRANSFORM_DEF 131072
+
+0: // general deinterleave loop
+ SR_COMBINE_DINT
+ add x1, x1, #32*8
+ subs w19, w19, #32*4
+ b.gt 0b
+
+ ldp x19, x20, [sp, #16]
+ ldp d14, d15, [sp, #16*2]
+ ldp d12, d13, [sp, #16*3]
+ ldp d10, d11, [sp, #16*4]
+ ldp d8, d9, [sp, #16*5]
+ ldp x21, x22, [sp], #16*6
+
+ ret
+
+2: // special case for 64 point deinterleave
+ mov x10, v23.d[0]
+ mov x11, v23.d[1]
+
+ SR_COMBINE_LITE v0, v1, v8, v9, \
+ v2, v3, v16, v17, \
+ v24, v25, v26, v27, \
+ v28, v29, v30, 0
+
+ SR_COMBINE_HALF v4, v5, v12, v13, \
+ v6, v7, v20, v21, \
+ v24, v25, v26, v27, \
+ v28, v29, v30, v23, v24, v26, 1
+
+ zip1 v23.2d, v0.2d, v4.2d
+ zip2 v24.2d, v0.2d, v4.2d
+ zip1 v25.2d, v1.2d, v20.2d
+ zip2 v26.2d, v1.2d, v20.2d
+
+ zip1 v27.2d, v8.2d, v12.2d
+ zip2 v28.2d, v8.2d, v12.2d
+ zip1 v29.2d, v9.2d, v21.2d
+ zip2 v30.2d, v9.2d, v21.2d
+
+ mov v20.16b, v5.16b
+ mov v21.16b, v7.16b
+ mov x12, x1
+ add x13, x1, #32* 4
+ add x14, x1, #32* 8
+ add x15, x1, #32*12
+
+ zip1 v4.2d, v2.2d, v6.2d
+ zip2 v5.2d, v2.2d, v6.2d
+ zip1 v6.2d, v3.2d, v20.2d
+ zip2 v7.2d, v3.2d, v20.2d
+
+ zip1 v0.2d, v16.2d, v21.2d
+ zip2 v1.2d, v16.2d, v21.2d
+ zip1 v2.2d, v17.2d, v13.2d
+ zip2 v3.2d, v17.2d, v13.2d
+
+ // stp is slightly faster on the A53, but this is faster on M1s (in theory)
+ ldp q8, q9, [x1, #32*1]
+ ldp q12, q13, [x1, #32*5]
+
+ st1 { v23.4s, v24.4s, v25.4s, v26.4s }, [x12], #64 // 32* 0...1
+ st1 { v27.4s, v28.4s, v29.4s, v30.4s }, [x13], #64 // 32* 4...5
+ st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x14], #64 // 32* 8...9
+ st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x15], #64 // 32*12..13
+
+ mov v23.d[0], x10
+ mov v23.d[1], x11
+
+ ldp q6, q7, [x1, #32*3]
+ ldp q16, q17, [x1, #32*7]
+
+ SR_COMBINE v8, v9, v12, v13, v6, v16, v7, v17, \
+ v10, v11, v14, v15, v18, v19, v22, v23, \
+ x7, x8, x9, 0, \
+ v24, v25, v26, v27, v28, v29, v30, v4, v0, v1, v5, v20
+
+ zip1 v0.2d, v8.2d, v6.2d
+ zip2 v1.2d, v8.2d, v6.2d
+ zip1 v2.2d, v9.2d, v7.2d
+ zip2 v3.2d, v9.2d, v7.2d
+ st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x12]
+
+ zip1 v4.2d, v12.2d, v16.2d
+ zip2 v5.2d, v12.2d, v16.2d
+ zip1 v6.2d, v13.2d, v17.2d
+ zip2 v7.2d, v13.2d, v17.2d
+ st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13]
+
+ zip1 v0.2d, v10.2d, v18.2d
+ zip2 v1.2d, v10.2d, v18.2d
+ zip1 v2.2d, v11.2d, v19.2d
+ zip2 v3.2d, v11.2d, v19.2d
+ st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x14]
+
+ zip1 v4.2d, v14.2d, v22.2d
+ zip2 v5.2d, v14.2d, v22.2d
+ zip1 v6.2d, v15.2d, v23.2d
+ zip2 v7.2d, v15.2d, v23.2d
+ st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x15]
+
+ ldp x19, x20, [sp, #16]
+ ldp d14, d15, [sp, #16*2]
+ ldp d12, d13, [sp, #16*3]
+ ldp d10, d11, [sp, #16*4]
+ ldp d8, d9, [sp, #16*5]
+ ldp x21, x22, [sp], #16*6
+
+ ret
+endfunc
+.endm
+
+FFT_SPLIT_RADIX_FN float, 0
+FFT_SPLIT_RADIX_FN ns_float, 1