Diffstat (limited to 'media/ffvpx/libavutil/x86/tx_float.asm')
-rw-r--r--  media/ffvpx/libavutil/x86/tx_float.asm  1936
1 file changed, 1936 insertions, 0 deletions
diff --git a/media/ffvpx/libavutil/x86/tx_float.asm b/media/ffvpx/libavutil/x86/tx_float.asm
new file mode 100644
index 0000000000..e1533a8595
--- /dev/null
+++ b/media/ffvpx/libavutil/x86/tx_float.asm
@@ -0,0 +1,1936 @@
+;******************************************************************************
+;* Copyright (c) Lynne
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+; Open `doc/transforms.md` to see the code upon which the transforms here are
+; based, and to compare.
+
+; Intra-asm call convention:
+; 320 bytes of stack available
+; 14 GPRs available (last 4 must not be clobbered)
+; Additionally, don't clobber ctx, in, out, stride, len, lut
+; All vector regs available
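+;
+; The `_ns` wrappers defined below illustrate this convention: they load any
+; state the transform needs (e.g. len and lut from the context) and then
+; `call mangle(...)` straight into the matching `_asm` entry point.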
+
+; TODO:
+; carry over registers from smaller transforms to save on ~8 loads/stores
+; check if vinsertf128 could be faster than vperm2f128 for duplication
+; even faster FFT8 (the current one is heavily optimized for instruction count)
+; replace some xors with blends + addsubs?
+; replace some shuffles with vblends?
+; avx512 split-radix
+
+%include "libavutil/x86/x86util.asm"
+
+%define private_prefix ff_tx
+
+%if ARCH_X86_64
+%define ptr resq
+%else
+%define ptr resd
+%endif
+
+%assign i 16
+%rep 14
+cextern tab_ %+ i %+ _float ; ff_tab_16_float, ff_tab_32_float, ... ff_tab_131072_float
+%assign i (i << 1)
+%endrep
+
+cextern tab_53_float
+
+struc AVTXContext
+ .len: resd 1 ; Length
+ .inv: resd 1 ; Inverse flag
+ .map: ptr 1 ; Lookup table(s)
+ .exp: ptr 1 ; Exponentiation factors
+ .tmp: ptr 1 ; Temporary data
+
+ .sub: ptr 1 ; Subcontexts
+ .fn: ptr 4 ; Subcontext functions
+ .nb_sub: resd 1 ; Subcontext count
+
+ ; Everything else is inaccessible
+endstruc
+
+SECTION_RODATA 32
+
+%define POS 0x00000000
+%define NEG 0x80000000
+
+%define M_SQRT1_2 0.707106781186547524401
+%define COS16_1 0.92387950420379638671875
+%define COS16_3 0.3826834261417388916015625
+
+d8_mult_odd: dd M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, \
+ M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2
+
+s8_mult_odd: dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
+s8_perm_even: dd 1, 3, 0, 2, 1, 3, 2, 0
+s8_perm_odd1: dd 3, 3, 1, 1, 1, 1, 3, 3
+s8_perm_odd2: dd 1, 2, 0, 3, 1, 0, 0, 1
+
+s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
+s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, COS16_3, -COS16_3
+s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
+s16_perm: dd 0, 1, 2, 3, 1, 0, 3, 2
+
+s15_perm: dd 0, 6, 5, 3, 2, 4, 7, 1
+
+mask_mmppmmmm: dd NEG, NEG, POS, POS, NEG, NEG, NEG, NEG
+mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
+mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
+mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
+mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG
+mask_pmmppmmp: dd POS, NEG, NEG, POS, POS, NEG, NEG, POS
+mask_pmpmpmpm: times 4 dd POS, NEG
+
+SECTION .text
+
+; Load complex values (64 bits) via a lookup table
+; %1 - output register
+; %2 - GPR of base input memory address
+; %3 - GPR of LUT (int32_t indices) address
+; %4 - LUT offset
+; %5 - temporary GPR (only used if vgather is not used)
+; %6 - temporary register (for avx only)
+; %7 - temporary register (for avx only; when present, enables the vgatherdpd path on AVX2)
+%macro LOAD64_LUT 5-7
+%if %0 > 6 && cpuflag(avx2)
+ pcmpeqd %7, %7 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
+ movupd xmm%6, [%3 + %4] ; float mov since vgatherdpd is a float instruction
+ vgatherdpd %1, [%2 + xmm%6*8], %7 ; must use separate registers for args
+%else
+ mov %5d, [%3 + %4 + 0]
+ movsd xmm%1, [%2 + %5q*8]
+%if sizeof%1 > 16 && %0 > 5
+ mov %5d, [%3 + %4 + 8]
+ movsd xmm%6, [%2 + %5q*8]
+%endif
+ mov %5d, [%3 + %4 + 4]
+ movhps xmm%1, [%2 + %5q*8]
+%if sizeof%1 > 16 && %0 > 5
+ mov %5d, [%3 + %4 + 12]
+ movhps xmm%6, [%2 + %5q*8]
+ vinsertf128 %1, %1, xmm%6, 1
+%endif
+%endif
+%endmacro
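+; In C terms, the macro above gathers 2 (xmm) or 4 (ymm) complex floats
+; (a sketch, assuming the usual { float re, im; } complex layout):
+;     for (int i = 0; i < n; i++)
+;         dst[i] = in[lut[offset/4 + i]];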
+
+; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode)
+; %1 - coefficients (r0.reim, r1.reim)
+; %2 - temporary
+%macro FFT2 2
+ shufps %2, %1, %1, q3322
+ shufps %1, %1, %1, q1100
+
+ addsubps %1, %1, %2
+
+ shufps %1, %1, %1, q2031
+%endmacro
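+; Reference for the macro above (a sketch of what the shuffles compute):
+;     out[0] = r0 + r1;   // complex add, both .re and .im lanes
+;     out[1] = r0 - r1;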
+
+; Single 4-point in-place complex FFT (will do 2 transforms at once in [AVX] mode)
+; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
+; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
+; %3 - temporary
+%macro FFT4 3
+ subps %3, %1, %2 ; r1234, [r5678]
+ addps %1, %1, %2 ; t1234, [t5678]
+
+ shufps %2, %1, %3, q1010 ; t12, r12
+ shufps %1, %1, %3, q2332 ; t34, r43
+
+ subps %3, %2, %1 ; a34, b32
+ addps %2, %2, %1 ; a12, b14
+
+ shufps %1, %2, %3, q1010 ; a1234 even
+
+ shufps %2, %2, %3, q2332 ; b1423
+ shufps %2, %2, %2, q1320 ; b1234 odd
+%endmacro
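+; Reference 4-point DFT for the macro above (a sketch, forward direction;
+; the inverse variants permute the inputs instead, and the final shuffles
+; produce the even/odd order documented above):
+;     t0 = in0 + in2;          t1 = in1 + in3;
+;     t2 = in0 - in2;          t3 = -I*(in1 - in3);
+;     out0 = t0 + t1;          out2 = t0 - t1;
+;     out1 = t2 + t3;          out3 = t2 - t3;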
+
+; Single/Dual 8-point in-place complex FFT (will do 2 transforms in [AVX] mode)
+; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim])
+; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim])
+; %3 - odd coefficients (a1.reim, a3.reim, [b1.reim, b3.reim])
+; %4 - odd coefficients (a5.reim, a7.reim, [b5.reim, b7.reim])
+; %5 - temporary
+; %6 - temporary
+%macro FFT8 6
+ addps %5, %1, %3 ; q1-8
+ addps %6, %2, %4 ; k1-8
+
+ subps %1, %1, %3 ; r1-8
+ subps %2, %2, %4 ; j1-8
+
+ shufps %4, %1, %1, q2323 ; r4343
+ shufps %3, %5, %6, q3032 ; q34, k14
+
+ shufps %1, %1, %1, q1010 ; r1212
+ shufps %5, %5, %6, q1210 ; q12, k32
+
+ xorps %4, %4, [mask_pmmppmmp] ; r4343 * pmmp
+ addps %6, %5, %3 ; s12, g12
+
+ mulps %2, %2, [d8_mult_odd] ; r8 * d8_mult_odd
+ subps %5, %5, %3 ; s34, g43
+
+ addps %3, %1, %4 ; z1234
+ unpcklpd %1, %6, %5 ; s1234
+
+ shufps %4, %2, %2, q2301 ; j2143
+ shufps %6, %6, %5, q2332 ; g1234
+
+ addsubps %2, %2, %4 ; l2143
+ shufps %5, %2, %2, q0123 ; l3412
+ addsubps %5, %5, %2 ; t1234
+
+ subps %2, %1, %6 ; h1234 even
+ subps %4, %3, %5 ; u1234 odd
+
+ addps %1, %1, %6 ; w1234 even
+ addps %3, %3, %5 ; o1234 odd
+%endmacro
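+; Reference for the macro above (a sketch): a radix-2 step over two 4-point
+; halves; the only non-trivial N=8 twiddles are (+-sqrt(1/2) +- I*sqrt(1/2)),
+; hence the d8_mult_odd constant.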
+
+; Single 8-point in-place complex FFT in 20 instructions
+; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
+; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
+; %3 - temporary
+; %4 - temporary
+%macro FFT8_AVX 4
+ subps %3, %1, %2 ; r1234, r5678
+ addps %1, %1, %2 ; q1234, q5678
+
+ vpermilps %2, %3, [s8_perm_odd1] ; r4422, r6688
+ shufps %4, %1, %1, q3322 ; q1122, q5566
+
+ movsldup %3, %3 ; r1133, r5577
+ shufps %1, %1, %1, q1100 ; q3344, q7788
+
+ addsubps %3, %3, %2 ; z1234, z5678
+ addsubps %1, %1, %4 ; s3142, s7586
+
+ mulps %3, %3, [s8_mult_odd] ; z * s8_mult_odd
+ vpermilps %1, %1, [s8_perm_even] ; s1234, s5687 !
+
+ shufps %2, %3, %3, q2332 ; junk, z7887
+ xorps %4, %1, [mask_mmmmpppm] ; e1234, e5687 !
+
+ vpermilps %3, %3, [s8_perm_odd2] ; z2314, z6556
+ vperm2f128 %1, %1, %4, 0x03 ; e5687, s1234
+
+ addsubps %2, %2, %3 ; junk, t5678
+ subps %1, %1, %4 ; w1234, w5678 even
+
+ vperm2f128 %2, %2, %2, 0x11 ; t5678, t5678
+ vperm2f128 %3, %3, %3, 0x00 ; z2314, z2314
+
+ xorps %2, %2, [mask_ppmpmmpm] ; t * ppmpmmpm
+ addps %2, %3, %2 ; u1234, u5678 odd
+%endmacro
+
+; Single 16-point in-place complex FFT
+; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
+; %2 - even coefficients (r8.reim, r10.reim, r12.reim, r14.reim)
+; %3 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
+; %4 - odd coefficients (r9.reim, r11.reim, r13.reim, r15.reim)
+; %5, %6 - temporary
+; %7, %8 - temporary (optional)
+%macro FFT16 6-8
+ FFT4 %3, %4, %5
+%if %0 > 7
+ FFT8_AVX %1, %2, %6, %7
+ movaps %8, [mask_mpmppmpm]
+ movaps %7, [s16_perm]
+%define mask %8
+%define perm %7
+%elif %0 > 6
+ FFT8_AVX %1, %2, %6, %7
+ movaps %7, [s16_perm]
+%define mask [mask_mpmppmpm]
+%define perm %7
+%else
+ FFT8_AVX %1, %2, %6, %5
+%define mask [mask_mpmppmpm]
+%define perm [s16_perm]
+%endif
+ xorps %5, %5, %5 ; 0
+
+ shufps %6, %4, %4, q2301 ; z12.imre, z13.imre...
+ shufps %5, %5, %3, q2301 ; 0, 0, z8.imre...
+
+ mulps %4, %4, [s16_mult_odd1] ; z.reim * costab
+ xorps %5, %5, [mask_mppmmpmp]
+%if cpuflag(fma3)
+ fmaddps %6, %6, [s16_mult_odd2], %4 ; s[8..15]
+ addps %5, %3, %5 ; s[0...7]
+%else
+ mulps %6, %6, [s16_mult_odd2] ; z.imre * costab
+
+ addps %5, %3, %5 ; s[0...7]
+ addps %6, %4, %6 ; s[8..15]
+%endif
+ mulps %5, %5, [s16_mult_even] ; s[0...7]*costab
+
+ xorps %4, %6, mask ; s[8..15]*mpmppmpm
+ xorps %3, %5, mask ; s[0...7]*mpmppmpm
+
+ vperm2f128 %4, %4, %4, 0x01 ; s[12..15, 8..11]
+ vperm2f128 %3, %3, %3, 0x01 ; s[4..7, 0..3]
+
+ addps %6, %6, %4 ; y56, u56, y34, u34
+ addps %5, %5, %3 ; w56, x56, w34, x34
+
+ vpermilps %6, %6, perm ; y56, u56, y43, u43
+ vpermilps %5, %5, perm ; w56, x56, w43, x43
+
+ subps %4, %2, %6 ; odd part 2
+ addps %3, %2, %6 ; odd part 1
+
+ subps %2, %1, %5 ; even part 2
+ addps %1, %1, %5 ; even part 1
+%undef mask
+%undef perm
+%endmacro
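+; The macro above is a split-radix step (a sketch): FFT8 on the even inputs
+; (%1/%2), two FFT4s at once on the odd inputs (%3/%4), recombined via the
+; s16_mult_even/odd1/odd2 constants.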
+
+; Single 15-point complex FFT
+; Input:
+; xm0 must contain in[0,1].reim
+; m2 - in[3-6].reim
+; m3 - in[7-10].reim
+; m4 - in[11-14].reim
+; xm5 must contain in[2].reimreim
+;
+; Output:
+; m0, m1, m2 - ACs
+; xm14 - out[0]
+; xm15 - out[10, 5]
+%macro FFT15 0
+ shufps xm1, xm0, xm0, q3223 ; in[1].imrereim
+ shufps xm0, xm0, xm0, q1001 ; in[0].imrereim
+
+ xorps xm1, xm11
+ addps xm1, xm0 ; pc[0,1].imre
+
+ shufps xm0, xm1, xm1, q3232 ; pc[1].reimreim
+ addps xm0, xm5 ; dc[0].reimreim
+
+ mulps xm1, xm9 ; tab[0123]*pc[01]
+
+ shufpd xm6, xm1, xm1, 01b ; pc[1,0].reim
+ xorps xm1, xm11
+ addps xm1, xm1, xm6
+ addsubps xm1, xm5, xm1 ; dc[1,2].reim
+
+ subps m7, m2, m3 ; q[0-3].imre
+ addps m6, m2, m3 ; q[4-7]
+ shufps m7, m7, m7, q2301 ; q[0-3].reim
+
+ addps m5, m4, m6 ; y[0-3]
+
+ vperm2f128 m14, m9, m9, 0x11 ; tab[23232323]
+ vbroadcastsd m15, xm9 ; tab[01010101]
+
+ mulps m6, m14
+ mulps m7, m15
+
+ subps m2, m6, m7 ; k[0-3]
+ addps m3, m6, m7 ; k[4-7]
+
+ shufps m12, m11, m11, q3232 ; ppppmmmm
+
+ addsubps m6, m4, m2 ; k[0-3]
+ addsubps m7, m4, m3 ; k[4-7]
+
+ ; 15pt from here on
+ vpermpd m2, m5, q0123 ; y[3-0]
+ vpermpd m3, m6, q0123 ; k[3-0]
+ vpermpd m4, m7, q0123 ; k[7-4]
+
+ xorps m5, m12
+ xorps m6, m12
+ xorps m7, m12
+
+ addps m2, m5 ; t[0-3]
+ addps m3, m6 ; t[4-7]
+ addps m4, m7 ; t[8-11]
+
+ movlhps xm14, xm2 ; out[0]
+ unpcklpd xm15, xm3, xm4 ; out[10,5]
+ unpckhpd xm5, xm3, xm4 ; out[10,5]
+
+ addps xm14, xm2 ; out[0]
+ addps xm15, xm5 ; out[10,5]
+ addps xm14, xm0 ; out[0]
+ addps xm15, xm1 ; out[10,5]
+
+ shufps m12, m10, m10, q3232 ; tab5 4 5 4 5 8 9 8 9
+ shufps m13, m10, m10, q1010 ; tab5 6 7 6 7 10 11 10 11
+
+ mulps m5, m2, m12 ; t[0-3]
+ mulps m6, m3, m12 ; t[4-7]
+ mulps m7, m4, m12 ; t[8-11]
+
+ mulps m2, m13 ; r[0-3]
+ mulps m3, m13 ; r[4-7]
+ mulps m4, m13 ; r[8-11]
+
+ shufps m5, m5, m5, q1032 ; t[1,0,3,2].reim
+ shufps m6, m6, m6, q1032 ; t[5,4,7,6].reim
+ shufps m7, m7, m7, q1032 ; t[9,8,11,10].reim
+
+ vperm2f128 m13, m11, m11, 0x01 ; mmmmmmpp
+ shufps m12, m11, m11, q3232 ; ppppmmmm
+
+ xorps m5, m13
+ xorps m6, m13
+ xorps m7, m13
+
+ addps m2, m5 ; r[0,1,2,3]
+ addps m3, m6 ; r[4,5,6,7]
+ addps m4, m7 ; r[8,9,10,11]
+
+ shufps m5, m2, m2, q2301
+ shufps m6, m3, m3, q2301
+ shufps m7, m4, m4, q2301
+
+ xorps m2, m12
+ xorps m3, m12
+ xorps m4, m12
+
+ vpermpd m5, m5, q0123
+ vpermpd m6, m6, q0123
+ vpermpd m7, m7, q0123
+
+ addps m5, m2
+ addps m6, m3
+ addps m7, m4
+
+ vpermps m5, m8, m5
+ vpermps m6, m8, m6
+ vpermps m7, m8, m7
+
+ vbroadcastsd m0, xm0 ; dc[0]
+ vpermpd m2, m1, q1111 ; dc[2]
+ vbroadcastsd m1, xm1 ; dc[1]
+
+ addps m0, m5
+ addps m1, m6
+ addps m2, m7
+%endmacro
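+; The macro above is, roughly, a 3*5 prime-factor decomposition (a sketch):
+; five 3-point transforms (the tab3 part) feeding three 5-point transforms
+; (the tab5 part), with the AC outputs left permuted as documented above.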
+
+; Combines m0...m8 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs
+; Uses all 16 registers.
+; Output is slightly permuted such that tx2,3's coefficients are interleaved
+; on a 2-point basis (look at `doc/transforms.md`)
+%macro SPLIT_RADIX_COMBINE 17
+%if %1 && mmsize == 32
+ vperm2f128 %14, %6, %7, 0x20 ; m2[0], m2[1], m3[0], m3[1] even
+ vperm2f128 %16, %9, %8, 0x20 ; m2[0], m2[1], m3[0], m3[1] odd
+ vperm2f128 %15, %6, %7, 0x31 ; m2[2], m2[3], m3[2], m3[3] even
+ vperm2f128 %17, %9, %8, 0x31 ; m2[2], m2[3], m3[2], m3[3] odd
+%endif
+
+ shufps %12, %10, %10, q2200 ; cos00224466
+ shufps %13, %11, %11, q1133 ; wim77553311
+ movshdup %10, %10 ; cos11335577
+ shufps %11, %11, %11, q0022 ; wim66442200
+
+%if %1 && mmsize == 32
+ shufps %6, %14, %14, q2301 ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
+ shufps %8, %16, %16, q2301 ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
+ shufps %7, %15, %15, q2301 ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
+ shufps %9, %17, %17, q2301 ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd
+
+ mulps %14, %14, %13 ; m2[0123]reim * wim7531 even
+ mulps %16, %16, %11 ; m2[0123]reim * wim7531 odd
+ mulps %15, %15, %13 ; m3[0123]reim * wim7531 even
+ mulps %17, %17, %11 ; m3[0123]reim * wim7531 odd
+%else
+ mulps %14, %6, %13 ; m2,3[01]reim * wim7531 even
+ mulps %16, %8, %11 ; m2,3[01]reim * wim7531 odd
+ mulps %15, %7, %13 ; m2,3[23]reim * wim7531 even
+ mulps %17, %9, %11 ; m2,3[23]reim * wim7531 odd
+ ; reorder the multiplies to save movs reg, reg in the %if above
+ shufps %6, %6, %6, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
+ shufps %8, %8, %8, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
+ shufps %7, %7, %7, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
+ shufps %9, %9, %9, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
+%endif
+
+%if cpuflag(fma3) ; 11 - 5 = 6 instructions saved through FMA!
+ fmaddsubps %6, %6, %12, %14 ; w[0..8] even
+ fmaddsubps %8, %8, %10, %16 ; w[0..8] odd
+ fmsubaddps %7, %7, %12, %15 ; j[0..8] even
+ fmsubaddps %9, %9, %10, %17 ; j[0..8] odd
+ movaps %13, [mask_pmpmpmpm] ; "subaddps? pfft, who needs that!"
+%else
+ mulps %6, %6, %12 ; m2,3[01]imre * cos0246
+ mulps %8, %8, %10 ; m2,3[01]imre * cos0246
+ movaps %13, [mask_pmpmpmpm] ; "subaddps? pfft, who needs that!"
+ mulps %7, %7, %12 ; m2,3[23]reim * cos0246
+ mulps %9, %9, %10 ; m2,3[23]reim * cos0246
+ addsubps %6, %6, %14 ; w[0..8]
+ addsubps %8, %8, %16 ; w[0..8]
+ xorps %15, %15, %13 ; +-m2,3[23]imre * wim7531
+ xorps %17, %17, %13 ; +-m2,3[23]imre * wim7531
+ addps %7, %7, %15 ; j[0..8]
+ addps %9, %9, %17 ; j[0..8]
+%endif
+
+ addps %14, %6, %7 ; t10235476 even
+ addps %16, %8, %9 ; t10235476 odd
+ subps %15, %6, %7 ; +-r[0..7] even
+ subps %17, %8, %9 ; +-r[0..7] odd
+
+ shufps %14, %14, %14, q2301 ; t[0..7] even
+ shufps %16, %16, %16, q2301 ; t[0..7] odd
+ xorps %15, %15, %13 ; r[0..7] even
+ xorps %17, %17, %13 ; r[0..7] odd
+
+ subps %6, %2, %14 ; m2,3[01] even
+ subps %8, %4, %16 ; m2,3[01] odd
+ subps %7, %3, %15 ; m2,3[23] even
+ subps %9, %5, %17 ; m2,3[23] odd
+
+ addps %2, %2, %14 ; m0 even
+ addps %4, %4, %16 ; m0 odd
+ addps %3, %3, %15 ; m1 even
+ addps %5, %5, %17 ; m1 odd
+%endmacro
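+; Reference recombination for the macro above (a sketch, forward direction;
+; see `doc/transforms.md` for the exact layout). With E the len/2 transform
+; of the even inputs, O1/O2 the len/4 transforms of inputs 4n+1 and 4n+3,
+; and w = exp(-2*pi*I/len), for each k in [0, len/4):
+;     t = w^k * O1[k] + w^(3k) * O2[k]
+;     u = w^k * O1[k] - w^(3k) * O2[k]
+;     out[k]           = E[k]         + t
+;     out[k +   len/2] = E[k]         - t
+;     out[k +   len/4] = E[k + len/4] - I*u
+;     out[k + 3*len/4] = E[k + len/4] + I*u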
+
+; Same as above, but does one parity at a time and takes 3 temporary
+; registers; if the twiddles aren't needed afterwards, the registers
+; holding them can be reused as temporaries.
+%macro SPLIT_RADIX_COMBINE_HALF 10
+%if %1
+ shufps %8, %6, %6, q2200 ; cos00224466
+ shufps %9, %7, %7, q1133 ; wim77553311
+%else
+ shufps %8, %6, %6, q3311 ; cos11335577
+ shufps %9, %7, %7, q0022 ; wim66442200
+%endif
+
+ mulps %10, %4, %9 ; m2,3[01]reim * wim7531 even
+ mulps %9, %9, %5 ; m2,3[23]reim * wim7531 even
+
+ shufps %4, %4, %4, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
+ shufps %5, %5, %5, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
+
+%if cpuflag(fma3)
+ fmaddsubps %4, %4, %8, %10 ; w[0..8] even
+ fmsubaddps %5, %5, %8, %9 ; j[0..8] even
+ movaps %10, [mask_pmpmpmpm]
+%else
+ mulps %4, %4, %8 ; m2,3[01]imre * cos0246
+ mulps %5, %5, %8 ; m2,3[23]reim * cos0246
+ addsubps %4, %4, %10 ; w[0..8]
+ movaps %10, [mask_pmpmpmpm]
+ xorps %9, %9, %10 ; +-m2,3[23]imre * wim7531
+ addps %5, %5, %9 ; j[0..8]
+%endif
+
+ addps %8, %4, %5 ; t10235476
+ subps %9, %4, %5 ; +-r[0..7]
+
+ shufps %8, %8, %8, q2301 ; t[0..7]
+ xorps %9, %9, %10 ; r[0..7]
+
+ subps %4, %2, %8 ; m2,3[01]
+ subps %5, %3, %9 ; m2,3[23]
+
+ addps %2, %2, %8 ; m0
+ addps %3, %3, %9 ; m1
+%endmacro
+
+; Same as above, tries REALLY hard to use 2 temporary registers.
+%macro SPLIT_RADIX_COMBINE_LITE 9
+%if %1
+ shufps %8, %6, %6, q2200 ; cos00224466
+ shufps %9, %7, %7, q1133 ; wim77553311
+%else
+ shufps %8, %6, %6, q3311 ; cos11335577
+ shufps %9, %7, %7, q0022 ; wim66442200
+%endif
+
+ mulps %9, %9, %4 ; m2,3[01]reim * wim7531 even
+ shufps %4, %4, %4, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
+
+%if cpuflag(fma3)
+ fmaddsubps %4, %4, %8, %9 ; w[0..8] even
+%else
+ mulps %4, %4, %8 ; m2,3[01]imre * cos0246
+ addsubps %4, %4, %9 ; w[0..8]
+%endif
+
+%if %1
+ shufps %9, %7, %7, q1133 ; wim77553311
+%else
+ shufps %9, %7, %7, q0022 ; wim66442200
+%endif
+
+ mulps %9, %9, %5 ; m2,3[23]reim * wim7531 even
+ shufps %5, %5, %5, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
+%if cpuflag(fma3)
+ fmsubaddps %5, %5, %8, %9 ; j[0..8] even
+%else
+ mulps %5, %5, %8 ; m2,3[23]reim * cos0246
+ xorps %9, %9, [mask_pmpmpmpm] ; +-m2,3[23]imre * wim7531
+ addps %5, %5, %9 ; j[0..8]
+%endif
+
+ addps %8, %4, %5 ; t10235476
+ subps %9, %4, %5 ; +-r[0..7]
+
+ shufps %8, %8, %8, q2301 ; t[0..7]
+ xorps %9, %9, [mask_pmpmpmpm] ; r[0..7]
+
+ subps %4, %2, %8 ; m2,3[01]
+ subps %5, %3, %9 ; m2,3[23]
+
+ addps %2, %2, %8 ; m0
+ addps %3, %3, %9 ; m1
+%endmacro
+
+%macro SPLIT_RADIX_COMBINE_64 0
+ SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
+
+ movaps [outq + 0*mmsize], m0
+ movaps [outq + 4*mmsize], m1
+ movaps [outq + 8*mmsize], tx1_e0
+ movaps [outq + 12*mmsize], tx2_e0
+
+ SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, m0
+
+ movaps [outq + 2*mmsize], m2
+ movaps [outq + 6*mmsize], m3
+ movaps [outq + 10*mmsize], tx1_o0
+ movaps [outq + 14*mmsize], tx2_o0
+
+ movaps tw_e, [tab_64_float + mmsize]
+ vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
+
+ movaps m0, [outq + 1*mmsize]
+ movaps m1, [outq + 3*mmsize]
+ movaps m2, [outq + 5*mmsize]
+ movaps m3, [outq + 7*mmsize]
+
+ SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
+ tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers
+
+ movaps [outq + 1*mmsize], m0
+ movaps [outq + 3*mmsize], m1
+ movaps [outq + 5*mmsize], m2
+ movaps [outq + 7*mmsize], m3
+
+ movaps [outq + 9*mmsize], tx1_e1
+ movaps [outq + 11*mmsize], tx1_o1
+ movaps [outq + 13*mmsize], tx2_e1
+ movaps [outq + 15*mmsize], tx2_o1
+%endmacro
+
+; Perform a single even/odd split radix combination with loads and stores
+; The _4 indicates this is a quarter of the iterations required to complete a full
+; combine loop
+; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6
+%macro SPLIT_RADIX_LOAD_COMBINE_4 8
+ movaps m8, [rtabq + (%5)*mmsize + %7]
+ vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23
+
+ movaps m0, [outq + (0 + %4)*mmsize + %6]
+ movaps m2, [outq + (2 + %4)*mmsize + %6]
+ movaps m1, [outq + %1 + (0 + %4)*mmsize + %6]
+ movaps m3, [outq + %1 + (2 + %4)*mmsize + %6]
+
+ movaps m4, [outq + %2 + (0 + %4)*mmsize + %6]
+ movaps m6, [outq + %2 + (2 + %4)*mmsize + %6]
+ movaps m5, [outq + %3 + (0 + %4)*mmsize + %6]
+ movaps m7, [outq + %3 + (2 + %4)*mmsize + %6]
+
+ SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
+ m4, m5, m6, m7, \
+ m8, m9, \
+ m10, m11, m12, m13, m14, m15
+
+ movaps [outq + (0 + %4)*mmsize + %6], m0
+ movaps [outq + (2 + %4)*mmsize + %6], m2
+ movaps [outq + %1 + (0 + %4)*mmsize + %6], m1
+ movaps [outq + %1 + (2 + %4)*mmsize + %6], m3
+
+ movaps [outq + %2 + (0 + %4)*mmsize + %6], m4
+ movaps [outq + %2 + (2 + %4)*mmsize + %6], m6
+ movaps [outq + %3 + (0 + %4)*mmsize + %6], m5
+ movaps [outq + %3 + (2 + %4)*mmsize + %6], m7
+%endmacro
+
+%macro SPLIT_RADIX_LOAD_COMBINE_FULL 2-5
+%if %0 > 2
+%define offset_c %3
+%else
+%define offset_c 0
+%endif
+%if %0 > 3
+%define offset_r %4
+%else
+%define offset_r 0
+%endif
+%if %0 > 4
+%define offset_i %5
+%else
+%define offset_i 0
+%endif
+
+ SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i
+ SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i
+ SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 4, 2, offset_c, offset_r, offset_i
+ SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 5, 3, offset_c, offset_r, offset_i
+%endmacro
+
+; Perform a single even/odd split radix combination with loads, deinterleaves and
+; stores. The _2 indicates this is a half of the iterations required to complete
+; a full combine+deinterleave loop
+; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6
+%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6
+ movaps m8, [rtabq + (0 + %2)*mmsize]
+ vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23
+
+ movaps m0, [outq + (0 + 0 + %1)*mmsize + %6]
+ movaps m2, [outq + (2 + 0 + %1)*mmsize + %6]
+ movaps m1, [outq + %3 + (0 + 0 + %1)*mmsize + %6]
+ movaps m3, [outq + %3 + (2 + 0 + %1)*mmsize + %6]
+
+ movaps m4, [outq + %4 + (0 + 0 + %1)*mmsize + %6]
+ movaps m6, [outq + %4 + (2 + 0 + %1)*mmsize + %6]
+ movaps m5, [outq + %5 + (0 + 0 + %1)*mmsize + %6]
+ movaps m7, [outq + %5 + (2 + 0 + %1)*mmsize + %6]
+
+ SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
+ m4, m5, m6, m7, \
+ m8, m9, \
+ m10, m11, m12, m13, m14, m15
+
+ unpckhpd m10, m0, m2
+ unpckhpd m11, m1, m3
+ unpckhpd m12, m4, m6
+ unpckhpd m13, m5, m7
+ unpcklpd m0, m0, m2
+ unpcklpd m1, m1, m3
+ unpcklpd m4, m4, m6
+ unpcklpd m5, m5, m7
+
+ vextractf128 [outq + (0 + 0 + %1)*mmsize + %6 + 0], m0, 0
+ vextractf128 [outq + (0 + 0 + %1)*mmsize + %6 + 16], m10, 0
+ vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 0], m1, 0
+ vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 16], m11, 0
+
+ vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 0], m4, 0
+ vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 16], m12, 0
+ vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 0], m5, 0
+ vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 16], m13, 0
+
+ vperm2f128 m10, m10, m0, 0x13
+ vperm2f128 m11, m11, m1, 0x13
+ vperm2f128 m12, m12, m4, 0x13
+ vperm2f128 m13, m13, m5, 0x13
+
+ movaps m8, [rtabq + (1 + %2)*mmsize]
+ vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23
+
+ movaps m0, [outq + (0 + 1 + %1)*mmsize + %6]
+ movaps m2, [outq + (2 + 1 + %1)*mmsize + %6]
+ movaps m1, [outq + %3 + (0 + 1 + %1)*mmsize + %6]
+ movaps m3, [outq + %3 + (2 + 1 + %1)*mmsize + %6]
+
+ movaps [outq + (0 + 1 + %1)*mmsize + %6], m10 ; m0 conflict
+ movaps [outq + %3 + (0 + 1 + %1)*mmsize + %6], m11 ; m1 conflict
+
+ movaps m4, [outq + %4 + (0 + 1 + %1)*mmsize + %6]
+ movaps m6, [outq + %4 + (2 + 1 + %1)*mmsize + %6]
+ movaps m5, [outq + %5 + (0 + 1 + %1)*mmsize + %6]
+ movaps m7, [outq + %5 + (2 + 1 + %1)*mmsize + %6]
+
+ movaps [outq + %4 + (0 + 1 + %1)*mmsize + %6], m12 ; m4 conflict
+ movaps [outq + %5 + (0 + 1 + %1)*mmsize + %6], m13 ; m5 conflict
+
+ SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
+ m4, m5, m6, m7, \
+ m8, m9, \
+ m10, m11, m12, m13, m14, m15 ; temporary registers
+
+ unpcklpd m8, m0, m2
+ unpcklpd m9, m1, m3
+ unpcklpd m10, m4, m6
+ unpcklpd m11, m5, m7
+ unpckhpd m0, m0, m2
+ unpckhpd m1, m1, m3
+ unpckhpd m4, m4, m6
+ unpckhpd m5, m5, m7
+
+ vextractf128 [outq + (2 + 0 + %1)*mmsize + %6 + 0], m8, 0
+ vextractf128 [outq + (2 + 0 + %1)*mmsize + %6 + 16], m0, 0
+ vextractf128 [outq + (2 + 1 + %1)*mmsize + %6 + 0], m8, 1
+ vextractf128 [outq + (2 + 1 + %1)*mmsize + %6 + 16], m0, 1
+
+ vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 0], m9, 0
+ vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 16], m1, 0
+ vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 0], m9, 1
+ vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 16], m1, 1
+
+ vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 0], m10, 0
+ vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 16], m4, 0
+ vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 0], m10, 1
+ vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 16], m4, 1
+
+ vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 0], m11, 0
+ vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 16], m5, 0
+ vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 0], m11, 1
+ vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 16], m5, 1
+%endmacro
+
+%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL 2-3
+%if %0 > 2
+%define offset %3
+%else
+%define offset 0
+%endif
+ SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset
+ SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset
+%endmacro
+
+INIT_XMM sse3
+cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride
+ movaps m0, [inq]
+ FFT2 m0, m1
+ movaps [outq], m0
+ ret
+
+cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
+ movaps m0, [inq]
+ FFT2 m0, m1
+ movaps [outq], m0
+ RET
+
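+; Parameters (per the instantiations below):
+; %1 - name suffix (fwd/inv)
+; %2 - if 1, permute the inputs for the inverse transform
+; %3 - if 1, emit the intra-asm `_asm` version (no prologue, plain ret)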
+%macro FFT4_FN 3
+INIT_XMM sse2
+%if %3
+cglobal fft4_ %+ %1 %+ _asm_float, 0, 0, 0, ctx, out, in, stride
+%else
+cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
+%endif
+ movaps m0, [inq + 0*mmsize]
+ movaps m1, [inq + 1*mmsize]
+
+%if %2
+ shufps m2, m1, m0, q3210
+ shufps m0, m0, m1, q3210
+ movaps m1, m2
+%endif
+
+ FFT4 m0, m1, m2
+
+ unpcklpd m2, m0, m1
+ unpckhpd m0, m0, m1
+
+ movaps [outq + 0*mmsize], m2
+ movaps [outq + 1*mmsize], m0
+
+%if %3
+ ret
+%else
+ RET
+%endif
+%endmacro
+
+FFT4_FN fwd, 0, 0
+FFT4_FN fwd, 0, 1
+FFT4_FN inv, 1, 0
+FFT4_FN inv, 1, 1
+
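+; %1 - if 1, emit the sequential-load intra-asm `_asm` version and its
+;      `fft8_ns_float` wrapper; if 0, the LUT-loading `fft8_float` version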
+%macro FFT8_SSE_FN 1
+INIT_XMM sse3
+%if %1
+cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
+ movaps m0, [inq + 0*mmsize]
+ movaps m1, [inq + 1*mmsize]
+ movaps m2, [inq + 2*mmsize]
+ movaps m3, [inq + 3*mmsize]
+%else
+cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
+ mov ctxq, [ctxq + AVTXContext.map]
+ LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
+ LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
+ LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
+ LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
+%endif
+
+ FFT8 m0, m1, m2, m3, m4, m5
+
+ unpcklpd m4, m0, m3
+ unpcklpd m5, m1, m2
+ unpckhpd m0, m0, m3
+ unpckhpd m1, m1, m2
+
+ movups [outq + 0*mmsize], m4
+ movups [outq + 1*mmsize], m0
+ movups [outq + 2*mmsize], m5
+ movups [outq + 3*mmsize], m1
+
+%if %1
+ ret
+%else
+ RET
+%endif
+
+%if %1
+cglobal fft8_ns_float, 4, 5, 6, ctx, out, in, stride, tmp
+ call mangle(ff_tx_fft8_asm_float_sse3)
+ RET
+%endif
+%endmacro
+
+FFT8_SSE_FN 0
+FFT8_SSE_FN 1
+
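+; %1 - same meaning as in FFT8_SSE_FN above (asm/ns variants vs. LUT variant)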
+%macro FFT8_AVX_FN 1
+INIT_YMM avx
+%if %1
+cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
+ movaps m0, [inq + 0*mmsize]
+ movaps m1, [inq + 1*mmsize]
+%else
+cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
+ mov ctxq, [ctxq + AVTXContext.map]
+ LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
+ LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
+%endif
+
+ FFT8_AVX m0, m1, m2, m3
+
+ unpcklpd m2, m0, m1
+ unpckhpd m0, m0, m1
+
+ ; Around 2% faster than 2x vperm2f128 + 2x movapd
+ vextractf128 [outq + 16*0], m2, 0
+ vextractf128 [outq + 16*1], m0, 0
+ vextractf128 [outq + 16*2], m2, 1
+ vextractf128 [outq + 16*3], m0, 1
+
+%if %1
+ ret
+%else
+ RET
+%endif
+
+%if %1
+cglobal fft8_ns_float, 4, 5, 4, ctx, out, in, stride, tmp
+ call mangle(ff_tx_fft8_asm_float_avx)
+ RET
+%endif
+%endmacro
+
+FFT8_AVX_FN 0
+FFT8_AVX_FN 1
+
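+; %1 - instruction set to use (avx/fma3)
+; %2 - if 1, emit the sequential-load `_asm` version and its `_ns` wrapper;
+;      if 0, the LUT-loading `fft16_float` version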
+%macro FFT16_FN 2
+INIT_YMM %1
+%if %2
+cglobal fft16_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
+ movaps m0, [inq + 0*mmsize]
+ movaps m1, [inq + 1*mmsize]
+ movaps m2, [inq + 2*mmsize]
+ movaps m3, [inq + 3*mmsize]
+%else
+cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
+ mov ctxq, [ctxq + AVTXContext.map]
+ LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
+ LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
+ LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6
+ LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7
+%endif
+
+ FFT16 m0, m1, m2, m3, m4, m5, m6, m7
+
+ unpcklpd m5, m1, m3
+ unpcklpd m4, m0, m2
+ unpckhpd m1, m1, m3
+ unpckhpd m0, m0, m2
+
+ vextractf128 [outq + 16*0], m4, 0
+ vextractf128 [outq + 16*1], m0, 0
+ vextractf128 [outq + 16*2], m4, 1
+ vextractf128 [outq + 16*3], m0, 1
+ vextractf128 [outq + 16*4], m5, 0
+ vextractf128 [outq + 16*5], m1, 0
+ vextractf128 [outq + 16*6], m5, 1
+ vextractf128 [outq + 16*7], m1, 1
+
+%if %2
+ ret
+%else
+ RET
+%endif
+
+%if %2
+cglobal fft16_ns_float, 4, 5, 8, ctx, out, in, stride, tmp
+ call mangle(ff_tx_fft16_asm_float_ %+ %1)
+ RET
+%endif
+%endmacro
+
+FFT16_FN avx, 0
+FFT16_FN avx, 1
+FFT16_FN fma3, 0
+FFT16_FN fma3, 1
+
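+; %1 - instruction set to use (avx/fma3)
+; %2 - if 1, emit the sequential-load `_asm` version and its `_ns` wrapper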
+%macro FFT32_FN 2
+INIT_YMM %1
+%if %2
+cglobal fft32_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
+ movaps m4, [inq + 4*mmsize]
+ movaps m5, [inq + 5*mmsize]
+ movaps m6, [inq + 6*mmsize]
+ movaps m7, [inq + 7*mmsize]
+%else
+cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
+ mov ctxq, [ctxq + AVTXContext.map]
+ LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m12
+ LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m9, m13
+ LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m10, m14
+ LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m11, m15
+%endif
+
+ FFT8 m4, m5, m6, m7, m8, m9
+
+%if %2
+ movaps m0, [inq + 0*mmsize]
+ movaps m1, [inq + 1*mmsize]
+ movaps m2, [inq + 2*mmsize]
+ movaps m3, [inq + 3*mmsize]
+%else
+ LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8, m12
+ LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m9, m13
+ LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m10, m14
+ LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m11, m15
+%endif
+
+ movaps m8, [tab_32_float]
+ vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23
+
+ FFT16 m0, m1, m2, m3, m10, m11, m12, m13
+
+ SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
+ m10, m11, m12, m13, m14, m15 ; temporary registers
+
+ unpcklpd m9, m1, m3
+ unpcklpd m10, m5, m7
+ unpcklpd m8, m0, m2
+ unpcklpd m11, m4, m6
+ unpckhpd m1, m1, m3
+ unpckhpd m5, m5, m7
+ unpckhpd m0, m0, m2
+ unpckhpd m4, m4, m6
+
+ vextractf128 [outq + 16* 0], m8, 0
+ vextractf128 [outq + 16* 1], m0, 0
+ vextractf128 [outq + 16* 2], m8, 1
+ vextractf128 [outq + 16* 3], m0, 1
+ vextractf128 [outq + 16* 4], m9, 0
+ vextractf128 [outq + 16* 5], m1, 0
+ vextractf128 [outq + 16* 6], m9, 1
+ vextractf128 [outq + 16* 7], m1, 1
+
+ vextractf128 [outq + 16* 8], m11, 0
+ vextractf128 [outq + 16* 9], m4, 0
+ vextractf128 [outq + 16*10], m11, 1
+ vextractf128 [outq + 16*11], m4, 1
+ vextractf128 [outq + 16*12], m10, 0
+ vextractf128 [outq + 16*13], m5, 0
+ vextractf128 [outq + 16*14], m10, 1
+ vextractf128 [outq + 16*15], m5, 1
+
+%if %2
+ ret
+%else
+ RET
+%endif
+
+%if %2
+cglobal fft32_ns_float, 4, 5, 16, ctx, out, in, stride, tmp
+ call mangle(ff_tx_fft32_asm_float_ %+ %1)
+ RET
+%endif
+%endmacro
+
+%if ARCH_X86_64
+FFT32_FN avx, 0
+FFT32_FN avx, 1
+FFT32_FN fma3, 0
+FFT32_FN fma3, 1
+%endif
+
+%macro FFT_SPLIT_RADIX_DEF 1-2
+ALIGN 16
+.%1 %+ pt:
+ PUSH lenq
+ mov lenq, (%1/4)
+
+ add outq, (%1*4) - (%1/1)
+ call .32pt
+
+ add outq, (%1*2) - (%1/2) ; the synth loops also increment outq
+ call .32pt
+
+ POP lenq
+ sub outq, (%1*4) + (%1*2) + (%1/2)
+
+ lea rtabq, [tab_ %+ %1 %+ _float]
+ lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7]
+
+%if %0 > 1
+ cmp tgtq, %1
+ je .deinterleave
+
+ mov tmpq, %1
+
+.synth_ %+ %1:
+ SPLIT_RADIX_LOAD_COMBINE_FULL 2*%1, 6*%1, 0, 0, 0
+ add outq, 8*mmsize
+ add rtabq, 4*mmsize
+ sub itabq, 4*mmsize
+ sub tmpq, 4*mmsize
+ jg .synth_ %+ %1
+
+ cmp lenq, %1
+ jg %2 ; can't do math here, nasm doesn't get it
+ ret
+%endif
+%endmacro
+
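+; %1 - instruction set to use
+; %2 - if 1, emit the intra-asm `_asm` version and the `fft_sr_ns_float`
+;      wrapper; if 0, the LUT-loading `fft_sr_float` entry point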
+%macro FFT_SPLIT_RADIX_FN 2
+INIT_YMM %1
+%if %2
+cglobal fft_sr_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, itab, rtab, tgt, tmp
+%else
+cglobal fft_sr_float, 4, 10, 16, 272, ctx, out, in, stride, len, lut, itab, rtab, tgt, tmp
+ movsxd lenq, dword [ctxq + AVTXContext.len]
+ mov lutq, [ctxq + AVTXContext.map]
+%endif
+ mov tgtq, lenq
+
+; Bottom-most/32-point transform ===============================================
+ALIGN 16
+.32pt:
+%if %2
+ movaps m4, [inq + 4*mmsize]
+ movaps m5, [inq + 5*mmsize]
+ movaps m6, [inq + 6*mmsize]
+ movaps m7, [inq + 7*mmsize]
+%else
+ LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8, m12
+ LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m9, m13
+ LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m10, m14
+ LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m11, m15
+%endif
+
+ FFT8 m4, m5, m6, m7, m8, m9
+
+%if %2
+ movaps m0, [inq + 0*mmsize]
+ movaps m1, [inq + 1*mmsize]
+ movaps m2, [inq + 2*mmsize]
+ movaps m3, [inq + 3*mmsize]
+%else
+ LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8, m12
+ LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m9, m13
+ LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m10, m14
+ LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m11, m15
+%endif
+
+ movaps m8, [tab_32_float]
+ vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23
+
+ FFT16 m0, m1, m2, m3, m10, m11, m12, m13
+
+ SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
+ m10, m11, m12, m13, m14, m15 ; temporary registers
+
+ movaps [outq + 1*mmsize], m1
+ movaps [outq + 3*mmsize], m3
+ movaps [outq + 5*mmsize], m5
+ movaps [outq + 7*mmsize], m7
+
+%if %2
+ add inq, 8*mmsize
+%else
+ add lutq, (mmsize/2)*8
+%endif
+ cmp lenq, 32
+ jg .64pt
+
+ movaps [outq + 0*mmsize], m0
+ movaps [outq + 2*mmsize], m2
+ movaps [outq + 4*mmsize], m4
+ movaps [outq + 6*mmsize], m6
+
+ ret
+
+; 64-point transform ===========================================================
+ALIGN 16
+.64pt:
+; Helper defines; these make it easier to track what's happening
+%define tx1_e0 m4
+%define tx1_e1 m5
+%define tx1_o0 m6
+%define tx1_o1 m7
+%define tx2_e0 m8
+%define tx2_e1 m9
+%define tx2_o0 m10
+%define tx2_o1 m11
+%define tw_e m12
+%define tw_o m13
+%define tmp1 m14
+%define tmp2 m15
+
+ SWAP m4, m1
+ SWAP m6, m3
+
+%if %2
+ movaps tx1_e0, [inq + 0*mmsize]
+ movaps tx1_e1, [inq + 1*mmsize]
+ movaps tx1_o0, [inq + 2*mmsize]
+ movaps tx1_o1, [inq + 3*mmsize]
+%else
+ LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tmp1
+ LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tw_o, tmp2
+ LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tmp1
+ LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tw_o, tmp2
+%endif
+
+ FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1
+
+%if %2
+ movaps tx2_e0, [inq + 4*mmsize]
+ movaps tx2_e1, [inq + 5*mmsize]
+ movaps tx2_o0, [inq + 6*mmsize]
+ movaps tx2_o1, [inq + 7*mmsize]
+%else
+ LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tw_e, tmp1
+ LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_o, tmp2
+ LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tw_e, tmp1
+ LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_o, tmp2
+%endif
+
+ FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o
+
+ movaps tw_e, [tab_64_float]
+ vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23
+
+%if %2
+ add inq, 8*mmsize
+%else
+ add lutq, (mmsize/2)*8
+%endif
+ cmp tgtq, 64
+ je .64pt_deint
+
+ SPLIT_RADIX_COMBINE_64
+
+ cmp lenq, 64
+ jg .128pt
+ ret
+
+; 128-point transform ==========================================================
+ALIGN 16
+.128pt:
+ PUSH lenq
+ mov lenq, 32
+
+ add outq, 16*mmsize
+ call .32pt
+
+ add outq, 8*mmsize
+ call .32pt
+
+ POP lenq
+ sub outq, 24*mmsize
+
+ lea rtabq, [tab_128_float]
+ lea itabq, [tab_128_float + 128 - 4*7]
+
+ cmp tgtq, 128
+ je .deinterleave
+
+ SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128
+
+ cmp lenq, 128
+ jg .256pt
+ ret
+
+; 256-point transform ==========================================================
+ALIGN 16
+.256pt:
+ PUSH lenq
+ mov lenq, 64
+
+ add outq, 32*mmsize
+ call .32pt
+
+ add outq, 16*mmsize
+ call .32pt
+
+ POP lenq
+ sub outq, 48*mmsize
+
+ lea rtabq, [tab_256_float]
+ lea itabq, [tab_256_float + 256 - 4*7]
+
+ cmp tgtq, 256
+ je .deinterleave
+
+ SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256
+ SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256, 8*mmsize, 4*mmsize, -4*mmsize
+
+ cmp lenq, 256
+ jg .512pt
+ ret
+
+; 512-point transform ==========================================================
+ALIGN 16
+.512pt:
+ PUSH lenq
+ mov lenq, 128
+
+ add outq, 64*mmsize
+ call .32pt
+
+ add outq, 32*mmsize
+ call .32pt
+
+ POP lenq
+ sub outq, 96*mmsize
+
+ lea rtabq, [tab_512_float]
+ lea itabq, [tab_512_float + 512 - 4*7]
+
+ cmp tgtq, 512
+ je .deinterleave
+
+ mov tmpq, 4
+
+.synth_512:
+ SPLIT_RADIX_LOAD_COMBINE_FULL 2*512, 6*512
+ add outq, 8*mmsize
+ add rtabq, 4*mmsize
+ sub itabq, 4*mmsize
+ sub tmpq, 1
+ jg .synth_512
+
+ cmp lenq, 512
+ jg .1024pt
+ ret
+
+; 1024-point transform ==========================================================
+ALIGN 16
+.1024pt:
+ PUSH lenq
+ mov lenq, 256
+
+ add outq, 96*mmsize
+ call .32pt
+
+ add outq, 64*mmsize
+ call .32pt
+
+ POP lenq
+ sub outq, 192*mmsize
+
+ lea rtabq, [tab_1024_float]
+ lea itabq, [tab_1024_float + 1024 - 4*7]
+
+ cmp tgtq, 1024
+ je .deinterleave
+
+ mov tmpq, 8
+
+.synth_1024:
+ SPLIT_RADIX_LOAD_COMBINE_FULL 2*1024, 6*1024
+ add outq, 8*mmsize
+ add rtabq, 4*mmsize
+ sub itabq, 4*mmsize
+ sub tmpq, 1
+ jg .synth_1024
+
+ cmp lenq, 1024
+ jg .2048pt
+ ret
+
+; 2048 to 131072-point transforms ==============================================
+FFT_SPLIT_RADIX_DEF 2048, .4096pt
+FFT_SPLIT_RADIX_DEF 4096, .8192pt
+FFT_SPLIT_RADIX_DEF 8192, .16384pt
+FFT_SPLIT_RADIX_DEF 16384, .32768pt
+FFT_SPLIT_RADIX_DEF 32768, .65536pt
+FFT_SPLIT_RADIX_DEF 65536, .131072pt
+FFT_SPLIT_RADIX_DEF 131072
+
+;===============================================================================
+; Final synthesis + deinterleaving code
+;===============================================================================
+.deinterleave:
+%if %2
+ PUSH strideq
+%endif
+ mov tgtq, lenq
+ imul tmpq, lenq, 2
+ lea strideq, [4*lenq + tmpq]
+
+.synth_deinterleave:
+ SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL tmpq, strideq
+ add outq, 8*mmsize
+ add rtabq, 4*mmsize
+ sub itabq, 4*mmsize
+ sub tgtq, 4*mmsize
+ jg .synth_deinterleave
+
+%if %2
+ POP strideq
+ sub outq, tmpq
+ neg tmpq
+ lea inq, [inq + tmpq*4]
+ ret
+%else
+ RET
+%endif
+
+; 64-point deinterleave which only has to load 4 registers =====================
+.64pt_deint:
+ SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
+ SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, tw_e
+
+ unpcklpd tmp1, m0, m2
+ unpcklpd tmp2, m1, m3
+ unpcklpd tw_o, tx1_e0, tx1_o0
+ unpcklpd tw_e, tx2_e0, tx2_o0
+ unpckhpd m0, m0, m2
+ unpckhpd m1, m1, m3
+ unpckhpd tx1_e0, tx1_e0, tx1_o0
+ unpckhpd tx2_e0, tx2_e0, tx2_o0
+
+ vextractf128 [outq + 0*mmsize + 0], tmp1, 0
+ vextractf128 [outq + 0*mmsize + 16], m0, 0
+ vextractf128 [outq + 4*mmsize + 0], tmp2, 0
+ vextractf128 [outq + 4*mmsize + 16], m1, 0
+
+ vextractf128 [outq + 8*mmsize + 0], tw_o, 0
+ vextractf128 [outq + 8*mmsize + 16], tx1_e0, 0
+ vextractf128 [outq + 9*mmsize + 0], tw_o, 1
+ vextractf128 [outq + 9*mmsize + 16], tx1_e0, 1
+
+ vperm2f128 tmp1, tmp1, m0, 0x31
+ vperm2f128 tmp2, tmp2, m1, 0x31
+
+ vextractf128 [outq + 12*mmsize + 0], tw_e, 0
+ vextractf128 [outq + 12*mmsize + 16], tx2_e0, 0
+ vextractf128 [outq + 13*mmsize + 0], tw_e, 1
+ vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1
+
+ movaps tw_e, [tab_64_float + mmsize]
+ vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
+
+ movaps m0, [outq + 1*mmsize]
+ movaps m1, [outq + 3*mmsize]
+ movaps m2, [outq + 5*mmsize]
+ movaps m3, [outq + 7*mmsize]
+
+ movaps [outq + 1*mmsize], tmp1
+ movaps [outq + 5*mmsize], tmp2
+
+ SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
+ tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers
+
+ unpcklpd tmp1, m0, m1
+ unpcklpd tmp2, m2, m3
+ unpcklpd tw_e, tx1_e1, tx1_o1
+ unpcklpd tw_o, tx2_e1, tx2_o1
+ unpckhpd m0, m0, m1
+ unpckhpd m2, m2, m3
+ unpckhpd tx1_e1, tx1_e1, tx1_o1
+ unpckhpd tx2_e1, tx2_e1, tx2_o1
+
+ vextractf128 [outq + 2*mmsize + 0], tmp1, 0
+ vextractf128 [outq + 2*mmsize + 16], m0, 0
+ vextractf128 [outq + 3*mmsize + 0], tmp1, 1
+ vextractf128 [outq + 3*mmsize + 16], m0, 1
+
+ vextractf128 [outq + 6*mmsize + 0], tmp2, 0
+ vextractf128 [outq + 6*mmsize + 16], m2, 0
+ vextractf128 [outq + 7*mmsize + 0], tmp2, 1
+ vextractf128 [outq + 7*mmsize + 16], m2, 1
+
+ vextractf128 [outq + 10*mmsize + 0], tw_e, 0
+ vextractf128 [outq + 10*mmsize + 16], tx1_e1, 0
+ vextractf128 [outq + 11*mmsize + 0], tw_e, 1
+ vextractf128 [outq + 11*mmsize + 16], tx1_e1, 1
+
+ vextractf128 [outq + 14*mmsize + 0], tw_o, 0
+ vextractf128 [outq + 14*mmsize + 16], tx2_e1, 0
+ vextractf128 [outq + 15*mmsize + 0], tw_o, 1
+ vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1
+
+%if %2
+ sub inq, 16*mmsize
+ ret
+%else
+ RET
+%endif
+
+%if %2
+cglobal fft_sr_ns_float, 4, 10, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab, tgt, off
+ movsxd lenq, dword [ctxq + AVTXContext.len]
+ mov lutq, [ctxq + AVTXContext.map]
+
+ call mangle(ff_tx_fft_sr_asm_float_ %+ %1)
+ RET
+%endif
+%endmacro
+
+%if ARCH_X86_64
+FFT_SPLIT_RADIX_FN avx, 0
+FFT_SPLIT_RADIX_FN avx, 1
+FFT_SPLIT_RADIX_FN fma3, 0
+FFT_SPLIT_RADIX_FN fma3, 1
+%if HAVE_AVX2_EXTERNAL
+FFT_SPLIT_RADIX_FN avx2, 0
+FFT_SPLIT_RADIX_FN avx2, 1
+%endif
+%endif
+
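+; %1 - if 1, load the (already permuted) input sequentially rather than
+;      through the LUT
+; %2 - function name suffix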
+%macro FFT15_FN 2
+INIT_YMM avx2
+cglobal fft15_ %+ %2, 4, 10, 16, ctx, out, in, stride, len, lut, tmp, tgt5, stride3, stride5
+ mov lutq, [ctxq + AVTXContext.map]
+
+ imul stride3q, strideq, 3
+ imul stride5q, strideq, 5
+
+ movaps m11, [mask_mmppmmmm] ; mmppmmmm
+ movaps m10, [tab_53_float] ; tab5
+ movaps xm9, [tab_53_float + 32] ; tab3
+ vpermpd m9, m9, q1110 ; tab[23232323]
+ movaps m8, [s15_perm]
+
+%if %1
+ movups xm0, [inq]
+ movddup xm5, [inq + 16]
+ movups m2, [inq + mmsize*0 + 24]
+ movups m3, [inq + mmsize*1 + 24]
+ movups m4, [inq + mmsize*2 + 24]
+%else
+ LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15
+ LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7
+ LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15
+ LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7
+ mov tmpd, [lutq + 8]
+ movddup xm5, [inq + tmpq*8]
+%endif
+
+ FFT15
+
+ lea tgt5q, [outq + stride5q]
+ lea tmpq, [outq + stride5q*2]
+
+ movhps [outq], xm14 ; out[0]
+ movhps [outq + stride5q*1], xm15 ; out[5]
+ movlps [outq + stride5q*2], xm15 ; out[10]
+
+ vextractf128 xm3, m0, 1
+ vextractf128 xm4, m1, 1
+ vextractf128 xm5, m2, 1
+
+ movlps [outq + strideq*1], xm1
+ movhps [outq + strideq*2], xm2
+ movlps [outq + stride3q*1], xm3
+ movhps [outq + strideq*4], xm4
+ movlps [outq + stride3q*2], xm0
+ movlps [outq + strideq*8], xm5
+ movhps [outq + stride3q*4], xm0
+ movhps [tgt5q + strideq*2], xm1
+ movhps [tgt5q + strideq*4], xm3
+ movlps [tmpq + strideq*1], xm2
+ movlps [tmpq + stride3q*1], xm4
+ movhps [tmpq + strideq*4], xm5
+
+ RET
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+FFT15_FN 0, float
+FFT15_FN 1, ns_float
+%endif
+
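+; Inverse MDCT, the usual iMDCT-via-half-length-FFT arrangement (a sketch):
+; pre-rotate the input by the exp twiddles (gathered through the map, or
+; sequentially when stride == 4), run the subcontext's FFT in place, then
+; post-rotate and mirror the halves.
+; %1 - instruction set to use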
+%macro IMDCT_FN 1
+INIT_YMM %1
+cglobal mdct_inv_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, exp, t1, t2, t3, \
+ t4, t5, btmp
+ movsxd lenq, dword [ctxq + AVTXContext.len]
+ mov expq, [ctxq + AVTXContext.exp]
+
+ lea t1d, [lend - 1]
+ imul t1d, strided
+
+ mov btmpq, ctxq ; backup original context
+ mov lutq, [ctxq + AVTXContext.map] ; load map
+
+ cmp strideq, 4
+ je .stride4
+
+ shl strideq, 1
+ movd xm4, strided
+ vpbroadcastd m4, xm4 ; stride splatted
+ movd xm5, t1d
+ vpbroadcastd m5, xm5 ; offset splatted
+
+ mov t2q, outq ; don't modify the original output
+ pcmpeqd m15, m15 ; set all bits to 1
+
+.stridex_pre:
+ pmulld m2, m4, [lutq] ; multiply by stride
+ movaps m0, m15
+ psubd m3, m5, m2 ; subtract from offset
+ movaps m1, m15
+ vgatherdps m6, [inq + m2], m0 ; im
+ vgatherdps m7, [inq + m3], m1 ; re
+
+ movaps m8, [expq + 0*mmsize] ; tab 1
+ movaps m9, [expq + 1*mmsize] ; tab 2
+
+ unpcklps m0, m7, m6 ; re, im, re, im
+ unpckhps m1, m7, m6 ; re, im, re, im
+
+ vperm2f128 m2, m1, m0, 0x02 ; output order
+ vperm2f128 m3, m1, m0, 0x13 ; output order
+
+ movshdup m10, m8 ; tab 1 imim
+ movshdup m11, m9 ; tab 2 imim
+ movsldup m12, m8 ; tab 1 rere
+ movsldup m13, m9 ; tab 2 rere
+
+ mulps m10, m2 ; 1 reim * imim
+ mulps m11, m3 ; 2 reim * imim
+
+ shufps m10, m10, m10, q2301
+ shufps m11, m11, m11, q2301
+
+ fmaddsubps m10, m12, m2, m10
+ fmaddsubps m11, m13, m3, m11
+
+ movups [t2q + 0*mmsize], m10
+ movups [t2q + 1*mmsize], m11
+
+ add expq, mmsize*2
+ add lutq, mmsize
+ add t2q, mmsize*2
+ sub lenq, mmsize/2
+ jg .stridex_pre
+ jmp .transform
+
+.stride4:
+ lea expq, [expq + lenq*4]
+ lea lutq, [lutq + lenq*2]
+ lea t1q, [inq + t1q]
+ lea t1q, [t1q + strideq - mmsize]
+ lea t2q, [lenq*2 - mmsize/2]
+
+.stride4_pre:
+ movups m4, [inq]
+ movups m3, [t1q]
+
+ movsldup m1, m4 ; im im, im im
+ movshdup m0, m3 ; re re, re re
+ movshdup m4, m4 ; re re, re re (2)
+ movsldup m3, m3 ; im im, im im (2)
+
+ movups m2, [expq] ; tab
+ movups m5, [expq + 2*t2q] ; tab (2)
+
+ vpermpd m0, m0, q0123 ; flip
+ shufps m7, m2, m2, q2301
+ vpermpd m4, m4, q0123 ; flip (2)
+ shufps m8, m5, m5, q2301
+
+ mulps m1, m7 ; im im * tab.reim
+ mulps m3, m8 ; im im * tab.reim (2)
+
+ fmaddsubps m0, m0, m2, m1
+ fmaddsubps m4, m4, m5, m3
+
+ vextractf128 xm3, m0, 1
+ vextractf128 xm6, m4, 1
+
+ ; scatter
+ movsxd strideq, dword [lutq + 0*4]
+ movsxd lenq, dword [lutq + 1*4]
+ movsxd t3q, dword [lutq + 2*4]
+ movsxd t4q, dword [lutq + 3*4]
+
+ movlps [outq + strideq*8], xm0
+ movhps [outq + lenq*8], xm0
+ movlps [outq + t3q*8], xm3
+ movhps [outq + t4q*8], xm3
+
+ movsxd strideq, dword [lutq + 0*4 + t2q]
+ movsxd lenq, dword [lutq + 1*4 + t2q]
+ movsxd t3q, dword [lutq + 2*4 + t2q]
+ movsxd t4q, dword [lutq + 3*4 + t2q]
+
+ movlps [outq + strideq*8], xm4
+ movhps [outq + lenq*8], xm4
+ movlps [outq + t3q*8], xm6
+ movhps [outq + t4q*8], xm6
+
+ add lutq, mmsize/2
+ add expq, mmsize
+ add inq, mmsize
+ sub t1q, mmsize
+ sub t2q, mmsize
+ jge .stride4_pre
+
+.transform:
+ mov strideq, 2*4
+ mov t4q, ctxq ; backup original context
+ mov t5q, [ctxq + AVTXContext.fn] ; subtransform's jump point
+ mov ctxq, [ctxq + AVTXContext.sub]
+ mov lutq, [ctxq + AVTXContext.map]
+ movsxd lenq, dword [ctxq + AVTXContext.len]
+
+ mov inq, outq ; in-place transform
+ call t5q ; call the FFT
+
+ mov ctxq, t4q ; restore original context
+ movsxd lenq, dword [ctxq + AVTXContext.len]
+ mov expq, [ctxq + AVTXContext.exp]
+ lea expq, [expq + lenq*4]
+
+ xor t1q, t1q ; low
+ lea t2q, [lenq*4 - mmsize] ; high
+
+.post:
+ movaps m2, [expq + t2q] ; tab h
+ movaps m3, [expq + t1q] ; tab l
+ movups m0, [outq + t2q] ; in h
+ movups m1, [outq + t1q] ; in l
+
+ movshdup m4, m2 ; tab h imim
+ movshdup m5, m3 ; tab l imim
+ movsldup m6, m2 ; tab h rere
+ movsldup m7, m3 ; tab l rere
+
+ shufps m2, m0, m0, q2301 ; in h imre
+ shufps m3, m1, m1, q2301 ; in l imre
+
+ mulps m6, m0
+ mulps m7, m1
+
+ fmaddsubps m4, m4, m2, m6
+ fmaddsubps m5, m5, m3, m7
+
+ vpermpd m3, m5, q0123 ; flip
+ vpermpd m2, m4, q0123 ; flip
+
+ blendps m1, m2, m5, 01010101b
+ blendps m0, m3, m4, 01010101b
+
+ movups [outq + t2q], m0
+ movups [outq + t1q], m1
+
+ add t1q, mmsize
+ sub t2q, mmsize
+ sub lenq, mmsize/2
+ jg .post
+
+ RET
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+IMDCT_FN avx2
+%endif
+
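+; Prime-factor 15xM transform (a sketch of the flow): 15-point transforms
+; along the first dimension into the temporary buffer, the M-point
+; subtransform along the second, then a LUT-driven reordering on writeback.
+; %1 - instruction set to use
+; %2 - if 1, emit the intra-asm `_asm` version and its `_ns` wrapper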
+%macro PFA_15_FN 2
+INIT_YMM %1
+%if %2
+cglobal fft_pfa_15xM_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
+ tgt5, stride3, stride5, btmp
+%else
+cglobal fft_pfa_15xM_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
+ tgt5, stride3, stride5, btmp
+%endif
+
+%if %2
+ PUSH inq
+ PUSH tgt5q
+ PUSH stride3q
+ PUSH stride5q
+ PUSH btmpq
+%endif
+
+ PUSH strideq
+
+ mov btmpq, outq
+
+ mov outq, [ctxq + AVTXContext.tmp]
+%if %2 == 0
+ movsxd lenq, dword [ctxq + AVTXContext.len]
+ mov lutq, [ctxq + AVTXContext.map]
+%endif
+
+ ; Load stride (second transform's length) and second transform's LUT
+ mov tmpq, [ctxq + AVTXContext.sub]
+ movsxd strideq, dword [tmpq + AVTXContext.len]
+ mov mapq, [tmpq + AVTXContext.map]
+
+ shl strideq, 3
+ imul stride3q, strideq, 3
+ imul stride5q, strideq, 5
+
+ movaps m11, [mask_mmppmmmm] ; mmppmmmm
+ movaps m10, [tab_53_float] ; tab5
+ movaps xm9, [tab_53_float + 32] ; tab3
+ vpermpd m9, m9, q1110 ; tab[23232323]
+ movaps m8, [s15_perm]
+
+.dim1:
+ mov tmpd, [mapq]
+ lea tgtq, [outq + tmpq*8]
+
+%if %2
+ movups xm0, [inq] ; in[0,1].reim
+ movddup xm5, [inq + 16] ; in[2].reimreim
+ movups m2, [inq + mmsize*0 + 24] ; in[3-6].reim
+ movups m3, [inq + mmsize*1 + 24] ; in[7-10].reim
+ movups m4, [inq + mmsize*2 + 24] ; in[11-14].reim
+%else
+ LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15 ; in[0,1].reim
+ LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7
+ LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15
+ LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7
+ mov tmpd, [lutq + 8]
+ movddup xm5, [inq + tmpq*8] ; in[2].reimreim
+%endif
+
+ FFT15
+
+ lea tgt5q, [tgtq + stride5q]
+ lea tmpq, [tgtq + stride5q*2]
+
+ movhps [tgtq], xm14 ; out[0]
+ movhps [tgtq + stride5q*1], xm15 ; out[5]
+ movlps [tgtq + stride5q*2], xm15 ; out[10]
+
+ vextractf128 xm3, m0, 1
+ vextractf128 xm4, m1, 1
+ vextractf128 xm5, m2, 1
+
+ movlps [tgtq + strideq*1], xm1
+ movhps [tgtq + strideq*2], xm2
+ movlps [tgtq + stride3q*1], xm3
+ movhps [tgtq + strideq*4], xm4
+ movlps [tgtq + stride3q*2], xm0
+ movlps [tgtq + strideq*8], xm5
+ movhps [tgtq + stride3q*4], xm0
+ movhps [tgt5q + strideq*2], xm1
+ movhps [tgt5q + strideq*4], xm3
+ movlps [tmpq + strideq*1], xm2
+ movlps [tmpq + stride3q*1], xm4
+ movhps [tmpq + strideq*4], xm5
+
+%if %2
+ add inq, mmsize*3 + 24
+%else
+ add lutq, (mmsize/2)*3 + 12
+%endif
+ add mapq, 4
+ sub lenq, 15
+ jg .dim1
+
+ ; Second transform setup
+ mov stride5q, ctxq ; backup original context
+ movsxd stride3q, dword [ctxq + AVTXContext.len] ; full length
+ mov tgt5q, [ctxq + AVTXContext.fn] ; subtransform's jump point
+
+ mov inq, outq ; in-place transform
+ mov ctxq, [ctxq + AVTXContext.sub] ; load subtransform's context
+ mov lutq, [ctxq + AVTXContext.map] ; load subtransform's map
+ movsxd lenq, dword [ctxq + AVTXContext.len] ; load subtransform's length
+
+.dim2:
+ call tgt5q ; call the FFT
+ lea inq, [inq + lenq*8]
+ lea outq, [outq + lenq*8]
+ sub stride3q, lenq
+ jg .dim2
+
+ mov ctxq, stride5q ; restore original context
+ mov lutq, [ctxq + AVTXContext.map]
+ mov inq, [ctxq + AVTXContext.tmp]
+ movsxd lenq, dword [ctxq + AVTXContext.len] ; full length
+
+ lea stride3q, [lutq + lenq*4] ; second part of the LUT
+ mov stride5q, lenq
+ mov tgt5q, btmpq
+ POP strideq
+ lea tmpq, [strideq + 2*strideq]
+
+.post:
+ LOAD64_LUT m0, inq, stride3q, 0, tmpq, m8, m9
+ vextractf128 xm1, m0, 1
+ movlps [tgt5q], xm0
+ movhps [tgt5q + strideq], xm0
+ movlps [tgt5q + strideq*2], xm1
+ movhps [tgt5q + tmpq], xm1
+
+ lea tgt5q, [tgt5q + 4*strideq]
+ add stride3q, mmsize/2
+ sub stride5q, mmsize/8
+ jg .post
+
+%if %2
+ mov outq, btmpq
+ POP btmpq
+ POP stride5q
+ POP stride3q
+ POP tgt5q
+ POP inq
+ ret
+%else
+ RET
+%endif
+
+%if %2
+cglobal fft_pfa_15xM_ns_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
+ tgt5, stride3, stride5, btmp
+ movsxd lenq, dword [ctxq + AVTXContext.len]
+ mov lutq, [ctxq + AVTXContext.map]
+
+ call mangle(ff_tx_fft_pfa_15xM_asm_float)
+ RET
+%endif
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+PFA_15_FN avx2, 0
+PFA_15_FN avx2, 1
+%endif