1 files changed, 404 insertions, 0 deletions
diff --git a/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S b/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
new file mode 100644
index 0000000000..00358352be
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
@@ -0,0 +1,404 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute FFT for a real signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+    @// Guarding implementation by the processor name
+
+@// Import symbols required from other files
+@// (For example tables)
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+
+
+@// Core register aliases, grouped by physical register. Names that share a
+@// register must never be live at the same time.
+@// r0-r3: arguments on entry (pSrc, pDst, pFFTSpec, scale); r0 returns result.
+#define pSrc          r0
+#define result        r0
+#define pDst          r1
+#define argTwiddle    r1
+#define pFFTSpec      r2
+#define argDst        r2
+#define diffMinusOne  r2
+#define scale         r3
+#define step          r3
+
+@// r4-r10 and r14: local scratch registers.
+#define argScale      r4
+#define tmpOrder      r4
+#define pTwiddle      r4
+#define x0r           r4
+#define step1         r4
+#define pOut          r5
+#define x0i           r5
+#define pTwiddleTmp   r5
+#define subFFTNum     r6
+#define N             r6
+#define subFFTSizeTmp r6
+#define subFFTSize    r7
+@// count: total number of radix stages required to complete the FFT
+#define count         r8
+#define twStep        r8
+#define diff          r9
+#define zero          r9
+#define t0            r10
+#define order         r14
+
+@// NEON register aliases, grouped by D register; each carries a type suffix.
+@// The q-prefixed names (qT0..qT3) are D registers too.
+@// dX1, dW0, dW1 and dY0..dY3 belong to the original last-three-elements group.
+
+#define dX0       d0.f32
+#define half      d0.f32
+#define dzero     d1.f32
+#define dZero     d2.f32
+#define dX0r      d2.f32
+#define dShift    d3.f32
+#define dX0i      d3.f32
+#define dX1       d3.f32
+#define dX1r      d4.f32
+#define dW0       d4.f32
+#define dX1i      d5.f32
+#define dW1       d5.f32
+
+#define dT0       d6.f32
+#define dT1       d7.f32
+#define dT2       d8.f32
+#define dT3       d9.f32
+
+#define qT0       d10.f32
+#define dY0       d10.f32
+#define dY1       d11.f32
+#define qT1       d12.f32
+#define dY2       d12.f32
+#define dY3       d13.f32
+
+#define dW0r      d14.f32
+#define dY0r      d14.f32
+#define dY0rS64   d14.s64
+#define dW0i      d15.f32
+#define dY0i      d15.f32
+#define dY0iS64   d15.s64
+#define dW1r      d16.f32
+#define dY1r      d16.f32
+#define dW1i      d17.f32
+#define dY1i      d17.f32
+#define qT2       d18.f32
+#define qT3       d20.f32
+@// omxSP_FFTFwd_RToCCS_F32_Sfs: FFT of a real float signal of N points.
+@// In:  r0 = pSrc, r1 = pDst, r2 = pFFTSpec.  Out: r0 = OMX_Sts_NoErr.
+@// r3 (scale) is never read by the instructions in this file; it is reused as step.
+    @// Function header; r11 and d15 presumably bound the saved registers (see armCOMM_s.h)
+        M_START     omxSP_FFTFwd_RToCCS_F32_Sfs,r11,d15
+
+@// Byte offsets of the FFTSpec fields (pBitRev is not read in this file)
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// No stack arguments are defined for this function
+
+        @// Read the FFT size N from the spec (its log2 is taken further down)
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @// Read the pTwiddle and pBuf fields; pBuf is used as pOut
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        @// N = 1 is treated separately: the output is the three words {x[0], 0, 0}
+        CMP     N,#1
+        BGT     sizeGreaterThanOne                  @// signed: any N not above 1 stays here
+        VLD1    dX0[0],[pSrc]                       @// lane 0 of d0 = x[0]
+        MOV     zero,#0
+        VMOV    dzero[0],zero
+        VMOV    dZero[0],zero
+        VST3    {dX0[0],dzero[0],dZero[0]},[pDst]
+
+        B       End
+
+
+
+sizeGreaterThanOne:
+        @// Run an N/2-point complex FFT over the input; FFTEnd then converts the result
+
+        MOV     N,N,ASR #1                          @// N/2 point complex FFT
+
+        CLZ     order,N                             @// N = 2^order
+        RSB     order,order,#31                     @// order = 31 - CLZ(N/2)
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N                      (both names are r6: would be a no-op)
+        @// order lives in r14 (LR): each BL overwrites it, so it is tested before any call
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @// order > 3
+
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @// order > 0
+        VLD1    dX0,[pSrc]                          @// order = 0: copy 8 bytes to pOut
+        VST1    dX0,[pOut]
+        MOV     pSrc,pOut
+        MOV     argDst,pDst
+        BLT     FFTEnd                              @// flags still come from CMP order,#1
+
+orderGreaterthan0:
+        @// set the buffers appropriately for various orders
+        CMP     order,#2
+        MOVEQ   argDst,pDst
+        MOVNE   argDst,pOut
+        @// Pass the first stage destination in r5 (pOut)
+        MOVNE   pOut,pDst
+        MOV     argTwiddle,pTwiddle                 @// overwrites r1 (pDst), copied above
+        @// order 2 runs two stage calls below; orders 1 and 3 run one and three
+        CMP     order,#1
+        BGT     orderGreaterthan1
+        @// order = 1
+        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        B       FFTEnd
+
+orderGreaterthan1:
+        CMP     order,#2
+        BGT     orderGreaterthan2
+        @// order =2
+        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        BL      armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+orderGreaterthan2:@// order =3
+        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        BL      armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
+        BL      armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+
+        B       FFTEnd
+
+
+
+orderGreaterthan3:
+specialScaleCase:
+        @// Bit 1 of order decides how pDst and the pBuf pointer map to argDst and pOut
+        @// Set input args to fft stages
+        TST     order, #2
+        MOVEQ   argDst,pDst
+        MOVNE   argDst,pOut
+        @// Pass the first stage destination in r5 (pOut)
+        MOVNE   pOut,pDst
+        MOV     argTwiddle,pTwiddle                 @// overwrites r1 (pDst), copied above
+
+        @//check for even or odd order
+        @// NOTE: The following combination of BL's would work fine even though
+        @// the first BL would corrupt the flags. This is because the end of
+        @// the "grpZeroSetLoop" loop inside
+        @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag
+        @// to EQ
+        @// Even order: radix-4 first stage. Odd order: radix-8 first stage.
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+        @// subFFTNum (r6) is presumably updated by the stage routines - confirm
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+
+        @// The first BEQ uses the flags of the CMP above, later ones those of the CMP below
+unscaledRadix4Loop:
+        BEQ        lastStageUnscaledRadix4
+         BL        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+        BL      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
+
+
+FFTEnd:
+finalComplexToRealFixup:
+        @// Convert the complex result Z (read from pSrc) into the spectrum F of the
+        @// real input (written to argDst). In the formulas below Z(0) = a+jb.
+        @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+        @// 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] - j [0+j2b]
+        @// (a+b, 0)
+
+        @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+        @// 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] + j [0+j2b]
+        @// (a-b, 0)
+
+        @// F(0) and F(N/2)
+        VLD2    {dX0r[0],dX0i[0]},[pSrc]!         @// lane 0: a and b; pSrc += 8
+        MOV     zero,#0
+        VMOV    dX0r[1],zero
+        MOV     step,subFFTSize,LSL #3            @// step = N/2 * 8 bytes
+        VMOV    dX0i[1],zero
+        @// twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,subFFTSize,LSL #1
+
+        VADD    dY0r,dX0r,dX0i                    @// F(0) = ((Z0.r+Z0.i) , 0)
+        MOV     step1,subFFTSize,LSL #2           @// step1 = N/2 * 4 bytes
+        VSUB    dY0i,dX0r,dX0i                    @// F(N/2) = ((Z0.r-Z0.i) , 0)
+        SUBS    subFFTSize,subFFTSize,#2          @// flags feed the BLT/BEQ below
+
+        VST1    dY0r,[argDst],step                @// store F(0); argDst += step
+        ADD     pTwiddleTmp,argTwiddle,#8         @// W^2
+        VST1    dY0i,[argDst]!                    @// store F(N/2); argDst += 8
+        ADD     argTwiddle,argTwiddle,twStep      @// W^1
+
+        VDUP    dzero,zero
+        SUB     argDst,argDst,step                @// argDst = 8 bytes past the F(0) slot
+
+        BLT     End                               @// subFFTSize was below 2: nothing left
+        BEQ     lastElement                       @// subFFTSize was 2: one element left
+        SUB     step,step,#24
+        SUB     step1,step1,#8                    @// (N/4-1)*8 bytes
+
+        @// F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
+        @// Note: W^k is stored as negative values in the table
+        @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
+        @// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+
+        @// Load 0.5 from HALF into lane 0 of d0 (half); used as the scalar multiplier
+        ADR     t0, HALF
+        VLD1    half[0], [t0]
+
+evenOddButterflyLoop:
+        @// Each pass loads two points from the front and two from the mirrored back
+        @// position, step bytes further on, and produces the matching four outputs.
+        VLD1    dW0r,[argTwiddle],step1
+        VLD1    dW1r,[argTwiddle]!
+        @// VLD2 de-interleaves: dX0r/dX1r get even words, dX0i/dX1i get odd words
+        VLD2    {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle,argTwiddle,step1       @// net: argTwiddle += 8 per pass
+        VLD2    {dX1r,dX1i},[pSrc]!
+
+
+
+        SUB     step1,step1,#8                    @// (N/4-2)*8 bytes on the first pass
+        VLD1    dW0i,[pTwiddleTmp],step1
+        VLD1    dW1i,[pTwiddleTmp]!
+        SUB     pSrc,pSrc,step                    @// net: pSrc += 16 per pass
+
+        SUB     pTwiddleTmp,pTwiddleTmp,step1     @// net: pTwiddleTmp += 8 per pass
+        VREV64  dX1r,dX1r                         @// swap the two lanes of the back pair
+        VREV64  dX1i,dX1i
+        SUBS    subFFTSize,subFFTSize,#4          @// flags are used by BGT at the end
+
+        @// a+jb = front points, c+jd = lane-swapped back points, two lanes at a time
+
+        VSUB    dT2,dX0r,dX1r                     @// a-c
+        SUB     step1,step1,#8                    @// second decrement of step1 per pass
+        VADD    dT0,dX0r,dX1r                     @// a+c
+        VSUB    dT1,dX0i,dX1i                     @// b-d
+        VADD    dT3,dX0i,dX1i                     @// b+d
+        VMUL   dT0,dT0,half[0]                    @// (a+c)/2
+        VMUL   dT1,dT1,half[0]                    @// (b-d)/2
+        VZIP    dW1r,dW1i
+        VZIP    dW0r,dW0i
+        @// After VZIP each dWxr holds lane 0 of both loaded registers and each dWxi
+        @// holds lane 1 (presumably real and imaginary twiddle parts - confirm layout)
+        VMUL   qT0,dW1r,dT2                       @// qT0 = W1r*(a-c)
+        VMUL   qT1,dW1r,dT3                       @// qT1 = W1r*(b+d)
+        VMUL   qT2,dW0r,dT2                       @// qT2 = W0r*(a-c)
+        VMUL   qT3,dW0r,dT3                       @// qT3 = W0r*(b+d)
+
+        VMLA   qT0,dW1i,dT3                       @// qT0 += W1i*(b+d)
+        VMLS   qT1,dW1i,dT2                       @// qT1 -= W1i*(a-c)
+
+        VMLS   qT2,dW0i,dT3                       @// qT2 -= W0i*(b+d)
+        VMLA   qT3,dW0i,dT2                       @// qT3 += W0i*(a-c)
+
+        @// Back pair: halve the products and combine them with dT0/dT1
+        VMUL  dX1r,qT0,half[0]
+        VMUL  dX1i,qT1,half[0]
+        @// dY1r/dY1i reuse d16/d17 (dW1r/dW1i), which are not read again in this pass
+        VSUB    dY1r,dT0,dX1i                     @// F(N/2 -1)
+        VADD    dY1i,dT1,dX1r
+        VNEG    dY1i,dY1i
+
+        VREV64  dY1r,dY1r                         @// undo the lane swap before storing
+        VREV64  dY1i,dY1i
+
+        @// Front pair: same steps; dX0r/dX0i are reused as temporaries here
+        VMUL  dX0r,qT2,half[0]
+        VMUL  dX0i,qT3,half[0]
+        @// dY0r/dY0i reuse d14/d15 (dW0r/dW0i), which are not read again in this pass
+        VSUB    dY0r,dT0,dX0i                     @// F(1)
+        VADD    dY0i,dT1,dX0r
+
+        @// Store the front pair at argDst and the back pair step bytes further on
+        VST2    {dY0r,dY0i},[argDst],step
+        VST2    {dY1r,dY1i},[argDst]!
+        SUB     argDst,argDst,step                @// net: argDst += 16 per pass
+        SUB     step,step,#32                     @// front and back are now 32 bytes closer
+
+
+        BGT     evenOddButterflyLoop
+
+        @// Step both pointers back by one complex point (8 bytes) to the last element
+        SUB     pSrc,pSrc,#8
+        SUB     argDst,argDst,#8
+
+        @// lastElement is also entered directly by the BEQ taken when subFFTSize was 2;
+        @// that path skips the two SUBs above.
+        @// Last element can be expanded as follows
+        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
+        @// (a-bc, -bd)
+        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement:
+        VLD1    dX0r,[pSrc]                       @// dX0r = (a, b)
+
+        VST1    dX0r[0],[argDst]!                 @// store a; argDst += 4
+        VNEG    dX0r,dX0r
+        VST1    dX0r[1],[argDst]!                 @// store -b
+
+End:
+        @// Every path through the routine returns OMX_Sts_NoErr
+        MOV     result, #OMX_Sts_NoErr
+
+        @// Write function tail
+        M_END
+HALF:   .float  0.5                               @// constant read through ADR t0, HALF
+        .end