1 files changed, 331 insertions, 0 deletions
diff --git a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
new file mode 100644
index 0000000000..cbb8f25608
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
@@ -0,0 +1,331 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute one radix-4 FFT stage for an N-point complex (F32) signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Input registers: core-register aliases that FFTSTAGE reads on entry and then modifies
+@// (source pointer, destination pointer, twiddle pointer and the two stage counters).
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+
+@//Output Registers
+
+
+@// Local scratch registers. t1 aliases grpCount (both r3). setCount is r14, the link
+@// register, so the return address has to be saved first - presumably by M_START; confirm.
+#define grpCount        r3
+#define pointStep       r4
+#define outPointStep    r5
+#define stepTwiddle     r12
+#define setCount        r14
+#define srcStep         r8
+#define setStep         r9
+#define dstStep         r10
+#define twStep          r11
+#define t1              r3
+
+@// NEON registers: dW1-dW3 hold one twiddle pair each; dX*/dY*/dZ* hold the real (r) and
+@// imaginary (i) parts of the inputs, intermediates and outputs, two F32 lanes per register.
+#define dW1     D0.F32
+#define dW2     D1.F32
+#define dW3     D2.F32
+@// NOTE(review): D8-D15 are callee-saved under the AAPCS and no save is visible in this file - confirm.
+#define dXr0    D4.F32
+#define dXi0    D5.F32
+#define dXr1    D6.F32
+#define dXi1    D7.F32
+#define dXr2    D8.F32
+#define dXi2    D9.F32
+#define dXr3    D10.F32
+#define dXi3    D11.F32
+#define dYr0    D12.F32
+#define dYi0    D13.F32
+#define dYr1    D14.F32
+#define dYi1    D15.F32
+#define dYr2    D16.F32
+#define dYi2    D17.F32
+#define dYr3    D18.F32
+#define dYi3    D19.F32
+#define qT0     d16.f32
+#define qT1     d18.f32
+#define qT2     d12.f32
+#define qT3     d14.f32
+#define dZr0    D20.F32
+#define dZi0    D21.F32
+#define dZr1    D22.F32
+#define dZi1    D23.F32
+#define dZr2    D24.F32
+#define dZi2    D25.F32
+#define dZr3    D26.F32
+#define dZi3    D27.F32
+@// qT0-qT3 above alias D registers and are unused in this file. Q views: qYn = {dYrn,dYin}, likewise qZn and qX0.
+#define qY0     Q6.F32
+#define qY1     Q7.F32
+#define qY2     Q8.F32
+#define qY3     Q9.F32
+#define qX0     Q2.F32
+#define qZ0     Q10.F32
+#define qZ1     Q11.F32
+#define qZ2     Q12.F32
+#define qZ3     Q13.F32
+
+        .MACRO FFTSTAGE scaled, inverse , name
+        @// Emits one radix-4 FFT stage over complex F32 data, reading through pSrc and
+        @// writing through pDst. scaled is not referenced in the body; inverse = TRUE selects
+        @// the conjugated twiddle multiply and inverse store order; name suffixes the labels.
+        @// On exit subFFTNum is divided by 4, subFFTSize multiplied by 4, pSrc/pDst exchanged.
+        @// Update the stage bookkeeping right away: grpCount = 4*subFFTSize,
+        @// subFFTNum = subFFTNum/4, subFFTSize = grpCount.
+
+        LSL     grpCount,subFFTSize,#2
+        LSR     subFFTNum,subFFTNum,#2
+        MOV     subFFTSize,grpCount
+
+        VLD1     dW1,[pTwiddle]                    @//[wi | wr]
+        @// pointStep starts as 2*subFFTNum; that value is a factor of outPointStep
+        @// below, after which it is scaled by 4 to give the source stride (8*subFFTNum).
+        MOV     pointStep,subFFTNum,LSL #1
+
+
+        @// outPointStep = grpCount * (2*subFFTNum); SMULBB multiplies only the low
+        @// halfwords of its operands. outPointStep is the post-increment applied to
+        @// pDst between the outputs of one butterfly.
+
+        MOV     stepTwiddle,#0
+        VLD1     dW2,[pTwiddle]                    @//[wi | wr]
+        SMULBB  outPointStep,grpCount,pointStep
+        LSL     pointStep,pointStep,#2             @// pointStep = 8*subFFTNum
+        @// dW1, dW2 and dW3 are all loaded from the same address for the first group.
+        VLD1     dW3,[pTwiddle]                    @//[wi | wr]
+        MOV     srcStep,pointStep,LSL #1           @// srcStep = 2*pointStep
+        ADD     setStep,srcStep,pointStep          @// setStep = 3*pointStep
+
+        RSB     setStep,setStep,#0                 @// setStep = - 3*pointStep
+        SUB     srcStep,srcStep,#16                @// srcStep = 2*pointStep-16
+
+        MOV     dstStep,outPointStep,LSL #1
+        ADD     dstStep,dstStep,outPointStep       @// dstStep = 3*outPointStep
+        @// dstStep = - 3*outPointStep+16
+        RSB     dstStep,dstStep,#16
+
+        @// Group loop: one pass per group, each using the dW1-dW3 currently loaded;
+        @// grpCount counts down by 4. The pass starts by priming X0-X3 of its first set.
+radix4GrpLoop\name :
+
+        VLD2    {dXr0,dXi0},[pSrc],pointStep       @//  data[0]
+        ADD      stepTwiddle,stepTwiddle,pointStep
+        VLD2    {dXr1,dXi1},[pSrc],pointStep       @//  data[1]
+        @// advance pTwiddle by stepTwiddle: the address W1 is reloaded from after this group
+        ADD      pTwiddle,pTwiddle,stepTwiddle
+        VLD2    {dXr2,dXi2},[pSrc],pointStep       @//  data[2]
+        MOV      twStep,stepTwiddle,LSL #2
+
+        @//  data[3]; setStep = -3*pointStep rewinds pSrc to data[0] of this set
+        VLD2    {dXr3,dXi3},[pSrc],setStep
+        SUB      twStep,stepTwiddle,twStep         @// twStep = -3*stepTwiddle
+        @// setCount = pointStep/8; it is decremented by 2 per set-loop pass
+        MOV      setCount,pointStep,LSR #3
+        @// set pSrc to data[0] of the next set
+        ADD     pSrc,pSrc,#16
+        @// increment to data[1] of the next set
+        ADD     pSrc,pSrc,pointStep
+
+
+        @// Set loop: each pass handles two complex points from each of the four inputs,
+        @// while the loads for the following pass are interleaved with the arithmetic.
+radix4SetLoop\name :
+
+        @// Twiddle multiply with W = dW[0] + j*dW[1]: Zk = Xk * conj(Wk) when inverse,
+        @// Zk = Xk * Wk otherwise (k = 1..3).
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   dZr1,dXr1,dW1[0]
+            VMUL   dZi1,dXi1,dW1[0]
+            VMUL   dZr2,dXr2,dW2[0]
+            VMUL   dZi2,dXi2,dW2[0]
+            VMUL   dZr3,dXr3,dW3[0]
+            VMUL   dZi3,dXi3,dW3[0]
+
+            VMLA   dZr1,dXi1,dW1[1]                @// real part
+            VMLS   dZi1,dXr1,dW1[1]                @// imag part
+
+            @//  data[1] for next iteration
+            VLD2    {dXr1,dXi1},[pSrc],pointStep
+
+            VMLA   dZr2,dXi2,dW2[1]                @// real part
+            VMLS   dZi2,dXr2,dW2[1]                @// imag part
+
+            @//  data[2] for next iteration
+            VLD2    {dXr2,dXi2},[pSrc],pointStep
+
+            VMLA   dZr3,dXi3,dW3[1]                @// real part
+            VMLS   dZi3,dXr3,dW3[1]                @// imag part
+        .else
+            VMUL   dZr1,dXr1,dW1[0]
+            VMUL   dZi1,dXi1,dW1[0]
+            VMUL   dZr2,dXr2,dW2[0]
+            VMUL   dZi2,dXi2,dW2[0]
+            VMUL   dZr3,dXr3,dW3[0]
+            VMUL   dZi3,dXi3,dW3[0]
+
+            VMLS   dZr1,dXi1,dW1[1]                @// real part
+            VMLA   dZi1,dXr1,dW1[1]                @// imag part
+
+            @//  data[1] for next iteration
+            VLD2    {dXr1,dXi1},[pSrc],pointStep
+
+            VMLS   dZr2,dXi2,dW2[1]                @// real part
+            VMLA   dZi2,dXr2,dW2[1]                @// imag part
+
+            @//  data[2] for next iteration
+            VLD2    {dXr2,dXi2},[pSrc],pointStep
+
+            VMLS   dZr3,dXi3,dW3[1]                @// real part
+            VMLA   dZi3,dXr3,dW3[1]                @// imag part
+        .endif
+
+        @//  data[3] for the next set; post-increment by setStep returns pSrc to data[0].
+        @// Skip the load on the very last pass (grpCount = 4 and setCount = 2), where it
+        @// would read past the end of pSrc; pSrc is still advanced by setStep.
+        cmp     grpCount, #4
+        cmpeq   setCount, #2                      @// Test setCount if grpCount = 4
+        @// These are executed only if both grpCount = 4 and setCount = 2
+        addeq   pSrc, pSrc, setStep
+        beq     radix4SkipRead\name
+        VLD2    {dXr3,dXi3},[pSrc],setStep
+radix4SkipRead\name:
+        SUBS    setCount,setCount,#2
+        @// The flags from this SUBS are what the BGT closing the set loop tests.
+        @// First butterfly stage: Y0/Y2 = X0 +/- Z2 and Y1/Y3 = Z1 +/- Z3
+        VADD    qY0,qX0,qZ2
+        VSUB    qY2,qX0,qZ2
+
+        @//  data[0] for next iteration; the writeback moves pSrc past the 16 bytes loaded
+        VLD2    {dXr0,dXi0},[pSrc, :128]!
+        VADD    qY1,qZ1,qZ3
+        VSUB    qY3,qZ1,qZ3
+
+        @// Second butterfly stage: Z0 = Y2 - Y1 and Z2 = Y2 + Y1; the other two outputs
+        @// are Y0 + j*Y3 and Y0 - j*Y3, formed from the separate real/imaginary D halves.
+        VSUB    qZ0,qY2,qY1
+        @// NOTE(review): Z0 expands to X0 - Z1 - Z2 - Z3, which presumably relies on the
+        @// twiddle table holding negated factors - confirm against the table setup code.
+        .ifeqs  "\inverse", "TRUE"
+            @// Inverse: Z1 = Y0 + j*Y3, Z3 = Y0 - j*Y3; stored in the order Z0, Z3, Z2, Z1
+            VADD    dZr3,dYr0,dYi3
+            VST2    {dZr0,dZi0},[pDst, :128],outPointStep
+            VSUB    dZi3,dYi0,dYr3
+
+            VADD    qZ2,qY2,qY1
+            VST2    {dZr3,dZi3},[pDst, :128],outPointStep
+
+            VSUB    dZr1,dYr0,dYi3
+            VST2    {dZr2,dZi2},[pDst, :128],outPointStep
+            VADD    dZi1,dYi0,dYr3
+
+            VST2    {dZr1,dZi1},[pDst, :128],dstStep
+
+
+        .else
+            @// Forward: same Z1 and Z3 values; stored in the order Z0, Z1, Z2, Z3
+            VSUB    dZr1,dYr0,dYi3
+            VST2    {dZr0,dZi0},[pDst, :128],outPointStep
+            VADD    dZi1,dYi0,dYr3
+
+            VADD    qZ2,qY2,qY1
+            VST2    {dZr1,dZi1},[pDst, :128],outPointStep
+
+            VADD    dZr3,dYr0,dYi3
+            VST2    {dZr2,dZi2},[pDst, :128],outPointStep
+            VSUB    dZi3,dYi0,dYr3
+
+            VST2    {dZr3,dZi3},[pDst, :128],dstStep
+
+
+        .endif
+        @// dstStep = 16 - 3*outPointStep, so pDst ends 16 bytes beyond where this set began
+        @// increment to data[1] of the next set
+        ADD     pSrc,pSrc,pointStep
+        BGT     radix4SetLoop\name
+        @// Fetch W1, W2, W3 for the next group at pTwiddle, +stepTwiddle and +2*stepTwiddle; twStep
+        @// then returns pTwiddle to its value at the top of this pass. Also runs after the last group.
+        VLD1     dW1,[pTwiddle, :64],stepTwiddle    @//[wi | wr]
+        @// subtract 4 since grpCount was multiplied by 4; the BGT below tests these flags
+        SUBS    grpCount,grpCount,#4
+        VLD1     dW2,[pTwiddle, :64],stepTwiddle    @//[wi | wr]
+        @// increment pSrc for the next grp
+        ADD     pSrc,pSrc,srcStep
+        VLD1     dW3,[pTwiddle, :64],twStep         @//[wi | wr]
+        BGT     radix4GrpLoop\name
+
+
+        @// Reset and swap pSrc and pDst for the next stage (t1 reuses r3 from grpCount)
+        MOV     t1,pDst
+        @// pDst = pSrc - 4*outPointStep; pSrc = previous pDst - outPointStep
+        SUB     pDst,pSrc,outPointStep,LSL #2
+        SUB     pSrc,t1,outPointStep
+
+
+        .endm
+
+        @// Forward stage entry point: FFTSTAGE with scaled = FALSE, inverse = FALSE, label suffix FWD.
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+        @// Inverse stage entry point: FFTSTAGE with scaled = FALSE, inverse = TRUE, label suffix INV.
+        M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        .end