diff options
Diffstat (limited to 'media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S')
-rw-r--r-- | media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S | 404 |
1 files changed, 404 insertions, 0 deletions
diff --git a/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S b/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S new file mode 100644 index 0000000000..00358352be --- /dev/null +++ b/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S @@ -0,0 +1,404 @@ +@// +@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@// +@// Use of this source code is governed by a BSD-style license +@// that can be found in the LICENSE file in the root of the source +@// tree. An additional intellectual property rights grant can be found +@// in the file PATENTS. All contributing project authors may +@// be found in the AUTHORS file in the root of the source tree. +@// +@// This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s +@// to support float instead of SC32. +@// + +@// +@// Description: +@// Compute FFT for a real signal +@// +@// + + +@// Include standard headers + +#include "dl/api/armCOMM_s.h" +#include "dl/api/omxtypes_s.h" + + +@// Import symbols required from other files +@// (For example tables) + + .extern armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe + .extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe + .extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe + .extern armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe + .extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe + .extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe + .extern armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe + +@// Set debugging level +@//DEBUG_ON SETL {TRUE} + + + +@// Guarding implementation by the processor name + + + + @// Guarding implementation by the processor name + +@// Import symbols required from other files +@// (For example tables) + .extern armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe + .extern armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe + + +@//Input Registers + +#define pSrc r0 +#define pDst r1 +#define pFFTSpec r2 +#define scale r3 + + +@// Output registers +#define result r0 + +@//Local Scratch Registers + +#define argTwiddle r1 +#define argDst r2 +#define argScale r4 +#define tmpOrder r4 +#define pTwiddle r4 +#define pOut r5 +#define subFFTSize r7 +#define subFFTNum r6 +#define N r6 +#define order r14 +#define diff r9 +@// Total num of radix stages required to comple the FFT +#define count r8 +#define x0r r4 +#define x0i r5 +#define diffMinusOne r2 +#define subFFTSizeTmp r6 +#define step r3 +#define step1 r4 +#define twStep r8 +#define zero r9 +#define pTwiddleTmp r5 +#define t0 r10 + +@// Neon registers + +#define dX0 d0.f32 +#define dzero d1.f32 +#define dZero d2.f32 +#define dShift d3.f32 +#define dX0r d2.f32 +#define dX0i d3.f32 +#define dX1r d4.f32 +#define dX1i d5.f32 +#define dT0 d6.f32 +#define dT1 d7.f32 +#define dT2 d8.f32 +#define dT3 d9.f32 +#define qT0 d10.f32 +#define qT1 d12.f32 +#define dW0r d14.f32 +#define dW0i d15.f32 +#define dW1r d16.f32 +#define dW1i d17.f32 +#define dY0r d14.f32 +#define dY0i d15.f32 +#define dY1r d16.f32 +#define dY1i d17.f32 +#define dY0rS64 d14.s64 +#define dY0iS64 d15.s64 +#define qT2 d18.f32 +#define qT3 d20.f32 +@// lastThreeelements +#define dX1 d3.f32 +#define dW0 d4.f32 +#define dW1 d5.f32 +#define dY0 d10.f32 +#define dY1 d11.f32 +#define dY2 d12.f32 +#define dY3 d13.f32 + +#define half d0.f32 + + @// Allocate stack memory required by the function + + @// Write function header + M_START omxSP_FFTFwd_RToCCS_F32_Sfs,r11,d15 + +@ Structure offsets for the FFTSpec + .set ARMsFFTSpec_N, 0 + .set ARMsFFTSpec_pBitRev, 4 + .set ARMsFFTSpec_pTwiddle, 8 + .set ARMsFFTSpec_pBuf, 12 + + @// Define stack arguments + + @// Read the size from structure and take log + LDR N, [pFFTSpec, #ARMsFFTSpec_N] + + @// Read other structure parameters + LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle] + LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf] + + @// N=1 Treat seperately + CMP N,#1 + BGT sizeGreaterThanOne + VLD1 dX0[0],[pSrc] + MOV zero,#0 + VMOV dzero[0],zero + VMOV dZero[0],zero + VST3 {dX0[0],dzero[0],dZero[0]},[pDst] + + B End + + + +sizeGreaterThanOne: + @// Do a N/2 point complex FFT including the scaling + + MOV N,N,ASR #1 @// N/2 point complex FFT + + CLZ order,N @// N = 2^order + RSB order,order,#31 + MOV subFFTSize,#1 + @//MOV subFFTNum,N + + CMP order,#3 + BGT orderGreaterthan3 @// order > 3 + + CMP order,#1 + BGE orderGreaterthan0 @// order > 0 + VLD1 dX0,[pSrc] + VST1 dX0,[pOut] + MOV pSrc,pOut + MOV argDst,pDst + BLT FFTEnd + +orderGreaterthan0: + @// set the buffers appropriately for various orders + CMP order,#2 + MOVEQ argDst,pDst + MOVNE argDst,pOut + @// Pass the first stage destination in RN5 + MOVNE pOut,pDst + MOV argTwiddle,pTwiddle + + CMP order,#1 + BGT orderGreaterthan1 + @// order = 1 + BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe + B FFTEnd + +orderGreaterthan1: + CMP order,#2 + BGT orderGreaterthan2 + @// order =2 + BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe + BL armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe + B FFTEnd + +orderGreaterthan2:@// order =3 + BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe + BL armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe + BL armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe + + B FFTEnd + + + +orderGreaterthan3: +specialScaleCase: + + @// Set input args to fft stages + TST order, #2 + MOVEQ argDst,pDst + MOVNE argDst,pOut + @// Pass the first stage destination in RN5 + MOVNE pOut,pDst + MOV argTwiddle,pTwiddle + + @//check for even or odd order + @// NOTE: The following combination of BL's would work fine even though + @// the first BL would corrupt the flags. This is because the end of + @// the "grpZeroSetLoop" loop inside + @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag + @// to EQ + + TST order,#0x00000001 + BLEQ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe + BLNE armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe + + CMP subFFTNum,#4 + BLT FFTEnd + + +unscaledRadix4Loop: + BEQ lastStageUnscaledRadix4 + BL armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe + CMP subFFTNum,#4 + B unscaledRadix4Loop + +lastStageUnscaledRadix4: + BL armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe + B FFTEnd + + +FFTEnd: +finalComplexToRealFixup: + + + @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)] + @// 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)] + @// 1/2[2a+j0] - j [0+j2b] + @// (a+b, 0) + + @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)] + @// 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)] + @// 1/2[2a+j0] + j [0+j2b] + @// (a-b, 0) + + @// F(0) and F(N/2) + VLD2 {dX0r[0],dX0i[0]},[pSrc]! + MOV zero,#0 + VMOV dX0r[1],zero + MOV step,subFFTSize,LSL #3 @// step = N/2 * 8 bytes + VMOV dX0i[1],zero + @// twStep = 3N/8 * 8 bytes pointing to W^1 + SUB twStep,step,subFFTSize,LSL #1 + + VADD dY0r,dX0r,dX0i @// F(0) = ((Z0.r+Z0.i) , 0) + MOV step1,subFFTSize,LSL #2 @// step1 = N/2 * 4 bytes + VSUB dY0i,dX0r,dX0i @// F(N/2) = ((Z0.r-Z0.i) , 0) + SUBS subFFTSize,subFFTSize,#2 + + VST1 dY0r,[argDst],step + ADD pTwiddleTmp,argTwiddle,#8 @// W^2 + VST1 dY0i,[argDst]! + ADD argTwiddle,argTwiddle,twStep @// W^1 + + VDUP dzero,zero + SUB argDst,argDst,step + + BLT End + BEQ lastElement + SUB step,step,#24 + SUB step1,step1,#8 @// (N/4-1)*8 bytes + + @// F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)] + @// Note: W^k is stored as negative values in the table + @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1) + @// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1) + + + ADR t0, HALF + VLD1 half[0], [t0] + +evenOddButterflyLoop: + + + VLD1 dW0r,[argTwiddle],step1 + VLD1 dW1r,[argTwiddle]! + + VLD2 {dX0r,dX0i},[pSrc],step + SUB argTwiddle,argTwiddle,step1 + VLD2 {dX1r,dX1i},[pSrc]! + + + + SUB step1,step1,#8 @// (N/4-2)*8 bytes + VLD1 dW0i,[pTwiddleTmp],step1 + VLD1 dW1i,[pTwiddleTmp]! + SUB pSrc,pSrc,step + + SUB pTwiddleTmp,pTwiddleTmp,step1 + VREV64 dX1r,dX1r + VREV64 dX1i,dX1i + SUBS subFFTSize,subFFTSize,#4 + + + + VSUB dT2,dX0r,dX1r @// a-c + SUB step1,step1,#8 + VADD dT0,dX0r,dX1r @// a+c + VSUB dT1,dX0i,dX1i @// b-d + VADD dT3,dX0i,dX1i @// b+d + VMUL dT0,dT0,half[0] + VMUL dT1,dT1,half[0] + VZIP dW1r,dW1i + VZIP dW0r,dW0i + + + VMUL qT0,dW1r,dT2 + VMUL qT1,dW1r,dT3 + VMUL qT2,dW0r,dT2 + VMUL qT3,dW0r,dT3 + + VMLA qT0,dW1i,dT3 + VMLS qT1,dW1i,dT2 + + VMLS qT2,dW0i,dT3 + VMLA qT3,dW0i,dT2 + + + VMUL dX1r,qT0,half[0] + VMUL dX1i,qT1,half[0] + + VSUB dY1r,dT0,dX1i @// F(N/2 -1) + VADD dY1i,dT1,dX1r + VNEG dY1i,dY1i + + VREV64 dY1r,dY1r + VREV64 dY1i,dY1i + + + VMUL dX0r,qT2,half[0] + VMUL dX0i,qT3,half[0] + + VSUB dY0r,dT0,dX0i @// F(1) + VADD dY0i,dT1,dX0r + + + VST2 {dY0r,dY0i},[argDst],step + VST2 {dY1r,dY1i},[argDst]! + SUB argDst,argDst,step + SUB step,step,#32 @// (N/2-4)*8 bytes + + + BGT evenOddButterflyLoop + + @// set both the ptrs to the last element + SUB pSrc,pSrc,#8 + SUB argDst,argDst,#8 + + + + @// Last element can be expanded as follows + @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)] + @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)] + @// 1/2[2a+j0] + j (c+jd) [0+j2b] + @// (a-bc, -bd) + @// Since (c,d) = (0,1) for the last element, result is just (a,-b) + +lastElement: + VLD1 dX0r,[pSrc] + + VST1 dX0r[0],[argDst]! + VNEG dX0r,dX0r + VST1 dX0r[1],[argDst]! + +End: + @// Set return value + MOV result, #OMX_Sts_NoErr + + @// Write function tail + M_END +HALF: .float 0.5 + .end |