diff options
Diffstat (limited to 'media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S')
-rw-r--r-- | media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S | 422 |
1 files changed, 0 insertions, 422 deletions
diff --git a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S deleted file mode 100644 index 569818c5c3..0000000000 --- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S +++ /dev/null @@ -1,422 +0,0 @@ -@// -@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. -@// -@// Use of this source code is governed by a BSD-style license -@// that can be found in the LICENSE file in the root of the source -@// tree. An additional intellectual property rights grant can be found -@// in the file PATENTS. All contributing project authors may -@// be found in the AUTHORS file in the root of the source tree. -@// -@// This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s -@// to support float instead of SC32. -@// - -@// -@// Description: -@// Compute a first stage Radix 8 FFT stage for a N point complex signal -@// -@// - - -@// Include standard headers - -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" - -@// Import symbols required from other files -@// (For example tables) - - -@// Set debugging level -@//DEBUG_ON SETL {TRUE} - - - -@// Guarding implementation by the processor name - - - - -@// Guarding implementation by the processor name - -@//Input Registers - -#define pSrc r0 -#define pDst r2 -#define pTwiddle r1 -#define subFFTNum r6 -#define subFFTSize r7 -@// dest buffer for the next stage (not pSrc for first stage) -#define pPingPongBuf r5 - - -@//Output Registers - - -@//Local Scratch Registers - -#define grpSize r3 -@// Reuse grpSize as setCount -#define setCount r3 -#define pointStep r4 -#define outPointStep r4 -#define setStep r8 -#define step1 r9 -#define step2 r10 -#define t0 r11 - - -@// Neon Registers - -#define dXr0 D0 -#define dXi0 D1 -#define dXr1 D2 -#define dXi1 D3 -#define dXr2 D4 -#define dXi2 D5 -#define dXr3 D6 -#define dXi3 D7 -#define dXr4 D8 -#define dXi4 D9 -#define dXr5 D10 -#define dXi5 D11 
-#define dXr6 D12
-#define dXi6 D13
-#define dXr7 D14
-#define dXi7 D15
-#define qX0 Q0
-#define qX1 Q1
-#define qX2 Q2
-#define qX3 Q3
-#define qX4 Q4
-#define qX5 Q5
-#define qX6 Q6
-#define qX7 Q7
-@// dU*/qU*: first butterfly stage results; even indices use D16-D23, odd indices use D24-D31
-#define dUr0 D16
-#define dUi0 D17
-#define dUr2 D18
-#define dUi2 D19
-#define dUr4 D20
-#define dUi4 D21
-#define dUr6 D22
-#define dUi6 D23
-#define dUr1 D24
-#define dUi1 D25
-#define dUr3 D26
-#define dUi3 D27
-#define dUr5 D28
-#define dUi5 D29
-@// NOTE(review): dUr7/dUi7 are D30/D31 here; only dT0/dT1 (defined below) share D14/D15 with dXr7/dXi7
-#define dUr7 D30
-#define dUi7 D31
-#define qU0 Q8
-#define qU1 Q12
-#define qU2 Q9
-#define qU3 Q13
-#define qU4 Q10
-#define qU5 Q14
-#define qU6 Q11
-#define qU7 Q15
-
-@// dV*/qV*: same D16-D31 pool as dU*, with the two groups swapped (even V in D24-D31, odd V in D16-D23)
-#define dVr0 D24
-#define dVi0 D25
-#define dVr2 D26
-#define dVi2 D27
-#define dVr4 D28
-#define dVi4 D29
-#define dVr6 D30
-#define dVi6 D31
-#define dVr1 D16
-#define dVi1 D17
-#define dVr3 D18
-#define dVi3 D19
-#define dVr5 D20
-#define dVi5 D21
-#define dVr7 D22
-#define dVi7 D23
-#define qV0 Q12
-#define qV1 Q8
-#define qV2 Q13
-#define qV3 Q9
-#define qV4 Q14
-#define qV5 Q10
-#define qV6 Q15
-#define qV7 Q11
-@// dY*/qY*: output values; register assignment is identical to dU*/qU*
-#define dYr0 D16
-#define dYi0 D17
-#define dYr2 D18
-#define dYi2 D19
-#define dYr4 D20
-#define dYi4 D21
-#define dYr6 D22
-#define dYi6 D23
-#define dYr1 D24
-#define dYi1 D25
-#define dYr3 D26
-#define dYi3 D27
-#define dYr5 D28
-#define dYi5 D29
-#define dYr7 D30
-#define dYi7 D31
-#define qY0 Q8
-#define qY1 Q12
-#define qY2 Q9
-#define qY3 Q13
-#define qY4 Q10
-#define qY5 Q14
-#define qY6 Q11
-#define qY7 Q15
-@// Scratch registers; D14/D15 are the same registers as dXr7/dXi7, so they are only free once X7 is consumed
-#define dT0 D14
-#define dT1 D15
-@// FFTSTAGE scaled, inverse, name: first-stage radix-8 butterfly loop over all sets (group 0 only).
-@//   inverse: "TRUE" assembles the .ifeqs branches, any other string assembles the .ELSE branches.
-@//   name:    suffix appended to the local labels and to the ONEBYSQRT2 constant label.
-        .MACRO FFTSTAGE scaled, inverse, name
-        @// The scaled argument is not referenced anywhere in this macro body.
-        @// Define stack arguments
-        @// (no stack accesses appear in this macro)
-        @// Update pSubFFTSize and pSubFFTNum regs
-        @// subFFTSize is set to 8 by this stage
-        MOV     subFFTSize,#8
-        ADR     t0,ONEBYSQRT2\name
-        @// t0 now holds the address of the 0.7071... (1/sqrt(2)) constant emitted after M_END
-        @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
-        LSR     grpSize,subFFTNum,#3
-        MOV     subFFTNum,grpSize
-
-
-        @// NOTE(review): pT0 is not defined in this file; the two notes below look inherited from another version
-        @// pointStep is a byte count equal to 8*grpSize (see the MOV below)
-        @// Note: outPointStep = pointStep for the first stage (both names alias r4)
-
-        MOV     pointStep,grpSize,LSL #3
-
-
-        @// Calculate the step of input data for the next set
-        @//MOV     step1,pointStep,LSL #1                         @// step1 = 2*pointStep
-        VLD2.F32    {dXr0,dXi0},[pSrc, :128],pointStep          @// data[0]
-        MOV     step1,grpSize,LSL #4
-        @// step1 = 16*grpSize = 2*pointStep: the stride between consecutive stores of same-parity outputs
-        MOV     step2,pointStep,LSL #3
-        VLD2.F32    {dXr1,dXi1},[pSrc, :128],pointStep          @// data[1]
-        SUB     step2,step2,pointStep                           @// step2 = 7*pointStep
-        @// setStep = - 7*pointStep+16
-        RSB     setStep,step2,#16
-
-        VLD2.F32    {dXr2,dXi2},[pSrc, :128],pointStep          @// data[2]
-        VLD2.F32    {dXr3,dXi3},[pSrc, :128],pointStep          @// data[3]
-        VLD2.F32    {dXr4,dXi4},[pSrc, :128],pointStep          @// data[4]
-        VLD2.F32    {dXr5,dXi5},[pSrc, :128],pointStep          @// data[5]
-        VLD2.F32    {dXr6,dXi6},[pSrc, :128],pointStep          @// data[6]
-        @// data[7] & update pSrc for the next set
-        @//  setStep = -7*pointStep + 16
-        VLD2.F32    {dXr7,dXi7},[pSrc, :128],setStep
-        @// grp = 0 a special case since all the twiddle factors are 1
-        @// Loop on the sets
-
-radix8fsGrpZeroSetLoop\name :
-        @// Each pass consumes the eight X register pairs already loaded and preloads the next eight;
-        @// Decrement setCount by 2. The flags set here are the ones tested by BEQ and BGT further down
-        SUBS    setCount,setCount,#2
-        @// (the SUB/ADD instructions in between are the non-flag-setting forms).
-
-        @// finish first stage of 8 point FFT
-
-        VADD.F32    qU0,qX0,qX4
-        VADD.F32    qU2,qX1,qX5
-        VADD.F32    qU4,qX2,qX6
-        VADD.F32    qU6,qX3,qX7
-
-        @// finish second stage of 8 point FFT
-
-        VADD.F32    qV0,qU0,qU4
-        VSUB.F32    qV2,qU0,qU4
-        VADD.F32    qV4,qU2,qU6
-        VSUB.F32    qV6,qU2,qU6
-
-        @// finish third stage of 8 point FFT
-
-        VADD.F32    qY0,qV0,qV4
-        VSUB.F32    qY4,qV0,qV4
-        VST2.F32    {dYr0,dYi0},[pDst, :128],step1              @// store y0
-
-        .ifeqs  "\inverse", "TRUE"
-        @// Inverse: Y2 = (Vr2 - Vi6, Vi2 + Vr6), Y6 = (Vr2 + Vi6, Vi2 - Vr6)
-        VSUB.F32    dYr2,dVr2,dVi6
-        VADD.F32    dYi2,dVi2,dVr6
-
-        VADD.F32    dYr6,dVr2,dVi6
-        VST2.F32    {dYr2,dYi2},[pDst, :128],step1              @// store y2
-        VSUB.F32    dYi6,dVi2,dVr6
-
-        VSUB.F32    qU1,qX0,qX4
-        VST2.F32    {dYr4,dYi4},[pDst, :128],step1              @// store y4
-
-        VSUB.F32    qU3,qX1,qX5
-        VSUB.F32    qU5,qX2,qX6
-        VST2.F32    {dYr6,dYi6},[pDst, :128],step1              @// store y6
-
-        .ELSE
-        @// Forward: same two values, but the dY6 registers go to slot 2 and the dY2 registers to slot 6
-        VADD.F32    dYr6,dVr2,dVi6
-        VSUB.F32    dYi6,dVi2,dVr6
-
-        VSUB.F32    dYr2,dVr2,dVi6
-        VST2.F32    {dYr6,dYi6},[pDst, :128],step1              @// store y2 (held in dYr6/dYi6 in this branch)
-        VADD.F32    dYi2,dVi2,dVr6
-
-
-        VSUB.F32    qU1,qX0,qX4
-        VST2.F32    {dYr4,dYi4},[pDst, :128],step1              @// store y4
-        VSUB.F32    qU3,qX1,qX5
-        VSUB.F32    qU5,qX2,qX6
-        VST2.F32    {dYr2,dYi2},[pDst, :128],step1              @// store y6 (held in dYr2/dYi2 in this branch)
-
-
-        .ENDIF
-
-        @// finish first stage of 8 point FFT
-
-        VSUB.F32    qU7,qX3,qX7
-        VLD1.F32    dT0[0], [t0]
-        @// dT0 is D14 (dXr7), so the constant is loaded only after the last read of qX7 above
-        @// finish second stage of 8 point FFT
-
-        VSUB.F32    dVr1,dUr1,dUi5
-        @// data[0] for next iteration
-        VLD2.F32    {dXr0,dXi0},[pSrc, :128],pointStep
-        VADD.F32    dVi1,dUi1,dUr5
-        VADD.F32    dVr3,dUr1,dUi5
-        VLD2.F32    {dXr1,dXi1},[pSrc, :128],pointStep          @// data[1]
-        VSUB.F32    dVi3,dUi1,dUr5
-
-        VSUB.F32    dVr5,dUr3,dUi7
-        VLD2.F32    {dXr2,dXi2},[pSrc, :128],pointStep          @// data[2]
-        VADD.F32    dVi5,dUi3,dUr7
-        VADD.F32    dVr7,dUr3,dUi7
-        VLD2.F32    {dXr3,dXi3},[pSrc, :128],pointStep          @// data[3]
-        VSUB.F32    dVi7,dUi3,dUr7
-
-        @// finish third stage of 8 point FFT
-
-        .ifeqs  "\inverse", "TRUE"
-
-        @// calculate a*v5: result is (Vr5 - Vi5, Vr5 + Vi5) * dT0[0]
-        VMUL.F32    dT1,dVr5,dT0[0]                             @// dT1 (D15) is the scratch register
-
-        VLD2.F32    {dXr4,dXi4},[pSrc, :128],pointStep          @// data[4]
-        VMUL.F32    dVi5,dVi5,dT0[0]
-
-        VLD2.F32    {dXr5,dXi5},[pSrc, :128],pointStep          @// data[5]
-        VSUB.F32    dVr5,dT1,dVi5                               @// a * V5
-        VADD.F32    dVi5,dT1,dVi5
-
-        VLD2.F32    {dXr6,dXi6},[pSrc, :128],pointStep          @// data[6]
-
-        @// calculate b*v7: result is (Vr7 + Vi7, Vi7 - Vr7) * dT0[0]
-        VMUL.F32    dT1,dVr7,dT0[0]
-        VMUL.F32    dVi7,dVi7,dT0[0]
-
-        VADD.F32    qY1,qV1,qV5
-        VSUB.F32    qY5,qV1,qV5
-
-
-        VADD.F32    dVr7,dT1,dVi7                               @// b * V7
-        VSUB.F32    dVi7,dVi7,dT1
-        SUB     pDst, pDst, step2                               @// set pDst to y1
-
-        @// On the last iteration, this will read past the end of pSrc,
-        @// so skip this read. (BEQ tests the Z flag left by the SUBS at the top of the loop.)
-        BEQ     radix8SkipLastUpdateInv\name
-        VLD2.F32    {dXr7,dXi7},[pSrc, :128],setStep            @// data[7]
-radix8SkipLastUpdateInv\name:
-
-        VSUB.F32    dYr3,dVr3,dVr7
-        VSUB.F32    dYi3,dVi3,dVi7
-        VST2.F32    {dYr1,dYi1},[pDst, :128],step1              @// store y1
-        VADD.F32    dYr7,dVr3,dVr7
-        VADD.F32    dYi7,dVi3,dVi7
-
-
-        VST2.F32    {dYr3,dYi3},[pDst, :128],step1              @// store y3
-        VST2.F32    {dYr5,dYi5},[pDst, :128],step1              @// store y5
-        VST2.F32    {dYr7,dYi7},[pDst, :128]                    @// store y7
-        ADD     pDst, pDst, #16
-        @// pDst is now 16 bytes past y7; the forward branch reaches the same point with a writeback store
-        .ELSE
-
-        @// calculate b*v7: result is (Vr7 + Vi7, Vi7 - Vr7) * dT0[0]
-        VMUL.F32    dT1,dVr7,dT0[0]
-        VLD2.F32    {dXr4,dXi4},[pSrc, :128],pointStep          @// data[4]
-        VMUL.F32    dVi7,dVi7,dT0[0]
-
-        VLD2.F32    {dXr5,dXi5},[pSrc, :128],pointStep          @// data[5]
-        VADD.F32    dVr7,dT1,dVi7                               @// b * V7
-        VSUB.F32    dVi7,dVi7,dT1
-
-        VLD2.F32    {dXr6,dXi6},[pSrc, :128],pointStep          @// data[6]
-
-        @// calculate a*v5: result is (Vr5 - Vi5, Vr5 + Vi5) * dT0[0]
-        VMUL.F32    dT1,dVr5,dT0[0]                             @// dT1 (D15) is the scratch register
-        VMUL.F32    dVi5,dVi5,dT0[0]
-
-        VADD.F32    dYr7,dVr3,dVr7
-        VADD.F32    dYi7,dVi3,dVi7
-        SUB     pDst, pDst, step2                               @// set pDst to y1
-
-        VSUB.F32    dVr5,dT1,dVi5                               @// a * V5
-        VADD.F32    dVi5,dT1,dVi5
-
-        @// On the last iteration, this will read past the end of pSrc,
-        @// so skip this read. (BEQ tests the Z flag left by the SUBS at the top of the loop.)
-        BEQ     radix8SkipLastUpdateFwd\name
-        VLD2.F32    {dXr7,dXi7},[pSrc, :128],setStep            @// data[7]
-radix8SkipLastUpdateFwd\name:
-
-        VSUB.F32    qY5,qV1,qV5
-
-        VSUB.F32    dYr3,dVr3,dVr7
-        VST2.F32    {dYr7,dYi7},[pDst, :128],step1              @// store y1 (held in dYr7/dYi7 in this branch)
-        VSUB.F32    dYi3,dVi3,dVi7
-        VADD.F32    qY1,qV1,qV5
-
-
-        VST2.F32    {dYr5,dYi5},[pDst, :128],step1              @// store y3 (held in dYr5/dYi5 in this branch)
-        VST2.F32    {dYr3,dYi3},[pDst, :128],step1              @// store y5 (held in dYr3/dYi3 in this branch)
-        VST2.F32    {dYr1,dYi1},[pDst, :128]!                   @// store y7 (held in dYr1/dYi1); writeback moves pDst past the stored data
-
-        .ENDIF
-
-
-        @// update pDst for the next set: back 7*pointStep from (y7 + 16), i.e. 16 bytes past this pass's y0
-        SUB     pDst, pDst, step2
-        BGT     radix8fsGrpZeroSetLoop\name
-
-
-        @// reset pSrc to pDst for the next stage
-        SUB     pSrc,pDst,pointStep                             @// pSrc = pDst - pointStep
-        MOV     pDst,pPingPongBuf
-
-
-
-        .endm
-
-
-        @// Allocate stack memory required by the function
-        @// NOTE(review): M_START/M_END come from the included armCOMM_s.h; their expansion is not visible here
-        @// Each entry point expands FFTSTAGE once; its ONEBYSQRT2 constant is emitted right after M_END
-        M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
-            FFTSTAGE "FALSE","FALSE",FWD
-        M_END
-ONEBYSQRT2FWD:  .float  0.7071067811865476e0
-
-        M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
-            FFTSTAGE "FALSE","TRUE",INV
-        M_END
-ONEBYSQRT2INV:  .float  0.7071067811865476e0
-
-
-        .end |