@// media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S

@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
@//  to support float instead of SC32.
@//

@//
@// Description:
@// Compute FFT for a real signal
@//
@//


@// Include standard headers

#include "dl/api/armCOMM_s.h"
#include "dl/api/omxtypes_s.h"


@// Import symbols required from other files
@// (For example tables)

        .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe

@// Set debugging level
@//DEBUG_ON    SETL {TRUE}


@// Guarding implementation by the processor name


    @// Guarding implementation by the processor name

@// Import symbols required from other files
@// (For example tables)
        .extern  armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe


@// Register map.
@//
@// The aliases are grouped by physical register so that sharing is visible
@// at a glance: every name listed under one register refers to the same
@// storage, so writing one of them destroys the others.
@// Comments are kept on their own lines because text placed after a
@// #define value would become part of the macro expansion.

@// ---------------------------------------------------------------------
@// ARM core registers
@// ---------------------------------------------------------------------

@// r0: input pointer on entry, status code on return
#define pSrc                r0
#define result              r0

@// r1: output pointer on entry, twiddle argument for the stage calls
#define pDst                r1
#define argTwiddle          r1

@// r2: FFT spec pointer on entry, destination argument for the stage calls
#define pFFTSpec            r2
#define argDst              r2
#define diffMinusOne        r2

@// r3: fourth input argument, byte stride in the fixup loop
#define scale               r3
#define step                r3

@// r4: twiddle table pointer and assorted scratch
#define argScale            r4
#define tmpOrder            r4
#define pTwiddle            r4
#define x0r                 r4
#define step1               r4

@// r5: scratch buffer pointer, second twiddle cursor in the fixup loop
#define pOut                r5
#define x0i                 r5
#define pTwiddleTmp         r5

@// r6: transform length / number of sub-FFTs
#define subFFTNum           r6
#define N                   r6
#define subFFTSizeTmp       r6

@// r7: size of one sub-FFT
#define subFFTSize          r7

@// r8
@// count: total number of radix stages required to complete the FFT
#define count               r8
#define twStep              r8

@// r9
#define diff                r9
#define zero                r9

@// r10: address scratch
#define t0                  r10

@// r14: log2 of the complex FFT length (this is the link register)
#define order               r14

@// ---------------------------------------------------------------------
@// NEON registers (all of them are D registers, including the qT* names)
@// ---------------------------------------------------------------------

@// d0
#define dX0                 d0
#define half                d0

@// d1
#define dzero               d1

@// d2
#define dZero               d2
#define dX0r                d2

@// d3
#define dShift              d3
#define dX0i                d3
#define dX1                 d3

@// d4
#define dX1r                d4
#define dW0                 d4

@// d5
#define dX1i                d5
#define dW1                 d5

@// d6 - d9: butterfly temporaries
#define dT0                 d6
#define dT1                 d7
#define dT2                 d8
#define dT3                 d9

@// d10 - d13: product temporaries and the dY0..dY3 names
@// (dX1, dW0, dW1 and dY0..dY3 form the "last three elements" group)
#define qT0                 d10
#define dY0                 d10
#define dY1                 d11
#define qT1                 d12
#define dY2                 d12
#define dY3                 d13

@// d14 / d15: twiddle W0 on input, result Y0 on output
#define dW0r                d14
#define dY0r                d14
#define dY0rS64             d14.s64
#define dW0i                d15
#define dY0i                d15
#define dY0iS64             d15.s64

@// d16 / d17: twiddle W1 on input, result Y1 on output
#define dW1r                d16
#define dY1r                d16
#define dW1i                d17
#define dY1i                d17

@// d18, d20: product temporaries
#define qT2                 d18
#define qT3                 d20

    @// Allocate stack memory required by the function

    @// Write function header
        @// omxSP_FFTFwd_RToCCS_F32_Sfs
        @//
        @// Forward FFT of a real float sequence. The routine runs an
        @// N/2-point complex FFT by chaining the external radix-2/4/8
        @// stage routines, then applies the complex-to-real fixup that
        @// starts at FFTEnd.
        @//
        @// In:   r0 = pSrc, r1 = pDst, r2 = pFFTSpec.
        @//       r3 is named scale but is never read here; it is reused
        @//       as the byte stride step in the fixup.
        @// Out:  r0 = OMX_Sts_NoErr on every path.
        @//
        @// order lives in r14 (LR), which every BL overwrites, so it is
        @// only tested before the first stage call on each path.
        @//
        @// NOTE(review): M_START/M_END come from armCOMM_s.h. The r11,d15
        @// arguments presumably name the highest callee-saved core and
        @// NEON registers to preserve - confirm against the macro.
        @// NOTE(review): the stage routines are external. This file
        @// assumes they consume and update pSrc, argDst, argTwiddle, pOut,
        @// subFFTNum and subFFTSize - confirm in their sources.
        M_START     omxSP_FFTFwd_RToCCS_F32_Sfs,r11,d15

@ Structure offsets for the FFTSpec
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12

        @// Define stack arguments

        @// Read the size from structure and take log
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

        @// Read other structure parameters
        @// (pFFTSpec is not needed after these loads; r2 is reused as
        @// argDst below)
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

        @//  N=1 Treat separately
        @// Any N <= 1 takes this path (signed compare). One float is read
        @// and three floats are written to pDst: x[0], 0, 0.
        CMP     N,#1
        BGT     sizeGreaterThanOne
        VLD1.F32    dX0[0],[pSrc]
        MOV     zero,#0
        VMOV.F32    dzero[0],zero
        VMOV.F32    dZero[0],zero
        VST3.F32    {dX0[0],dzero[0],dZero[0]},[pDst]

        B       End


sizeGreaterThanOne:
        @// Do a N/2 point complex FFT including the scaling

        MOV     N,N,ASR #1                          @// N/2 point complex FFT

        @// order = 31 - clz(N), i.e. floor(log2(N)) of the halved length
        CLZ     order,N                             @// N = 2^order
        RSB     order,order,#31
        MOV     subFFTSize,#1
        @//MOV     subFFTNum,N

        CMP     order,#3
        BGT     orderGreaterthan3                   @// order > 3

        CMP     order,#1
        BGE     orderGreaterthan0                   @// order > 0
        @// order = 0: a single complex point, so no FFT stage is needed.
        @// Copy the two floats into the scratch buffer and let the fixup
        @// read from there and write to pDst.
        VLD1.F32    dX0,[pSrc]
        VST1.F32    dX0,[pOut]
        MOV     pSrc,pOut
        MOV     argDst,pDst
        @// Flags still come from CMP order,#1 above (none of the four
        @// instructions in between sets flags), so this is always taken.
        BLT     FFTEnd

orderGreaterthan0:
        @// set the buffers appropriately for various orders
        @// order = 2 : argDst = pDst, pOut unchanged
        @// otherwise : argDst = pOut, then pOut = pDst
        @// The three MOVs below must stay in this order: argDst (r2) must
        @// take the old pOut before pOut is overwritten, and pOut must
        @// take pDst (r1) before r1 is overwritten as argTwiddle.
        CMP     order,#2
        MOVEQ   argDst,pDst
        MOVNE   argDst,pOut
        @// Pass the first stage destination in RN5
        MOVNE   pOut,pDst
        MOV     argTwiddle,pTwiddle

        CMP     order,#1
        BGT     orderGreaterthan1
        @// order = 1
        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
        B       FFTEnd

orderGreaterthan1:
        @// order (r14) is read here for the last time on this path; the
        @// BLs that follow overwrite it.
        CMP     order,#2
        BGT     orderGreaterthan2
        @// order =2
        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
        BL      armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
        B       FFTEnd

orderGreaterthan2:@// order =3
        @// Three radix-2 stages: first, middle, last.
        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
        BL      armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
        BL      armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe

        B       FFTEnd


orderGreaterthan3:
specialScaleCase:

        @// Set input args to fft stages
        @// Bit 1 of order selects which buffer the first stage writes to.
        @// As in orderGreaterthan0, the MOV order matters because argDst,
        @// pOut and argTwiddle alias the registers being read.
        TST     order, #2
        MOVEQ   argDst,pDst
        MOVNE   argDst,pOut
        @// Pass the first stage destination in RN5
        MOVNE   pOut,pDst
        MOV     argTwiddle,pTwiddle

        @//check for even or odd order
        @// NOTE: The following combination of BL's would work fine even though
        @// the first BL would corrupt the flags. This is because the end of
        @// the "grpZeroSetLoop" loop inside
        @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag
        @// to EQ
        @// Even order: radix-4 first stage. Odd order: radix-8 first stage.

        TST     order,#0x00000001
        BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
        BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe

        @// subFFTNum < 4 : no further stage
        @// subFFTNum = 4 : only the last radix-4 stage remains
        @// subFFTNum > 4 : run generic radix-4 stages until it reaches 4
        CMP        subFFTNum,#4
        BLT     FFTEnd


unscaledRadix4Loop:
        @// The BEQ tests the flags of the most recent CMP subFFTNum,#4:
        @// the one above on entry, or the one at the bottom of the loop
        @// (the unconditional B leaves the flags untouched).
        BEQ        lastStageUnscaledRadix4
         BL        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
         CMP        subFFTNum,#4
         B        unscaledRadix4Loop

lastStageUnscaledRadix4:
        BL      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
        B        FFTEnd


FFTEnd:
finalComplexToRealFixup:

        @// On arrival subFFTSize is 1 if no stage ran (order 0).
        @// Otherwise it holds whatever the stage routines left in r7 -
        @// presumably the complex FFT length, as the N/2 step comments
        @// below assume.

        @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
        @// 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] - j [0+j2b]
        @// (a+b, 0)

        @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
        @// 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] + j [0+j2b]
        @// (a-b, 0)

        @// F(0) and F(N/2)
        @// Load Z(0) as (re, im) into lane 0 of dX0r/dX0i and zero lane 1
        @// so the sums below produce (value, 0) pairs. pSrc advances by
        @// 8 bytes to Z(1).
        VLD2.F32    {dX0r[0],dX0i[0]},[pSrc]!
        MOV     zero,#0
        VMOV.F32    dX0r[1],zero
        MOV     step,subFFTSize,LSL #3            @// step = N/2 * 8 bytes
        VMOV.F32    dX0i[1],zero
        @// twStep = 3N/8 * 8 bytes pointing to W^1
        @// (computed as 8*subFFTSize - 2*subFFTSize = 6*subFFTSize bytes)
        SUB     twStep,step,subFFTSize,LSL #1

        VADD.F32    dY0r,dX0r,dX0i                    @// F(0) = ((Z0.r+Z0.i) , 0)
        MOV     step1,subFFTSize,LSL #2           @// step1 = N/2 * 4 bytes
        VSUB.F32    dY0i,dX0r,dX0i                    @// F(N/2) = ((Z0.r-Z0.i) , 0)
        @// The flags set here are consumed by BLT End / BEQ lastElement
        @// below; nothing in between writes the flags.
        SUBS    subFFTSize,subFFTSize,#2

        @// Store F(0) at argDst, then F(N/2) step bytes further on.
        VST1.F32    dY0r,[argDst],step
        ADD     pTwiddleTmp,argTwiddle,#8         @// W^2
        VST1.F32    dY0i,[argDst]!
        ADD     argTwiddle,argTwiddle,twStep      @// W^1

        VDUP.F32    dzero,zero
        @// Net effect of the two stores and this SUB: argDst has moved
        @// forward by 8 bytes, to the slot after F(0).
        SUB     argDst,argDst,step

        @// subFFTSize was 1: only F(0) and F(N/2) exist, so we are done.
        BLT     End
        @// subFFTSize was 2: only the single middle element remains.
        BEQ     lastElement
        SUB     step,step,#24
        SUB     step1,step1,#8                    @// (N/4-1)*8 bytes

        @// F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
        @// Note: W^k is stored as negative values in the table
        @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
        @// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)


        @// Load the constant 0.5 into lane 0 of half (d0).
        ADR     t0, HALF
        VLD1.F32    half[0], [t0]

evenOddButterflyLoop:

        @// Each iteration reads two complex inputs from the low end
        @// (dX0r/dX0i) and two from the high end (dX1r/dX1i), and writes
        @// two outputs at each end. The paired post-increment and SUB
        @// sequences move each cursor forward by a net fixed amount:
        @// argTwiddle and pTwiddleTmp by 8 bytes, pSrc and argDst by 16.

        VLD1.F32    dW0r,[argTwiddle],step1
        VLD1.F32    dW1r,[argTwiddle]!

        @// VLD2 de-interleaves: dX0r = two real parts, dX0i = two
        @// imaginary parts.
        VLD2.F32    {dX0r,dX0i},[pSrc],step
        SUB     argTwiddle,argTwiddle,step1
        VLD2.F32    {dX1r,dX1i},[pSrc]!


        SUB     step1,step1,#8                    @// (N/4-2)*8 bytes
        VLD1.F32    dW0i,[pTwiddleTmp],step1
        VLD1.F32    dW1i,[pTwiddleTmp]!
        SUB     pSrc,pSrc,step

        SUB     pTwiddleTmp,pTwiddleTmp,step1
        @// Swap the two lanes of the high-end inputs so that lane i of
        @// dX1* lines up with lane i of dX0*.
        VREV64.F32  dX1r,dX1r
        VREV64.F32  dX1i,dX1i
        @// Loop counter. The flags are consumed by BGT at the bottom of
        @// the loop; nothing in between writes the flags.
        SUBS    subFFTSize,subFFTSize,#4


        VSUB.F32    dT2,dX0r,dX1r                     @// a-c
        SUB     step1,step1,#8
        VADD.F32    dT0,dX0r,dX1r                     @// a+c
        VSUB.F32    dT1,dX0i,dX1i                     @// b-d
        VADD.F32    dT3,dX0i,dX1i                     @// b+d
        @// dT0 = (a+c)/2, dT1 = (b-d)/2
        VMUL.F32   dT0,dT0,half[0]
        VMUL.F32   dT1,dT1,half[0]
        VZIP.F32    dW1r,dW1i
        VZIP.F32    dW0r,dW0i


        @// qT0..qT3 are D registers despite their names.
        @// qT0 = W1r*(a-c) + W1i*(b+d)
        @// qT1 = W1r*(b+d) - W1i*(a-c)
        @// qT2 = W0r*(a-c) - W0i*(b+d)
        @// qT3 = W0r*(b+d) + W0i*(a-c)
        VMUL.F32   qT0,dW1r,dT2
        VMUL.F32   qT1,dW1r,dT3
        VMUL.F32   qT2,dW0r,dT2
        VMUL.F32   qT3,dW0r,dT3

        VMLA.F32   qT0,dW1i,dT3
        VMLS.F32   qT1,dW1i,dT2

        VMLS.F32   qT2,dW0i,dT3
        VMLA.F32   qT3,dW0i,dT2


        @// From here dX1r/dX1i and dX0r/dX0i are reused as scratch for
        @// the halved products. dY1r/dY1i share d16/d17 with dW1r/dW1i
        @// and dY0r/dY0i share d14/d15 with dW0r/dW0i; each twiddle pair
        @// has already been consumed before its register is overwritten.
        VMUL.F32  dX1r,qT0,half[0]
        VMUL.F32  dX1i,qT1,half[0]

        VSUB.F32    dY1r,dT0,dX1i                     @// F(N/2 -1)
        VADD.F32    dY1i,dT1,dX1r
        VNEG.F32    dY1i,dY1i

        @// Swap lanes back so the high-end pair is stored in ascending
        @// memory order.
        VREV64.F32  dY1r,dY1r
        VREV64.F32  dY1i,dY1i


        VMUL.F32  dX0r,qT2,half[0]
        VMUL.F32  dX0i,qT3,half[0]

        VSUB.F32    dY0r,dT0,dX0i                     @// F(1)
        VADD.F32    dY0i,dT1,dX0r


        @// VST2 re-interleaves (re, im) pairs on the way out.
        VST2.F32    {dY0r,dY0i},[argDst],step
        VST2.F32    {dY1r,dY1i},[argDst]!
        SUB     argDst,argDst,step
        SUB     step,step,#32                     @// (N/2-4)*8 bytes


        @// Tests the flags from the SUBS subFFTSize,#4 above.
        BGT     evenOddButterflyLoop

        @// set both the ptrs to the last element
        SUB     pSrc,pSrc,#8
        SUB     argDst,argDst,#8


        @// Last element can be expanded as follows
        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
        @// (a-bc, -bd)
        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)

lastElement:
        @// Load (a, b); store a, negate the whole vector, then store -b.
        VLD1.F32    dX0r,[pSrc]

        VST1.F32    dX0r[0],[argDst]!
        VNEG.F32    dX0r,dX0r
        VST1.F32    dX0r[1],[argDst]!

End:
        @// Set return value
        MOV     result, #OMX_Sts_NoErr

        @// Write function tail
        M_END
        @// Constant 0.5 used by the fixup loop; it is placed after the
        @// function tail and addressed with ADR.
HALF:   .float  0.5
        .end