/*
 * Project Ne10 — An Open Optimized Software Library Project for the
 * ARM Architecture.
 *
 * File: NE10_rfft_float32.c
 */
1/*
2 * Copyright 2014-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/* license of Kiss FFT */
29/*
30Copyright (c) 2003-2010, Mark Borgerding
31
32All rights reserved.
33
34Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
35
36 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
37 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
38 * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
39
40THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41*/
42
43/*
44 * NE10 Library : dsp/NE10_rfft_float32.c
45 */
46
47#include "NE10_types.h"
48#include "NE10_macros.h"
49#include "NE10_fft.h"
50#include "NE10_dsp.h"
51
52#if (NE10_UNROLL_LEVEL > 0)
53
/*
 * ne10_radix8_r2c_c
 *
 * First-stage radix-8 butterfly of the forward real-to-complex FFT
 * (plain C reference implementation).
 *
 * Both buffers are accessed as flat arrays of real ne10_float32_t values.
 * For each of the `fstride` columns the routine reads 8 real inputs spaced
 * `in_step` (= nfft/8) apart, applies a radix-8 real butterfly using the
 * fixed twiddle constants TW_81_F32 / TW_81N_F32 (presumably +/-cos(pi/4)
 * -- confirm against NE10_fft.h), and writes 8 contiguous reals.
 *
 * @param Fout     output buffer; written as packed reals starting at index 1
 *                 -- index 0 is deliberately left untouched (the top-level
 *                 r2c wrapper later moves/zeroes the DC slot)
 * @param Fin      input buffer, read as reals
 * @param fstride  number of butterfly columns to process
 * @param mstride  unused here; kept so all stage functions share a signature
 * @param nfft     full transform length (the 8 taps are nfft/8 apart)
 */
void ne10_radix8_r2c_c (ne10_fft_cpx_float32_t *Fout,
                        const ne10_fft_cpx_float32_t *Fin,
                        const ne10_int32_t fstride,
                        const ne10_int32_t mstride,
                        const ne10_int32_t nfft)
{
    const ne10_int32_t in_step = nfft >> 3; // spacing between the 8 input taps
    ne10_int32_t f_count;

    ne10_float32_t scratch_in[8];
    ne10_float32_t scratch [4];

    /* real pointers */
    const ne10_float32_t* Fin_r = (ne10_float32_t*) Fin;
    ne10_float32_t* Fout_r = (ne10_float32_t*) Fout;
    Fout_r ++; // always leave the first real empty

    for (f_count = fstride; f_count; f_count --)
    {
        // even slots: sums of opposite taps; odd slots: differences
        scratch_in[0] = Fin_r[in_step * 0] + Fin_r[in_step * (0 + 4)];
        scratch_in[1] = Fin_r[in_step * 0] - Fin_r[in_step * (0 + 4)];
        scratch_in[2] = Fin_r[in_step * 1] + Fin_r[in_step * (1 + 4)];
        scratch_in[3] = Fin_r[in_step * 1] - Fin_r[in_step * (1 + 4)];
        scratch_in[4] = Fin_r[in_step * 2] + Fin_r[in_step * (2 + 4)];
        scratch_in[5] = Fin_r[in_step * 2] - Fin_r[in_step * (2 + 4)];
        scratch_in[6] = Fin_r[in_step * 3] + Fin_r[in_step * (3 + 4)];
        scratch_in[7] = Fin_r[in_step * 3] - Fin_r[in_step * (3 + 4)];

        // apply the fixed radix-8 twiddles to the two odd difference terms
        scratch_in[3] *= TW_81_F32;
        scratch_in[7] *= TW_81N_F32;

        // radix 2 butterfly
        scratch[0] = scratch_in[0] + scratch_in[4];
        scratch[1] = scratch_in[2] + scratch_in[6];
        scratch[2] = scratch_in[7] - scratch_in[3];
        scratch[3] = scratch_in[3] + scratch_in[7];

        Fout_r[0] = scratch [0] + scratch [1]; // sum of all 8 inputs (DC of this column)
        Fout_r[7] = scratch [0] - scratch [1];

        Fout_r[1] = scratch_in[1] + scratch [3];
        Fout_r[5] = scratch_in[1] - scratch [3];

        Fout_r[2] = scratch [2] - scratch_in[5];
        Fout_r[6] = scratch [2] + scratch_in[5];

        Fout_r[3] = scratch_in[0] - scratch_in[4];

        Fout_r[4] = scratch_in[6] - scratch_in[2];

        // next column: input advances by one real, output by eight
        Fin_r ++;
        Fout_r += 8;
    }
}
108
/*
 * ne10_radix8_c2r_c
 *
 * Last-stage radix-8 butterfly of the inverse complex-to-real FFT
 * (plain C); the arithmetic inverse of ne10_radix8_r2c_c.
 *
 * Reads 8 contiguous reals per column, undoes the radix-8 combination
 * (dividing by TW_81_F32 / TW_81N_F32 where the forward path multiplied),
 * and scatters 8 reals `in_step` apart.  When NE10_DSP_RFFT_SCALING is
 * defined, each output is additionally scaled by 1/nfft so that the
 * c2r(r2c(x)) round trip reproduces x.
 *
 * @param Fout     output buffer, written as reals spaced in_step apart
 * @param Fin      input buffer, read as 8 contiguous reals per column
 * @param fstride  number of columns to process
 * @param mstride  unused here; uniform stage signature
 * @param nfft     full transform length (also the normalisation divisor)
 */
void ne10_radix8_c2r_c (ne10_fft_cpx_float32_t *Fout,
                        const ne10_fft_cpx_float32_t *Fin,
                        const ne10_int32_t fstride,
                        const ne10_int32_t mstride,
                        const ne10_int32_t nfft)
{
    const ne10_int32_t in_step = nfft >> 3;
    ne10_int32_t f_count;

    ne10_float32_t scratch_in[8];

    // computed in double precision, then narrowed to float
    const ne10_float32_t one_by_N = 1.0 / nfft;

    /* real pointers */
    const ne10_float32_t* Fin_r = (ne10_float32_t*) Fin;
    ne10_float32_t* Fout_r = (ne10_float32_t*) Fout;

    for (f_count = fstride; f_count; f_count --)
    {
        // recombine the packed spectrum terms written by the forward stage
        scratch_in[0] = Fin_r[0] + Fin_r[3] + Fin_r[3] + Fin_r[7];
        scratch_in[1] = Fin_r[1] + Fin_r[1] + Fin_r[5] + Fin_r[5];
        scratch_in[2] = Fin_r[0] - Fin_r[4] - Fin_r[4] - Fin_r[7];
        scratch_in[3] = Fin_r[1] - Fin_r[2] - Fin_r[5] - Fin_r[6];
        scratch_in[4] = Fin_r[0] - Fin_r[3] - Fin_r[3] + Fin_r[7];
        scratch_in[5] = - Fin_r[2] - Fin_r[2] + Fin_r[6] + Fin_r[6];
        scratch_in[6] = Fin_r[0] + Fin_r[4] + Fin_r[4] - Fin_r[7];
        scratch_in[7] = Fin_r[1] + Fin_r[2] - Fin_r[5] + Fin_r[6];

        // invert the forward path's twiddle multiplications
        scratch_in[3] /= TW_81_F32;
        scratch_in[7] /= TW_81N_F32;

        Fout_r[0 * in_step] = scratch_in[0] + scratch_in[1];
        Fout_r[4 * in_step] = scratch_in[0] - scratch_in[1];
        Fout_r[1 * in_step] = scratch_in[2] + scratch_in[3];
        Fout_r[5 * in_step] = scratch_in[2] - scratch_in[3];
        Fout_r[2 * in_step] = scratch_in[4] + scratch_in[5];
        Fout_r[6 * in_step] = scratch_in[4] - scratch_in[5];
        Fout_r[3 * in_step] = scratch_in[6] + scratch_in[7];
        Fout_r[7 * in_step] = scratch_in[6] - scratch_in[7];

#if defined(NE10_DSP_RFFT_SCALING)
        // 1/N normalisation of the inverse transform
        Fout_r[0 * in_step] *= one_by_N;
        Fout_r[4 * in_step] *= one_by_N;
        Fout_r[1 * in_step] *= one_by_N;
        Fout_r[5 * in_step] *= one_by_N;
        Fout_r[2 * in_step] *= one_by_N;
        Fout_r[6 * in_step] *= one_by_N;
        Fout_r[3 * in_step] *= one_by_N;
        Fout_r[7 * in_step] *= one_by_N;
#endif

        // next column: input advances by eight reals, output by one
        Fin_r += 8;
        Fout_r ++;
    }
}
164
165NE10_INLINE void ne10_radix4_r2c_c (ne10_fft_cpx_float32_t *Fout,
166 const ne10_fft_cpx_float32_t *Fin,
167 const ne10_int32_t fstride,
168 const ne10_int32_t mstride,
169 const ne10_int32_t nfft)
170{
171 const ne10_int32_t in_step = nfft >> 2;
172 ne10_int32_t f_count;
173
174 ne10_float32_t scratch_in [4];
175 ne10_float32_t scratch_out[4];
176
177 /* real pointers */
178 const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
179 ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
180 Fout_r ++; // always leave the first real empty
181
182 for (f_count = fstride; f_count; f_count --)
183 {
184 scratch_in[0] = Fin_r[0 * in_step];
185 scratch_in[1] = Fin_r[1 * in_step];
186 scratch_in[2] = Fin_r[2 * in_step];
187 scratch_in[3] = Fin_r[3 * in_step];
188
189 // NE10_PRINT_Q_VECTOR(scratch_in);
190
191 NE10_FFT_R2C_4R_RCR(scratch_out,scratch_in);
192
193 // NE10_PRINT_Q_VECTOR(scratch_out);
194
195 Fout_r[0] = scratch_out[0];
196 Fout_r[1] = scratch_out[1];
197 Fout_r[2] = scratch_out[2];
198 Fout_r[3] = scratch_out[3];
199
200 Fin_r ++;
201 Fout_r += 4;
202 }
203}
204
205NE10_INLINE void ne10_radix4_c2r_c (ne10_fft_cpx_float32_t *Fout,
206 const ne10_fft_cpx_float32_t *Fin,
207 const ne10_int32_t fstride,
208 const ne10_int32_t mstride,
209 const ne10_int32_t nfft)
210{
211 ne10_int32_t f_count;
212 const ne10_int32_t in_step = nfft >> 2;
213 ne10_float32_t scratch_in [4];
214 ne10_float32_t scratch_out[4];
215
216 const ne10_float32_t one_by_N = 1.0 / nfft;
217
218 /* real pointers */
219 const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
220 ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
221
222 for (f_count = fstride; f_count; f_count --)
223 {
224 scratch_in[0] = Fin_r[0];
225 scratch_in[1] = Fin_r[1];
226 scratch_in[2] = Fin_r[2];
227 scratch_in[3] = Fin_r[3];
228
229 // NE10_PRINT_Q_VECTOR(scratch_in);
230
231 NE10_FFT_C2R_RCR_4R(scratch_out,scratch_in);
232
233 // NE10_PRINT_Q_VECTOR(scratch_out);
234
235#if defined(NE10_DSP_RFFT_SCALING)
236 scratch_out[0] *= one_by_N;
237 scratch_out[1] *= one_by_N;
238 scratch_out[2] *= one_by_N;
239 scratch_out[3] *= one_by_N;
240#endif
241
242 // store
243 Fout_r[0 * in_step] = scratch_out[0];
244 Fout_r[1 * in_step] = scratch_out[1];
245 Fout_r[2 * in_step] = scratch_out[2];
246 Fout_r[3 * in_step] = scratch_out[3];
247
248 Fin_r += 4;
249 Fout_r ++;
250 }
251}
252
253NE10_INLINE void ne10_radix4_r2c_with_twiddles_first_butterfly_c (ne10_float32_t *Fout_r,
254 const ne10_float32_t *Fin_r,
255 const ne10_int32_t out_step,
256 const ne10_int32_t in_step,
257 const ne10_fft_cpx_float32_t *twiddles)
258{
259 ne10_float32_t scratch_out[4];
260 ne10_float32_t scratch_in [4];
261
262 // load
263 scratch_in[0] = Fin_r[0 * in_step];
264 scratch_in[1] = Fin_r[1 * in_step];
265 scratch_in[2] = Fin_r[2 * in_step];
266 scratch_in[3] = Fin_r[3 * in_step];
267
268 // NE10_PRINT_Q_VECTOR(scratch_in);
269
270 NE10_FFT_R2C_4R_RCR(scratch_out,scratch_in);
271
272 // NE10_PRINT_Q_VECTOR(scratch_out);
273
274 // store
275 Fout_r[ 0] = scratch_out[0];
276 Fout_r[ (out_step << 1) - 1] = scratch_out[1];
277 Fout_r[ (out_step << 1) ] = scratch_out[2];
278 Fout_r[2 * (out_step << 1) - 1] = scratch_out[3];
279}
280
281NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_butterfly_c (ne10_float32_t *Fout_r,
282 const ne10_float32_t *Fin_r,
283 const ne10_int32_t out_step,
284 const ne10_int32_t in_step,
285 const ne10_fft_cpx_float32_t *twiddles)
286{
287 ne10_float32_t scratch [8];
288 ne10_float32_t scratch_in_r [4];
289 ne10_float32_t scratch_out_r[4];
290
291 // load
292 scratch_in_r[0] = Fin_r[0 ];
293 scratch_in_r[1] = Fin_r[1*(out_step<<1)-1];
294 scratch_in_r[2] = Fin_r[1*(out_step<<1) ];
295 scratch_in_r[3] = Fin_r[2*(out_step<<1)-1];
296
297 // NE10_PRINT_Q_VECTOR(scratch_in_r);
298
299 // radix 4 butterfly without twiddles
300 scratch[0] = scratch_in_r[0] + scratch_in_r[3];
301 scratch[1] = scratch_in_r[0] - scratch_in_r[3];
302 scratch[2] = scratch_in_r[1] + scratch_in_r[1];
303 scratch[3] = scratch_in_r[2] + scratch_in_r[2];
304
305 scratch_out_r[0] = scratch[0] + scratch[2];
306 scratch_out_r[1] = scratch[1] - scratch[3];
307 scratch_out_r[2] = scratch[0] - scratch[2];
308 scratch_out_r[3] = scratch[1] + scratch[3];
309
310 // NE10_PRINT_Q_VECTOR(scratch_out_r);
311
312 // store
313 Fout_r[0 * in_step] = scratch_out_r[0];
314 Fout_r[1 * in_step] = scratch_out_r[1];
315 Fout_r[2 * in_step] = scratch_out_r[2];
316 Fout_r[3 * in_step] = scratch_out_r[3];
317
318}
319
/*
 * ne10_radix4_r2c_with_twiddles_other_butterfly_c
 *
 * Middle butterflies of a twiddled radix-4 r2c stage: processes the
 * (out_step/2 - 1) complex bins between the purely-real first butterfly
 * and the half-way last butterfly.  Each iteration multiplies three of
 * the four complex inputs by their twiddles, combines them with
 * NE10_FFT_R2C_CC_CC, and writes one pair of outputs forward (Fout_r)
 * and one pair into the mirrored half of the packed layout (Fout_b,
 * walking backwards) -- exploiting the conjugate symmetry of a real
 * signal's spectrum.
 */
NE10_INLINE void ne10_radix4_r2c_with_twiddles_other_butterfly_c (ne10_float32_t *Fout_r,
                                                                  const ne10_float32_t *Fin_r,
                                                                  const ne10_int32_t out_step,
                                                                  const ne10_int32_t in_step,
                                                                  const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t m_count;
    ne10_float32_t *Fout_b = Fout_r + (((out_step<<1)-1)<<1) - 2; // reversed
    ne10_fft_cpx_float32_t scratch_tw[3], scratch_in[4];

    ne10_fft_cpx_float32_t scratch[4], scratch_out[4];

    for (m_count = (out_step >> 1) - 1; m_count; m_count --)
    {
        // the three twiddles for this bin, out_step apart in the table
        scratch_tw [0] = twiddles[0 * out_step];
        scratch_tw [1] = twiddles[1 * out_step];
        scratch_tw [2] = twiddles[2 * out_step];

        // load four complex inputs (interleaved r,i) spaced in_step apart
        scratch_in[0].r = Fin_r[0 * in_step    ];
        scratch_in[0].i = Fin_r[0 * in_step + 1];
        scratch_in[1].r = Fin_r[1 * in_step    ];
        scratch_in[1].i = Fin_r[1 * in_step + 1];
        scratch_in[2].r = Fin_r[2 * in_step    ];
        scratch_in[2].i = Fin_r[2 * in_step + 1];
        scratch_in[3].r = Fin_r[3 * in_step    ];
        scratch_in[3].i = Fin_r[3 * in_step + 1];

        // radix 4 butterfly with twiddles:
        // scratch[k] = scratch_in[k] * scratch_tw[k-1] (complex multiply)
        scratch[0].r = scratch_in[0].r;
        scratch[0].i = scratch_in[0].i;
        scratch[1].r = scratch_in[1].r * scratch_tw[0].r - scratch_in[1].i * scratch_tw[0].i;
        scratch[1].i = scratch_in[1].i * scratch_tw[0].r + scratch_in[1].r * scratch_tw[0].i;

        scratch[2].r = scratch_in[2].r * scratch_tw[1].r - scratch_in[2].i * scratch_tw[1].i;
        scratch[2].i = scratch_in[2].i * scratch_tw[1].r + scratch_in[2].r * scratch_tw[1].i;

        scratch[3].r = scratch_in[3].r * scratch_tw[2].r - scratch_in[3].i * scratch_tw[2].i;
        scratch[3].i = scratch_in[3].i * scratch_tw[2].r + scratch_in[3].r * scratch_tw[2].i;

        NE10_FFT_R2C_CC_CC (scratch_out, scratch);

        // two outputs forward, two into the mirrored half
        Fout_r[                   0] = scratch_out[0].r;
        Fout_r[                   1] = scratch_out[0].i;
        Fout_r[  (out_step << 1)    ] = scratch_out[1].r;
        Fout_r[  (out_step << 1) + 1] = scratch_out[1].i;
        Fout_b[                   0] = scratch_out[2].r;
        Fout_b[                   1] = scratch_out[2].i;
        Fout_b[- (out_step << 1)    ] = scratch_out[3].r;
        Fout_b[- (out_step << 1) + 1] = scratch_out[3].i;

        // update pointers: forward cursors advance, mirror cursor retreats
        Fin_r  += 2;
        Fout_r += 2;
        Fout_b -= 2;
        twiddles ++;
    }
}
382
/*
 * ne10_radix4_c2r_with_twiddles_other_butterfly_c
 *
 * Middle butterflies of a twiddled radix-4 c2r stage; the exact inverse of
 * ne10_radix4_r2c_with_twiddles_other_butterfly_c.  Reads one forward pair
 * (Fin_r) and one mirrored pair (Fin_b, walking backwards through the
 * conjugate-symmetric half of the packed layout), inverts the combine step
 * with NE10_FFT_C2R_CC_CC, then undoes the twiddle multiplication by
 * multiplying with the *conjugate* twiddle (note the flipped signs versus
 * the forward path).
 */
NE10_INLINE void ne10_radix4_c2r_with_twiddles_other_butterfly_c (ne10_float32_t *Fout_r,
                                                                  const ne10_float32_t *Fin_r,
                                                                  const ne10_int32_t out_step,
                                                                  const ne10_int32_t in_step,
                                                                  const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t m_count;
    const ne10_float32_t *Fin_b = Fin_r + (((out_step<<1)-1)<<1) - 2; // reversed
    ne10_fft_cpx_float32_t scratch_tw [3],
                           scratch    [8],
                           scratch_in [4],
                           scratch_out[4];

    for (m_count = (out_step >> 1) - 1; m_count; m_count --)
    {
        // same three twiddles the forward stage used for this bin
        scratch_tw[0] = twiddles[0 * out_step];
        scratch_tw[1] = twiddles[1 * out_step];
        scratch_tw[2] = twiddles[2 * out_step];

        // forward pair
        scratch_in[0].r = Fin_r[0];
        scratch_in[0].i = Fin_r[1];

        // mirrored pair
        scratch_in[1].r = Fin_b[0];
        scratch_in[1].i = Fin_b[1];

        scratch_in[2].r = Fin_r[(out_step<<1) + 0];
        scratch_in[2].i = Fin_r[(out_step<<1) + 1];

        scratch_in[3].r = Fin_b[-(out_step<<1) + 0];
        scratch_in[3].i = Fin_b[-(out_step<<1) + 1];

        // // inverse of "result"
        NE10_FFT_C2R_CC_CC (scratch, scratch_in);

        // inverse of "multiply twiddles": multiply by conj(tw)
        scratch_out[0] = scratch[0];

        scratch_out[1].r = scratch[1].r * scratch_tw[0].r + scratch[1].i * scratch_tw[0].i;
        scratch_out[1].i = scratch[1].i * scratch_tw[0].r - scratch[1].r * scratch_tw[0].i;

        scratch_out[2].r = scratch[2].r * scratch_tw[1].r + scratch[2].i * scratch_tw[1].i;
        scratch_out[2].i = scratch[2].i * scratch_tw[1].r - scratch[2].r * scratch_tw[1].i;

        scratch_out[3].r = scratch[3].r * scratch_tw[2].r + scratch[3].i * scratch_tw[2].i;
        scratch_out[3].i = scratch[3].i * scratch_tw[2].r - scratch[3].r * scratch_tw[2].i;

        // store four complex results, in_step apart (interleaved r,i)
        Fout_r[0 * in_step    ] = scratch_out[0].r;
        Fout_r[0 * in_step + 1] = scratch_out[0].i;
        Fout_r[1 * in_step    ] = scratch_out[1].r;
        Fout_r[1 * in_step + 1] = scratch_out[1].i;
        Fout_r[2 * in_step    ] = scratch_out[2].r;
        Fout_r[2 * in_step + 1] = scratch_out[2].i;
        Fout_r[3 * in_step    ] = scratch_out[3].r;
        Fout_r[3 * in_step + 1] = scratch_out[3].i;

        // update pointers: forward cursors advance, mirror cursor retreats
        Fin_r  += 2;
        Fout_r += 2;
        Fin_b  -= 2;
        twiddles ++;
    }
}
450
451NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_butterfly_c (ne10_float32_t *Fout_r,
452 const ne10_float32_t *Fin_r,
453 const ne10_int32_t out_step,
454 const ne10_int32_t in_step,
455 const ne10_fft_cpx_float32_t *twiddles)
456{
457 ne10_float32_t scratch_in [4];
458 ne10_float32_t scratch_out[4];
459
460 scratch_in[0] = Fin_r[0 * in_step];
461 scratch_in[1] = Fin_r[1 * in_step];
462 scratch_in[2] = Fin_r[2 * in_step];
463 scratch_in[3] = Fin_r[3 * in_step];
464
465 // NE10_PRINT_Q_VECTOR(scratch_in);
466
467 NE10_FFT_R2C_4R_CC(scratch_out,scratch_in);
468
469 // NE10_PRINT_Q_VECTOR(scratch_out);
470
471 Fout_r[ 0] = scratch_out[0];
472 Fout_r[ 1] = scratch_out[1];
473 Fout_r[ (out_step << 1) ] = scratch_out[2];
474 Fout_r[ (out_step << 1) + 1] = scratch_out[3];
475}
476
477NE10_INLINE void ne10_radix4_c2r_with_twiddles_last_butterfly_c (ne10_float32_t *Fout_r,
478 const ne10_float32_t *Fin_r,
479 const ne10_int32_t out_step,
480 const ne10_int32_t in_step,
481 const ne10_fft_cpx_float32_t *twiddles)
482{
483 // inverse operation of ne10_radix4_r2c_with_twiddles_last_butterfly_c
484 ne10_float32_t scratch_in [4];
485 ne10_float32_t scratch_out[4];
486
487 // load
488 scratch_in[0] = Fin_r[ 0];
489 scratch_in[1] = Fin_r[ 1];
490 scratch_in[2] = Fin_r[ (out_step << 1) ];
491 scratch_in[3] = Fin_r[ (out_step << 1) + 1];
492
493 // NE10_PRINT_Q_VECTOR(scratch_in);
494
495 NE10_FFT_C2R_CC_4R(scratch_out,scratch_in);
496
497 // NE10_PRINT_Q_VECTOR(scratch_out);
498
499 // store
500 Fout_r[0 * in_step] = scratch_out[0];
501 Fout_r[1 * in_step] = scratch_out[1];
502 Fout_r[2 * in_step] = scratch_out[2];
503 Fout_r[3 * in_step] = scratch_out[3];
504}
505
506NE10_INLINE void ne10_radix4_r2c_with_twiddles_c (ne10_fft_cpx_float32_t *Fout,
507 const ne10_fft_cpx_float32_t *Fin,
508 const ne10_int32_t fstride,
509 const ne10_int32_t mstride,
510 const ne10_int32_t nfft,
511 const ne10_fft_cpx_float32_t *twiddles)
512{
513 ne10_int32_t f_count;
514 const ne10_int32_t in_step = nfft >> 2;
515 const ne10_int32_t out_step = mstride;
516
517 const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
518 ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
519 const ne10_fft_cpx_float32_t *tw;
520
521 Fout_r ++;
522 Fin_r ++;
523
524 for (f_count = fstride; f_count; f_count --)
525 {
526 tw = twiddles;
527
528 // first butterfly
529 ne10_radix4_r2c_with_twiddles_first_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
530
531 tw ++;
532 Fin_r ++;
533 Fout_r ++;
534
535 // other butterfly
536 ne10_radix4_r2c_with_twiddles_other_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
537
538 // update Fin_r, Fout_r, twiddles
539 tw += ( (out_step >> 1) - 1);
540 Fin_r += 2 * ( (out_step >> 1) - 1);
541 Fout_r += 2 * ( (out_step >> 1) - 1);
542
543 // last butterfly
544 ne10_radix4_r2c_with_twiddles_last_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
545 tw ++;
546 Fin_r ++;
547 Fout_r ++;
548
549 Fout_r += 3 * out_step;
550 } // f_count
551}
552
553NE10_INLINE void ne10_radix4_c2r_with_twiddles_c (ne10_fft_cpx_float32_t *Fout,
554 const ne10_fft_cpx_float32_t *Fin,
555 const ne10_int32_t fstride,
556 const ne10_int32_t mstride,
557 const ne10_int32_t nfft,
558 const ne10_fft_cpx_float32_t *twiddles)
559{
560 ne10_int32_t f_count;
561 const ne10_int32_t in_step = nfft >> 2;
562 const ne10_int32_t out_step = mstride;
563
564 const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
565 ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
566 const ne10_fft_cpx_float32_t *tw;
567
568 for (f_count = fstride; f_count; f_count --)
569 {
570 tw = twiddles;
571
572 // first butterfly
573 ne10_radix4_c2r_with_twiddles_first_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
574
575 tw ++;
576 Fin_r ++;
577 Fout_r ++;
578
579 // other butterfly
580 ne10_radix4_c2r_with_twiddles_other_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
581
582 // update Fin_r, Fout_r, twiddles
583 tw += ( (out_step >> 1) - 1);
584 Fin_r += 2 * ( (out_step >> 1) - 1);
585 Fout_r += 2 * ( (out_step >> 1) - 1);
586
587 // last butterfly
588 ne10_radix4_c2r_with_twiddles_last_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
589 tw ++;
590 Fin_r ++;
591 Fout_r ++;
592
593 Fin_r += 3 * out_step;
594 } // f_count
595}
596
597NE10_INLINE void ne10_mixed_radix_r2c_butterfly_float32_c (
599 const ne10_fft_cpx_float32_t * Fin,
600 const ne10_int32_t * factors,
601 const ne10_fft_cpx_float32_t * twiddles,
602 ne10_fft_cpx_float32_t * buffer)
603{
604 // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
605
606 ne10_int32_t fstride, mstride, nfft;
607 ne10_int32_t radix;
608 ne10_int32_t stage_count;
609
610 // init fstride, mstride, radix, nfft
611 stage_count = factors[0];
612 fstride = factors[1];
613 mstride = factors[ (stage_count << 1) - 1 ];
614 radix = factors[ stage_count << 1 ];
615 nfft = radix * fstride;
616
617 // PRINT_STAGE_INFO;
618
619 if (radix == 2)
620 {
621 // combine one radix-4 and one radix-2 into one radix-8
622 mstride <<= 2;
623 fstride >>= 2;
624 twiddles += 6; // (4-1) x 2
625 stage_count --;
626 }
627
628 if (stage_count % 2 == 0)
629 {
630 ne10_swap_ptr (buffer, Fout);
631 }
632
633 // the first stage
634 if (radix == 2) // length of FFT is 2^n (n is odd)
635 {
636 // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
637 ne10_radix8_r2c_c (Fout, Fin, fstride, mstride, nfft);
638 }
639 else if (radix == 4) // length of FFT is 2^n (n is even)
640 {
641 // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
642 ne10_radix4_r2c_c (Fout, Fin, fstride, mstride, nfft);
643 }
644 // end of first stage
645
646 // others
647 for (; fstride > 1;)
648 {
649 fstride >>= 2;
650 ne10_swap_ptr (buffer, Fout);
651
652 // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
653 ne10_radix4_r2c_with_twiddles_c (Fout, buffer, fstride, mstride, nfft, twiddles);
654 twiddles += 3 * mstride;
655 mstride <<= 2;
656
657 } // other stage
658}
659
660NE10_INLINE void ne10_mixed_radix_c2r_butterfly_float32_c (
662 const ne10_fft_cpx_float32_t * Fin,
663 const ne10_int32_t * factors,
664 const ne10_fft_cpx_float32_t * twiddles,
665 ne10_fft_cpx_float32_t * buffer)
666{
667 // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
668
669 ne10_int32_t fstride, mstride, nfft;
670 ne10_int32_t radix;
671 ne10_int32_t stage_count;
672
673 // init fstride, mstride, radix, nfft
674 stage_count = factors[0];
675 fstride = factors[1];
676 mstride = factors[ (stage_count << 1) - 1 ];
677 radix = factors[ stage_count << 1 ];
678 nfft = radix * fstride;
679
680 // fstride, mstride for the last stage
681 fstride = 1;
682 mstride = nfft >> 2;
683 // PRINT_STAGE_INFO;
684
685 if (radix == 2)
686 {
687 // combine one radix-4 and one radix-2 into one radix-8
688 stage_count --;
689 }
690
691 if (stage_count % 2 == 1)
692 {
693 ne10_swap_ptr (buffer, Fout);
694 }
695
696 // last butterfly -- inversed
697 if (stage_count > 1)
698 {
699 twiddles -= 3 * mstride;
700 // PRINT_STAGE_INFO;
701 // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
702 ne10_radix4_c2r_with_twiddles_c (buffer, Fin, fstride, mstride, nfft, twiddles);
703 fstride <<= 2;
704 mstride >>= 2;
705 stage_count --;
706 }
707
708 // others but the last stage
709 for (; stage_count > 1;)
710 {
711 twiddles -= 3 * mstride;
712 // PRINT_STAGE_INFO;
713 // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
714 ne10_radix4_c2r_with_twiddles_c (Fout, buffer, fstride, mstride, nfft, twiddles);
715 fstride <<= 2;
716 mstride >>= 2;
717 stage_count --;
718 ne10_swap_ptr (buffer, Fout);
719 } // other stage
720
721 // first stage -- inversed
722 if (radix == 2) // length of FFT is 2^n (n is odd)
723 {
724 mstride >>= 1;
725
726 // PRINT_STAGE_INFO;
727 // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
728 ne10_radix8_c2r_c (Fout, buffer, fstride, mstride, nfft);
729 }
730 else if (radix == 4) // length of FFT is 2^n (n is even)
731 {
732 // PRINT_STAGE_INFO;
733 // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
734 ne10_radix4_c2r_c (Fout, buffer, fstride, mstride, nfft);
735 }
736}
737
745{
747 ne10_int32_t ncfft = nfft >> 1;
748 ne10_int32_t result;
749
750 ne10_uint32_t memneeded = sizeof (ne10_fft_r2c_state_float32_t)
751 + sizeof (ne10_fft_cpx_float32_t) * nfft /* buffer*/
752 + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* r_factors */
753 + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* r_factors_neon */
754 + sizeof (ne10_fft_cpx_float32_t) * nfft /* r_twiddles */
755 + sizeof (ne10_fft_cpx_float32_t) * nfft/4 /* r_twiddles_neon */
756 + sizeof (ne10_fft_cpx_float32_t) * (12 + nfft/32*12) /* r_super_twiddles_neon */
757 + NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment*/
758
759 st = (ne10_fft_r2c_cfg_float32_t) NE10_MALLOC (memneeded);
760
761 if (!st)
762 {
763 return st;
764 }
765
766 ne10_int32_t i,j;
768 const ne10_float32_t pi = NE10_PI;
769 ne10_float32_t phase1;
770
771 st->nfft = nfft;
772
773 uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_r2c_state_float32_t);
774 NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
775
776 st->buffer = (ne10_fft_cpx_float32_t*) address;
777 st->r_twiddles = st->buffer + nfft;
778 st->r_factors = (ne10_int32_t*) (st->r_twiddles + nfft);
779 st->r_twiddles_neon = (ne10_fft_cpx_float32_t*) (st->r_factors + (NE10_MAXFACTORS * 2));
780 st->r_factors_neon = (ne10_int32_t*) (st->r_twiddles_neon + nfft/4);
781 st->r_super_twiddles_neon = (ne10_fft_cpx_float32_t*) (st->r_factors_neon + (NE10_MAXFACTORS * 2));
782
783 if (nfft<16)
784 {
785 return st;
786 }
787
788 // factors and twiddles for rfft C
789 ne10_factor (nfft, st->r_factors, NE10_FACTOR_DEFAULT);
790
791 // backward twiddles pointers
792 st->r_twiddles_backward = ne10_fft_generate_twiddles_float32 (st->r_twiddles, st->r_factors, nfft);
793
794 // factors and twiddles for rfft neon
795 result = ne10_factor (nfft/4, st->r_factors_neon, NE10_FACTOR_DEFAULT);
796 if (result == NE10_ERR)
797 {
798 return st;
799 }
800
801 // Twiddle table is transposed here to improve cache access performance.
802 st->r_twiddles_neon_backward = ne10_fft_generate_twiddles_transposed_float32 (
803 st->r_twiddles_neon,
804 st->r_factors_neon,
805 nfft/4);
806
807 // nfft/4 x 4
808 tw = st->r_super_twiddles_neon;
809 for (i = 1; i < 4; i ++)
810 {
811 for (j = 0; j < 4; j++)
812 {
813 phase1 = - 2 * pi * ( (ne10_float32_t) (i * j) / nfft);
814 tw[4*i-4+j].r = (ne10_float32_t) cos (phase1);
815 tw[4*i-4+j].i = (ne10_float32_t) sin (phase1);
816 }
817 }
818
819 ne10_int32_t k,s;
820 // [nfft/32] x [3] x [4]
821 // k s j
822 for (k=1; k<nfft/32; k++)
823 {
824 // transposed
825 for (s = 1; s < 4; s++)
826 {
827 for (j = 0; j < 4; j++)
828 {
829 phase1 = - 2 * pi * ( (ne10_float32_t) ((k*4+j) * s) / nfft);
830 tw[12*k+j+4*(s-1)].r = (ne10_float32_t) cos (phase1);
831 tw[12*k+j+4*(s-1)].i = (ne10_float32_t) sin (phase1);
832 }
833 }
834 }
835 return st;
836}
837
849 ne10_float32_t *fin,
851{
852 ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
853
854 switch(cfg->nfft)
855 {
856 case 8:
857 ne10_radix8_r2c_c( (ne10_fft_cpx_float32_t*) fout, ( ne10_fft_cpx_float32_t*) fin,1,1,8);
858 break;
859 default:
860 ne10_mixed_radix_r2c_butterfly_float32_c (
861 fout,
863 cfg->r_factors,
864 cfg->r_twiddles,
865 tmpbuf);
866 break;
867 }
868
869 fout[0].r = fout[0].i;
870 fout[0].i = 0.0f;
871 fout[(cfg->nfft) >> 1].i = 0.0f;
872}
873
884void ne10_fft_c2r_1d_float32_c (ne10_float32_t *fout,
887{
888 ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
889
890 fin[0].i = fin[0].r;
891 fin[0].r = 0.0f;
892 switch(cfg->nfft)
893 {
894 case 8:
895 ne10_radix8_c2r_c( (ne10_fft_cpx_float32_t*) fout, ( ne10_fft_cpx_float32_t*) &fin[0].i,1,1,8);
896 break;
897 default:
898 ne10_mixed_radix_c2r_butterfly_float32_c (
900 (ne10_fft_cpx_float32_t*)&fin[0].i, // first real is moved to first image
901 cfg->r_factors,
902 cfg->r_twiddles_backward,
903 tmpbuf);
904 break;
905 }
906 fin[0].r = fin[0].i;
907 fin[0].i = 0.0f;
908}
909
914#endif // NE10_UNROLL_LEVEL
/*
 * Public entry points defined in this file:
 *
 *   ne10_fft_r2c_cfg_float32_t ne10_fft_alloc_r2c_float32 (ne10_int32_t nfft)
 *       User-callable function to allocate all necessary storage space
 *       for the FFT (r2c/c2r).
 *
 *   void ne10_fft_r2c_1d_float32_c (ne10_fft_cpx_float32_t *fout,
 *                                   ne10_float32_t *fin,
 *                                   ne10_fft_r2c_cfg_float32_t cfg)
 *       Mixed radix-2/4 FFT (real to complex) of float (32-bit) data.
 *
 *   void ne10_fft_c2r_1d_float32_c (ne10_float32_t *fout,
 *                                   ne10_fft_cpx_float32_t *fin,
 *                                   ne10_fft_r2c_cfg_float32_t cfg)
 *       Mixed radix-2/4 IFFT (complex to real) of float (32-bit) data.
 */