Well after alot of hair pulling and only one migrane headache, I have a tested and fully opreating FFT algorithm for everyone to enjoy. I plan to use this with some DSP work using the microphone but for anyone else's use, here it is:
[size=9]

;_______________________________________
; FFT Ver 1.2
; by NaN / Jaymeson Trudgen May 12, 2003
;
; Written exclusively for the MASM32 assembly package maintained by Hutch--
; This algorithm is based on the Danielson-Lanczos algorithm.
;
; Inputs:
;
; lp_data - A complex array of data to forward or resverse fourier transform
; Real values are indexed evenly (0,2,4,6,..,N-2)
; Imaginary values are index oddly (1,3,5,7,..,N-1)
; All array values are QWORD length, hence a complex value is 16 bytes.
;
; nn - Number of COMPLEX data points. For data array lenghts of N, this
; Value MUST be N/2 AND EQUAL TO 2^k where k = positive integer
; (1,2,4,8,16,32,64,....,etc). Note: Padd data with zeros to fit
; to one of these array sizes if not an exact fit.
;
; isign - THIS TERM IN NOT CHECK FOR ACCURACY AND CAN CORRUPT RESULTS IF
; ANY OTHER VALUE THAN +1 or -1 IS PROVIDED.
; +1 = Forward FFT (time domain data -> frequency domain data)
; -1 = Reverse FFT (frequency domain data -> time domain data)
;
; Normalize - If this term is +1, then the output from a forward FFT will have
; its energy content normalized to the width of the frequency
; domain. This should be done for purely scientific work, however
; some DSP work may not require this at some stages. For this
; reason control has been provided to keep the algorithm as fast
; as possible.
;
;
; Visit the first and best Win32ASM community:
;
; [url]http://www.asmcommunity.net/board/[/url]
;
;_______________________________________
FFT PROTO :DWORD, :DWORD, :DWORD, :DWORD

.data
NaNTwoPie dq -6.28318530717959

.code

FFT PROC USES EBX ESI EDI lp_data:DWORD, nn:DWORD, isign:DWORD, Normalize:DWORD
LOCAL fpu_temp :DWORD
LOCAL _n :DWORD
LOCAL _n1 :DWORD
LOCAL mmax :DWORD
LOCAL _m :DWORD
LOCAL istep :DWORD
LOCAL wr :QWORD
LOCAL wpr :QWORD
LOCAL wpi :QWORD
LOCAL wi :QWORD
LOCAL tempr :QWORD
LOCAL tempi :QWORD

finit

mov eax, nn ; n=nn << 1;
shl eax, 1
mov _n, eax

mov _n1, eax
dec _n1

xor esi, esi ; j=1;

; This is the bit-reversal section of the routine.

mov ebx, esi ; for (i=1;i<n;i+=2) NOTE: EBX == i = 1

.while( ebx < _n1 )

.if( esi > ebx ) ; if (j > i)

mov edx, lp_data
fld QWORD PTR [edx+esi*8] ; SWAP(data[j],data[i])
fld QWORD PTR [edx+ebx*8]
fstp QWORD PTR [edx+esi*8]
fstp QWORD PTR [edx+ebx*8]

inc esi
inc ebx

fld QWORD PTR [edx+esi*8] ; SWAP(data[j+1],data[i+1]);
fld QWORD PTR [edx+ebx*8]
fstp QWORD PTR [edx+esi*8]
fstp QWORD PTR [edx+ebx*8]

dec esi
dec ebx
.endif

mov edi, _n ; m=n >> 1;
shr edi, 1 ; edi == m


.while ((EDI >= 1)&&(ESI >= EDI)) ; while (m >= 2 && j > m)
sub esi, edi ; j -= m;
shr edi, 1 ; m >>= 1;
.endw
add esi, edi ; j += m;

inc ebx ; i+=2
inc ebx
.endw

; Here begins the Danielson-Lanczos section of the routine.

xor edi, edi
inc edi
inc edi ; EDI == mmax=2;


.while( _n > edi) ; while (n > mmax) Outer loop executed log2 nn times.

mov mmax, edi

mov eax, edi ; istep=mmax << 1;
shl eax, 1
mov istep, eax

lea edx, isign
fild DWORD PTR [edx]
fmul QWORD PTR [NaNTwoPie] ; theta=isign*(-6.28318530717959/mmax);
lea edx, mmax
fild DWORD PTR [edx]
fdiv

fld st ; st0 = st1 = theta
fsin ; st0 = sin(theta)
lea edx, wpi
fstp QWORD PTR [edx] ; wpi = sin(theta) (and pop'd)

xor eax, eax ; wtemp=sin(0.5*theta)
inc eax
inc eax
mov fpu_temp, eax
lea edx, fpu_temp
fild DWORD PTR [edx]
fdiv ; st0 - 0.5*theta
fsin
; wpr = -2.0*wtemp*wtemp;
fld st ; st0 = st1 = wtemp
fmul ; st0 = wtemp^2
fldz ; st0 = 0.0
lea edx, wi
fst QWORD PTR [edx] ; wi = 0.0
fld1
lea edx, wr
fst QWORD PTR [edx] ; wr = 1.0
fsub ; st0 = -1.0
fld1
fsub ; st0 = -2.0
fmul ; st0 = -2.0*wtemp^2
lea edx, wpr
fstp QWORD PTR [edx] ; st0 = wpr (and pop to finish)

; EDI = mmax, EBX == i, ESI = j
xor eax, eax
mov _m, eax ; Here are the two nested inner loops.
.while( _m < edi ) ; (m=1;m<mmax;m+=2)
mov ebx, _m ; i = m
inc ebx
.while( ebx <= _n1) ; (i=m;i<=n;i+=istep)

mov esi, ebx ; j = i+mmax
add esi, edi

dec esi
dec ebx

mov edx, lp_data
fld QWORD PTR [edx+esi*8] ; fld data[j]

inc esi
fld QWORD PTR [edx+esi*8] ; fld data[j+1]
dec esi

lea edx, wr
fld QWORD PTR [edx] ; fld wr
lea edx, wi
fld QWORD PTR [edx] ; fld wi

fld st(1) ; wr
fld st(4) ; data[j]
fmul
fld st(1) ; wi
fld st(4) ; data[j+1]
fmul
fsub ; st0 = wr*data[j] - wi*data[j+1]
lea edx, tempr
fstp QWORD PTR [edx] ; tempr = wr*data[j] - wi*data[j+1]

fld st(1) ; wr
fld st(3) ; data[j+1]
fmul
fld st(1) ; wi
fld st(5) ; data[j]
fmul
fadd
lea edx, tempi
fstp QWORD PTR [edx] ; tempi = wr*data[j+1] + wi*data[j]

fstp st ; Empty the stack
fstp st
fstp st
fstp st

mov edx, lp_data
fld QWORD PTR [edx+ebx*8] ; fld data[i]
lea ecx, tempr
fld QWORD PTR [ecx] ; fld tempr
fsub
fstp QWORD PTR [edx+esi*8] ; data[j] = data[i]-tempr

inc esi
inc ebx

fld QWORD PTR [edx+ebx*8] ; fld data[i+1]
lea ecx, tempi
fld QWORD PTR [ecx] ; fld tempi
fsub
fstp QWORD PTR [edx+esi*8] ; data[j+1] = data[i+1]-tempi

dec esi
dec ebx

lea ecx, tempr
fld QWORD PTR [ecx] ; fld tempr
fld QWORD PTR [edx+ebx*8] ; fld data[i]
fadd
fstp QWORD PTR [edx+ebx*8] ; data[i] += tempr

inc ebx
lea ecx, tempi
fld QWORD PTR [ecx] ; fld tempi
fld QWORD PTR [edx+ebx*8] ; fld data[i+1]
fadd
fstp QWORD PTR [edx+ebx*8] ; data[i+1] += tempi
dec ebx

inc ebx
inc esi

add ebx, istep ; i+=istep
.endw

lea edx, wr
fld QWORD PTR [edx]
lea edx, wpr
fld QWORD PTR [edx]
lea edx, wi
fld QWORD PTR [edx]
lea edx, wpi
fld QWORD PTR [edx]

fld st(3) ; wr
fld st(3) ; wpr
fmul
fld st(2) ; wi
fld st(2) ; wpi
fmul
fsub
fld st(4)
fadd ; st0 = wr*wpr-wi*wpi+wr
lea edx, wr
fstp QWORD PTR [edx]

fld st(1) ; wi
fld st(3) ; wpr
fmul
fld st(4) ; wr
fld st(2) ; wpi
fmul
fld st(3)
fadd ; wi
fadd ; st0 = wi*wpr+wr*wpi+wi
lea edx, wi
fstp QWORD PTR [edx]

fstp st ; cleanup the stack
fstp st
fstp st
fstp st

inc _m ; m+=2
inc _m
.endw

mov edi, istep ; mmax = istep
.endw

xor eax, eax
inc eax
.if( isign == eax )&&( Normalize == 1 )

lea edx, nn
fld1
fild DWORD PTR [edx] ; load scale factor
fdiv

mov edx, lp_data
xor ebx, ebx

.while( ebx < _n );

fld QWORD PTR [edx+ebx*8]
fld st(1)
fmul
fstp QWORD PTR [edx+ebx*8]

inc ebx
.endw
fstp st

.endif

ret
FFT ENDP [/size]


I know nothing about optomization. If someone wishes to make it run fasters I can supply them with suitable test code to ensure the algorithm's integrity is correct. If you do optomize it and prove to improve performace, place your name at the top for your hard work, increment the sub version number, and repost the solution.

Happy DSP'n :tongue:
:alright:
NaN
Posted on 2003-05-14 21:06:37 by NaN
Maybe you could explain a little bit what FFT is for some of the mentally retarded folks here. Myself included.
Posted on 2003-05-14 23:13:16 by iblis
Any analog signal, digital or not, can be expressed as a sum of sines and cosines (which incremently differs by frequency), weather its an impulse (sharp spike in a sea of silence), or a sine wave itself.

The FFT (fast fourier transform) is a fast algorithm that takes a signal pattern and decomposes it into its frequency components and their respective phase components for each frequency.

The IFFT (inverse fast fourier transform) does the opposite, taking freqency components and their phase and recreates a signal pattern.

A good application is in sound processing, but it can be used in many other fields such as image processing (object detection).

The basic theory comes from Eulers Identity:

e^j@ = cos(@) + j*sin(@)

where @=2*pi*f*t (time domain)


My test code for the FFT had a data array of points defined by:

data(x) = sin( 2*pi*5*x / 63)

To make a total of 64 data points, which oscillated and produced 5 complete sine-waves (0->63 points in all):
Posted on 2003-05-15 00:52:24 by NaN
The resulting data array is processed by the FFT, and the array is repopulated with its frequency and phase component values.

Note each frequency location can be expressed by the Euler identity:

x = Amplitude of signal at a given frequency
y = phase of the signal at a given frequency

Individual frequency component 'z(f)' that is part of the fourier series to make the original signal is the:

z(f) = x * e^j(2*pi*f*t + y)
z(f) = x * [ cos( 2*pi*f*t + y) + j.sin(2*pi*f*t + y) ]

If you look this over carefully you will see that the two important factors for each frequency component is magnitude 'x' and phase 'y' for each frequency 'f'. Below is the FFT output plot of only the frequency magnitudes of the origional signal posted above:
Posted on 2003-05-15 01:01:42 by NaN
It should be noted that the NYQUEST frequency is defined as 1/2 the sampling frequency. This translates to being half the data width. So our 64 point array has a nyquist frequency at 32.

Anything above this frequency is ALIASED to a lower frequency. This is a phenomenon easily observed in car tires rotating on a highway. How some how they seem to appear to stop rotating, and then start to rotate backwards. We know they are rotating in one direction, and far faster than what we see! However, our eyes can only sample up to a maximum frequency. After this nyquist point, everything else appears to be less again, ultimately falling back to an apparent stand-still of the rolling tire.

As well, all frequencies has an equal but opposit conjegate. In math x^2 +25 =0, when we solve for the roots, we get +/- 5j. All frequencies other than CONSTANT (0 Hertz) is on the j axis. So there is a point at +5 and -5 from the math equation. Just as equally, the Magnitude plot shows the the +5j (at the start at 5Hz), and the -5j (near end at 59Hz (64-5)).

Basically all real signal values (non-mathematicaly made) will have a mirrored relationship, pivoting on the Nyquest Frequency Location when FFT'd.

Looking at this magnitude plot then, i know that there is only one frequency in the source signal. 5Hz. Because this is where all the energy in the source signal is located. All other frequencies have very little energy in their signals. So low that it negligable and considered not there.

Having a Magnitude still doesnt tell me the signal behavior completely. We also need to see the Phase for each magnitude:
Posted on 2003-05-15 01:15:35 by NaN
The phase relationship shows that the signal is and ODD signal. This is because the average value is == 0! (remember you high school math here ~ even/odd relationships).

sin(-x) == -sin(x) (( Odd behavior ))
cos(-x) == cos(x) (( Even behavior))

The sign out front is the phase. Having a negative phase, and a positive phase, for x, and -x show us that its a SIN function, not a cosine function. The other plot showed us the magnitude for this function.


Now your question is why?

I can easily take a sample of sound data and add digital effects, or simply pitch shift the signal by moving the frequency vaules up scale or down scale in magnitude and phase, and then IFFT to recreat the modified time valued signal.

This is similar to the "Hello Sydney" voice box used in the movie SCREAM. It remodulated the voice to a lower pitch. Essencially doing an FFT, adjusting the data down in frequency (cut & paste like work) and then doing the IFFT to build a new signal and play it out (which you hear as a lower voice).

In image processing, the frequency/phase patterns will act like a finger-print, and varry depending on the image contents. This condenses alot of data into more manageable information for such things...

The list goes on, and on.
I hope this helps..
:alright:
NaN
Posted on 2003-05-15 01:27:48 by NaN
By the way, your winamp displays a magnitude plot of its MP3. Only it shows you up to the nyquist frequency, and not beyond (the mirrored aliased frequencies).

Just to give you a quick example...
Posted on 2003-05-15 02:15:54 by NaN
Wow, thanks NaN. I still have no clue what you're talking about, but the last part about modifying sound waves made enough sense. :alright:

Should I ever have the inclination to mess with sound data, I'll be sure to check back here.
Posted on 2003-05-15 03:16:40 by iblis
Ya it pretty cool stuff.. (math pays off if you stick with it long enought to learn what you can really do ;) )

As well, the FFT fingerprint patterns could be normalized even further and feed into a Neural Network to produce "thought based" results. Ofcourse you would need to train it multiple times to learn what its looking for.. but this would be crude first step into recognition systems.

:NaN:
Posted on 2003-05-15 09:22:38 by NaN
NaN,
Thank you for post the fft source.

FYI:
There are some suggestions for optimizatiom but that imposes some restrictions.
It is possible to get rid of the sin and cos calculation with a small table for
sin((+/-)2pi/N) and cos((+/-)2pi/N), as N is necessarily a power of 2 the table
would have just 56 elements for a limit of 2<=N<=16k, 60 elements for a limit of
N=32k and so forth.

These and others optimizations were persented in the "Faster FFTs" article on Dr.
Dobb's magazine February 1995 in the column "Algorithm Alley". The article presents
the C and assembler implementation - http://www.ddj.com/ftp/1995/1995.02/ (asm only)

Regards,

RValois.
Posted on 2003-05-15 13:49:45 by RValois
Great work.
You made NRiC code work, and I admire you. :) I think anyone who make NRiC code sane (in any language) deserves a "stand-up ovation".

Maybe this is a minor point, but...
Why would you do fld/fld/fmul or similar two load and 'op and pop reg stack' sequence, when they do not have distinctive advantage? For example, in the innermost loop,


fld QWORD PTR [ecx] ; fld tempi
fld QWORD PTR [edx+ebx*8] ; fld data[i+1]
fadd

could be done by a shorter code (2 byte shorter, I guess), like


fld qword ptr [ecx]
fadd qword ptr [edx+8*ebx]

On a P6, under ideal conditions, they run at the same speed. On P5, all FPU instructions except fxch are not pairable, so the latter is slightly faster in decoding.
Posted on 2003-05-15 18:05:30 by Starless

Great work.
You made NRiC code work, and I admire you. :) I think anyone who make NRiC code sane (in any language) deserves a "stand-up ovation".

Maybe this is a minor point, but...
Why would you do fld/fld/fmul or similar two load and 'op and pop reg stack' sequence, when they do not have distinctive advantage? For example, in the innermost loop,


fld QWORD PTR [ecx] ; fld tempi
fld QWORD PTR [edx+ebx*8] ; fld data[i+1]
fadd

could be done by a shorter code (2 byte shorter, I guess), like


fld qword ptr [ecx]
fadd qword ptr [edx+8*ebx]

On a P6, under ideal conditions, they run at the same speed. On P5, all FPU instructions except fxch are not pairable, so the latter is slightly faster in decoding.


Ya NaN you can save a lot of time from doing it the way Starless suggested trust me I know first hand.
Posted on 2003-05-16 17:25:57 by x86asm
Well I wanted to do DSP also, but I'm not sure what all these fancy symbols mean and represent I mean what does "sigma" mean?! I have a DSP book by the name of "BASIC Digital Signal Processing" and in one case I believe for a DFT it has N/2-1 on top and N=0 on the bottom can any of you guys help me?
Posted on 2003-05-16 17:29:51 by x86asm
Is it a symbol that looks like a cross between a '&' and a 'S'. If so its most likely the the symbol used to represent an Impulse (value 1 at time 0, and value 0 for all other times).

However, im not sure if your refering to a sum ('E' like symbol), since you mentioned a start N=0 to N=N/2. That sum is like a FOR loop:

 xor edx, edx

mov n, edx
.while n<=N/2
add edx, (sigma term as a function of n)
inc n
.endw


:NaN:
Posted on 2003-05-16 18:13:56 by NaN
I've taken a shot at revising the code in an attempt for some cleanup and minor optimizations. These are generic only relative to pentitium (no specific target) in theory only. I haven't benchmarked or compared against previous -- someone may want to do this.

Please note-- I changed the source into a subroutine/module (for my convenience only) so others that want to incoroporate in-line should take out what they don't need. PLEASE NOTE, I commented out the FINIT statement (personal preference based on my typical use).

The enclosed file contains the 1.3 revision and others files that make up a VB callable DLL that contains the FFT routine. See the included MS/Excel example for the VB example.

I left a lot of (working) comments in, excuse the mess!

Future revs may want to fine-tune the parallel computations - they are not as good as they could be. Also, some additional structure changes (trig table, integer math instead of fp, etc) can also help in a big way.
Posted on 2003-05-18 09:57:08 by TheoMcCloskey
Thanks for you thoughts.. Im still trying to get a moment to look it over tho. Work is getting busy again, and i just got back from my brother's wedding...

Need to find more personal time... :ack:

:NaN:
Posted on 2003-05-20 21:46:41 by NaN

Thanks for you thoughts.. Im still trying to get a moment to look it over tho. Work is getting busy again, and i just got back from my brother's wedding...

Need to find more personal time... :ack:

:NaN:


NaN before you go you know the code you posted? Is that how the sigma sign is decoded? (it looks like a deformed "E")
I have a BASIC Digital Signal Processing book, so far it has been good in explaining the theory and I don't have a problem with the math but this sigma is really stumping me. As I dont know how to interpret it.
Posted on 2003-05-20 21:49:44 by x86asm

Thanks for you thoughts.. Im still trying to get a moment to look it over tho. Work is getting busy again, and i just got back from my brother's wedding...

Need to find more personal time... :ack:

:NaN:


Also NaN, I believe you have said that the FFT decomposes an analog signal into a sum of sines and cosines. So in this case I can break down a WAV file into frequency domains and selectively amplifying the different frequency bands therefore emulating an EQ?
Posted on 2003-05-20 21:55:42 by x86asm
Yes.. Its a Sum Opperator. Its basically a mathematical symbol of a FOR-LOOP, looping about the ADD opperation and the variables expressed in the equation.
  n=5

E ( 2n-1 ) = (0-1) + (2-1) + (4-1) + (6-1) + (8-1) + (10-1)
n=0

n=5
E ( 2n-1 ) = -1 + 1 + 3 + 5 + 7 + 9 = 24
n=0


E = 0
FOR N = 0 TO 5
E = E + ( 2n - 1)
NEXT N


xor esi, esi
xor ecx, ecx
.while (ecx <=5)
mov eax, ecx
shl eax, 1
dec eax
add esi, eax
inc ecx
.endw


Hope that is clear ;)

You may also (rarely) come across a simular mathematical expression only the Sigmal 'E' is replaced with a large PI symbol. This means the operator is to be MULTIPLY instead of ADD. However, it is rarely found is common mathematics.

Fun fact:
  n=infinity

E ( 1 + 1/n! ) = 1/1 + 1/1 +1/2 + 1/6 + ... + 1/infinit = e (exponetial e^1)
n=0


Enjoy
:alright:
NaN
Posted on 2003-05-20 22:06:43 by NaN
NaN, I believe you mean...

n = infinity
E ( 1/n! ) = 1/1 + 1/1 +1/2 + 1/6 + ... = e
n = 0

otherwise you end up with a non-convergent series summation.
Posted on 2003-05-21 11:47:38 by Poimander