hello!

i've written a procedure and i'd like to ask how you would improve it...?

it takes a pointer to an array of at least 3 source Vertices and a pointer to a single target vertex.

for example

asm_normal(&SourceV[0], &TargetV);



here's the procedure:



; asm_normal: writes the unit normal of the triangle V[0], V[1], V[2] to *ptrgV.
;   psrcVArr -> source vertices, 12 bytes each: x, y, z as DWORD floats
;               (only V[0], V[1] and V[2] are read)
;   ptrgV    -> destination vertex, same x, y, z layout
; With a = V[0] - V[1] and b = V[1] - V[2] the result is (a x b) / |a x b|.
; Notation below: a = (xa, ya, za), b = (xb, yb, zb); stack pictures list st(0) first.
; ecx is saved and restored by the USES clause.
; Up to eight x87 registers are live at once, so the FPU register stack has to
; be empty on entry. There is no guard for |a x b| = 0 (degenerate triangle).
asm_normal PROC uses ecx psrcVArr:DWORD, ptrgV:DWORD ; (source vertex array, destination vertex)


mov ecx, [psrcVArr] ; ecx -> V[0]
fld DWORD PTR [ecx] ; st(0) = V[0].x
fsub DWORD PTR [ecx + 12] ; st(0) = xa = V[0].x - V[1].x
fld DWORD PTR [ecx + 4] ; st(0) = V[0].y
fsub DWORD PTR [ecx + 12 + 4] ; st(0) = ya = V[0].y - V[1].y
fld DWORD PTR [ecx + 8] ; st(0) = V[0].z
fsub DWORD PTR [ecx + 12 + 8] ; st(0) = za = V[0].z - V[1].z
add ecx, 12 ; ecx -> V[1]
fld DWORD PTR [ecx] ; st(0) = V[1].x
fsub DWORD PTR [ecx + 12] ; st(0) = xb = V[1].x - V[2].x
fld DWORD PTR [ecx + 4] ; st(0) = V[1].y
fsub DWORD PTR [ecx + 12 + 4] ; st(0) = yb = V[1].y - V[2].y
fld DWORD PTR [ecx + 8] ; st(0) = V[1].z
fsub DWORD PTR [ecx + 12 + 8] ; st(0) = zb = V[1].z - V[2].z
; stack: zb yb xb za ya xa

mov ecx, [ptrgV] ; ecx -> destination vertex
; x component: ya*zb - za*yb
fld st(4) ; st(0) = ya
fmul st, st(1) ; st(0) = ya*zb
fld st(4) ; st(0) = za (all eight registers are in use here)
fmul st, st(3) ; st(0) = za*yb
fsubp st(1), st ; st(0) = ya*zb - za*yb
fstp DWORD PTR [ecx] ; target.x (unnormalised); stack: zb yb xb za ya xa

; y component: za*xb - xa*zb
fld st(3) ; st(0) = za
fmul st, st(3) ; st(0) = za*xb
fld st(6) ; st(0) = xa
fmul st, st(2) ; st(0) = xa*zb
fsubp st(1), st ; st(0) = za*xb - xa*zb
fstp DWORD PTR [ecx + 4] ; target.y (unnormalised); stack: zb yb xb za ya xa

; z component: xa*yb - ya*xb
fld st(5) ; st(0) = xa
fmul st, st(2) ; st(0) = xa*yb
fld st(5) ; st(0) = ya
fmul st, st(4) ; st(0) = ya*xb
fsubp st(1), st ; st(0) = xa*yb - ya*xb
fstp DWORD PTR [ecx + 8] ; target.z (unnormalised); stack: zb yb xb za ya xa

mov ecx, 6 ; six differences are still on the FPU stack (author's open question: would FINIT be better?)
@@:
fstp st(0) ; discard the top of stack
dec ecx
jnz @B ; repeat until all six are popped

mov ecx, [ptrgV] ; ecx was the loop counter, reload the destination pointer
fld DWORD PTR [ecx] ; stack: x
fld DWORD PTR [ecx + 4] ; stack: y x
fld DWORD PTR [ecx + 8] ; stack: z y x
fld st(2) ; st(0) = x
fmul st, st ; st(0) = x^2
fld st(2) ; st(0) = y
fmul st, st ; st(0) = y^2
fld st(2) ; st(0) = z
fmul st, st ; st(0) = z^2
faddp st(1), st ; st(0) = z^2 + y^2
faddp st(1), st ; st(0) = (z^2 + y^2) + x^2
fsqrt ; st(0) = l, the length; stack: l z y x

fdiv st(3), st ; st(3) = x / l
fdiv st(2), st ; st(2) = y / l
fdivp st(1), st ; st(1) = z / l, then pop l; stack: z/l y/l x/l
fstp DWORD PTR [ecx + 8] ; target.z
fstp DWORD PTR [ecx + 4] ; target.y
fstp DWORD PTR [ecx] ; target.x; FPU stack is empty again

ret
asm_normal ENDP



there are some points where i'm not sure what's the best to do, one is the loop that pops the FPU-registers away.
the algorithm itself isn't optimized as you will probably see...well.

thanks for any helpful reaction
Posted on 2002-03-17 13:59:22 by 08/15
; Variant 1: each FFREEP st(3) tags st(3) as empty and then pops st(0).
; Starting with six values on the stack, three of them leave all six registers empty.
ffreep st(3)
ffreep st(3)
ffreep st(3)
; three values popped from the FPU stack,
; and the three values now on top are already tagged empty

; Variant 2: each FCOMPP compares st(0) with st(1) and pops both.
fcompp
fcompp
fcompp
; this also removes all six values; per the poster the FFREEP sequence is faster
Posted on 2002-03-17 15:45:46 by bitRAKE
thank you bitrake...

one problem remains: can't find FFREEP in the docs and MASM gives an error when i try to assemble it.;)
FFREE (without the P) just sets the register to empty; it doesn't affect the TOS-pointer.
i tried using FFREE, but it ends up just like FSTP, i have to do it 6 times.
Posted on 2002-03-17 16:48:52 by 08/15
FFREEP instruction is supported on Pentium processors and above, but MASM doesn't support it and it is not documented by Intel. It works. :)

http://www.geocities.com/SiliconValley/Heights/1295/nasmdoca.htm#section-A.55
http://fatphil.org/x86/pentopt/29.html

HEX: DF C0+reg
Posted on 2002-03-17 17:03:09 by bitRAKE
You can improve the final bit by first dividing 1 by the length, sqrt((z^2+y^2)+x^2), and then multiplying x, y & z by the result. It's faster because division is very slow compared to multiplication. I always use the following MACRO to normalise the top three values on the FPU stack:
; Normalise: scales the top three FPU stack values by 1 / sqrt(sum of their squares).
; On entry st(0), st(1), st(2) hold the three components (called c, b, a below);
; on exit the same three slots hold the scaled components.
; Two additional FPU registers are used temporarily.
; The operand-less FADD / FMUL forms add/multiply st(0) into st(1) and pop.
Normalise Macro

fld st ; stack: c c b a
fmul st, st ; st(0) = c^2
fld st(2) ; st(0) = b; stack: b c^2 c b a
fmul st, st ; st(0) = b^2
fadd ; st(0) = c^2 + b^2; stack: sum c b a
fld st(3) ; st(0) = a
fmul st, st ; st(0) = a^2
fadd ; st(0) = c^2 + b^2 + a^2
fsqrt ; st(0) = length
fdivr fpc(1.0) ; st(0) = 1.0 / length; fpc is not defined here - presumably a float-constant macro, verify
fmul st(3), st ; st(3) = a / length
fmul st(2), st ; st(2) = b / length
fmul ; st(1) = c / length, then pop; stack: c' b' a'
EndM
eg:
     mov    ecx, [ptrgV] ; ecx -> vertex to normalise in place

fld DWORD PTR [ecx] ; stack: x
fld DWORD PTR [ecx + 4] ; stack: y x
fld DWORD PTR [ecx + 8] ; stack: z y x
Normalise ; macro invocation: scale the top three stack values to unit length
fstp DWORD PTR [ecx + 8] ; store z
fstp DWORD PTR [ecx + 4] ; store y
fstp DWORD PTR [ecx] ; store x
Posted on 2002-03-17 17:09:46 by Eóin
hey :grin:, thank you.

seems like you really know what you're doing



fpc(1.0) is another macro, right?
Posted on 2002-03-17 17:14:42 by 08/15
works perfectly, eoin! thx

i do a FLD1 before FDIVRP to go around the fpc(1.0)



bitrake,

FFREE st(3) becomes DD C3, and as stated on the website you gave, FFREEP st(3) should be DF C3.

so, i assembled, opened the *.obj and fixed the three DD C3 values (simply used FFREE to make space for FFREEP in the binary) to DF C3.

when i run ollydbg and the first DF C3 is executed, it says "unknown instruction".
now i'm really confused...

do you know why that is?
Posted on 2002-03-17 17:54:16 by 08/15
wow, 3 in a row;)

here's the faster FDIVR/FMUL version:
(without macro, though)

FFREEP doesn't work yet.



; asm_normal (second revision): writes the unit normal of the triangle
; V[0], V[1], V[2] to *ptrgV, normalising with one reciprocal and three multiplies.
;   psrcVArr -> source vertices, 12 bytes each: x, y, z as DWORD floats
;               (only V[0], V[1] and V[2] are read)
;   ptrgV    -> destination vertex, same x, y, z layout
; With a = V[0] - V[1] and b = V[1] - V[2] the result is (a x b) * (1 / |a x b|).
; Notation below: a = (xa, ya, za), b = (xb, yb, zb); stack pictures list st(0) first.
; ecx is saved and restored by the USES clause.
; Up to eight x87 registers are live at once, so the FPU register stack has to
; be empty on entry. There is no guard for |a x b| = 0 (degenerate triangle).
asm_normal PROC uses ecx psrcVArr:DWORD, ptrgV:DWORD ; (source vertex array, destination vertex)

mov ecx, [psrcVArr] ; ecx -> V[0]
fld DWORD PTR [ecx] ; st(0) = V[0].x
fsub DWORD PTR [ecx + 12] ; st(0) = xa = V[0].x - V[1].x
fld DWORD PTR [ecx + 4] ; st(0) = V[0].y
fsub DWORD PTR [ecx + 12 + 4] ; st(0) = ya = V[0].y - V[1].y
fld DWORD PTR [ecx + 8] ; st(0) = V[0].z
fsub DWORD PTR [ecx + 12 + 8] ; st(0) = za = V[0].z - V[1].z
add ecx, 12 ; ecx -> V[1]
fld DWORD PTR [ecx] ; st(0) = V[1].x
fsub DWORD PTR [ecx + 12] ; st(0) = xb = V[1].x - V[2].x
fld DWORD PTR [ecx + 4] ; st(0) = V[1].y
fsub DWORD PTR [ecx + 12 + 4] ; st(0) = yb = V[1].y - V[2].y
fld DWORD PTR [ecx + 8] ; st(0) = V[1].z
fsub DWORD PTR [ecx + 12 + 8] ; st(0) = zb = V[1].z - V[2].z
; stack: zb yb xb za ya xa

mov ecx, [ptrgV] ; ecx -> destination vertex
; x component: ya*zb - za*yb
fld st(4) ; st(0) = ya
fmul st, st(1) ; st(0) = ya*zb
fld st(4) ; st(0) = za (all eight registers are in use here)
fmul st, st(3) ; st(0) = za*yb
fsubp st(1), st ; st(0) = ya*zb - za*yb
fstp DWORD PTR [ecx] ; target.x (unnormalised); stack: zb yb xb za ya xa
; y component: za*xb - xa*zb
fld st(3) ; st(0) = za
fmul st, st(3) ; st(0) = za*xb
fld st(6) ; st(0) = xa
fmul st, st(2) ; st(0) = xa*zb
fsubp st(1), st ; st(0) = za*xb - xa*zb
fstp DWORD PTR [ecx + 4] ; target.y (unnormalised); stack: zb yb xb za ya xa
; z component: xa*yb - ya*xb
fld st(5) ; st(0) = xa
fmul st, st(2) ; st(0) = xa*yb
fld st(5) ; st(0) = ya
fmul st, st(4) ; st(0) = ya*xb
fsubp st(1), st ; st(0) = xa*yb - ya*xb
fstp DWORD PTR [ecx + 8] ; target.z (unnormalised); stack: zb yb xb za ya xa

mov ecx, 6 ; pop the six leftover differences; could be 3x FFREEP st(3), emitted as raw bytes because MASM does not assemble the mnemonic
@@:
fstp st(0) ; discard the top of stack
dec ecx
jnz @B ; repeat until all six are popped

; Placeholders for the FFREEP experiment: FFREE st(3) assembles to DD C3; after
; patching the bytes to DF C3 (FFREEP st(3)) the debugger reported an unknown instruction.
; ffree st(3)
; ffree st(3)
; ffree st(3)

mov ecx, [ptrgV] ; ecx was the loop counter, reload the destination pointer
fld DWORD PTR [ecx] ; stack: x
fld DWORD PTR [ecx + 4] ; stack: y x
fld DWORD PTR [ecx + 8] ; stack: z y x
fld st(2) ; st(0) = x
fmul st, st ; st(0) = x^2
fld st(2) ; st(0) = y
fmul st, st ; st(0) = y^2
fld st(2) ; st(0) = z
fmul st, st ; st(0) = z^2
faddp st(1), st ; st(0) = z^2 + y^2
faddp st(1), st ; st(0) = (z^2 + y^2) + x^2
fsqrt ; st(0) = l, the length; stack: l z y x

fld1 ; stack: 1 l z y x
fdivrp st(1), st ; st(1) = 1 / l, then pop; stack: 1/l z y x
fmul st(3), st ; st(3) = x * (1/l)
fmul st(2), st ; st(2) = y * (1/l)
fmulp st(1), st ; st(1) = z * (1/l), then pop; stack: z/l y/l x/l

fstp DWORD PTR [ecx + 8] ; target.z
fstp DWORD PTR [ecx + 4] ; target.y
fstp DWORD PTR [ecx] ; target.x; FPU stack is empty again

ret
asm_normal ENDP
Posted on 2002-03-18 03:33:38 by 08/15
First, bitRAKE, that ffreep command is interesting; I've never heard of it before. I always went by Agner Fog, who said fstp st is the fastest way to free the TOS, while fcompp is the fastest way to free two values. But this ffreep seems more useful altogether.

Second 08/15, you can avoid freeing the stack by using the values in it throughout the middle calculation. From what I gather the stack at the start of the calculation starts at
0: z1-z2, call it zb
1: y1-y2, call it yb
2: x1-x2, call it xb
3: z0-z1, call it za
4: y0-y1, call it ya
5: x0-x1, call it xa


And you want to calculate:
ya*zb-za*yb
za*xb-zb*xa
xa*yb-xb*ya

Since all the values in the first equation are needed by the other two your code is the fastest way to work it:
mov    ecx, [ptrgV] ; ecx -> destination vertex

; stack on entry (st(0) first): zb yb xb za ya xa
fld st(4) ; st(0) = ya
fmul st, st(1) ; st(0) = ya*zb
fld st(4) ; st(0) = za
fmul st, st(3) ; st(0) = za*yb
fsubp st(1), st ; st(0) = ya*zb - za*yb
fstp DWORD PTR [ecx] ; store the first component; stack again: zb yb xb za ya xa
Now, however, the third equation doesn't use za or zb at all, so we might as well overwrite them in the calculation of the second equation.
fmul st,st(5)  ; st0 = zb*xa (overwrites zb); stack: zb*xa yb xb za ya xa

fxch st(3) ; stack: za yb xb zb*xa ya xa
fmul st,st(2) ; st0 = za*xb (overwrites za)
fsubrp st(3),st ; st(3) = za*xb-zb*xa, then pop, so it ends up at st(2); stack: yb xb (za*xb-zb*xa) ya xa
Now at this stage the result we want is stuck at st(2), so we can't store it yet, but never mind. Continue on calculating the third equation and the second value will be freed eventually.
fmulp st(4),st  ; st(4) = xa*yb, then pop, so it ends up at st(3); stack: xb (za*xb-zb*xa) ya xa*yb

fmulp st(2),st ; st(2) = xb*ya, then pop, so it ends up at st(1); stack: (za*xb-zb*xa) xb*ya xa*yb

Now the result to the second equation is at st(0) so store it
fstp   DWORD PTR [ecx + 4] ; store za*xb-zb*xa as the second component; stack: xb*ya xa*yb

And finish off the third equation
fsub ; operand-less form: st(1) = st(1) - st(0), then pop; st(0) = xa*yb-ya*xb

fstp DWORD PTR [ecx + 8] ; store the third component; none of the six differences remain on the stack
And you can forget about freeing the values now, as there's nothing left on the stack. Note I haven't tested this, so sorry if it doesn't work.
Posted on 2002-03-18 07:05:09 by Eóin
thank you very much eoin, i haven't tried yet because i was very busy.

btw, did you actually try to use this FFREEP instructions?
did it work?

just curious...

:confused:
Posted on 2002-03-19 12:50:04 by 08/15
No I haven't had a chance yet, I won't get to my computer till thursday. :(
Posted on 2002-03-20 03:17:25 by Eóin

btw, did you actually try to use this FFREEP instructions?
did it work?
What processor are you using?
Did you try using it outside the debugger?
Encode the instruction inline:

; MASM has no FFREEP mnemonic, so the opcode bytes (DF C0+i) are emitted directly.
db 0DFh, 0C3h ; FFREEP st(3)
db 0DFh, 0C3h ; FFREEP st(3)
db 0DFh, 0C3h ; FFREEP st(3)

MASM will allow this in a code section.
Posted on 2002-03-20 10:12:24 by bitRAKE
333 celeron...

i'll try to do it inline like you suggested

thanx for helping again, bitrake
:)
Posted on 2002-03-20 10:51:29 by 08/15
:grin:
how could i doubt that

you're certainly right, this command exists and it works very well.
i was really confused about the message in OllyDebug, it felt like walking on thin ice without being able to test in the debugger first, but it worx.

now i'll go and look for an olly-update

you made my day with the inline stuff.
i guess the hex-edit works too. too dumb i never tried to just run it...well, have a nice day!




just dl'ed OllyDebug v1.06 (had 1.05 step 1 before), no difference.
undocumented and obviously unrecognized.;)



eoins method of optimizing the algo itself is of course the better solution, just to make it clear.
though, FFREEP may be useful in other cases, and i really appreciate to know it now. cheers
Posted on 2002-03-20 11:22:19 by 08/15
finally, i sat down and did some optimizing

there's actually no need to free the stack as i did before, and i removed the save to memory/load from memory stuff which was really dumb.

this is the result:




; asm_normal (third revision): writes the unit normal of the triangle
; V[0], V[1], V[2] to *ptrgV, keeping every intermediate value on the FPU stack.
;   psrcVArr -> source vertices, 12 bytes each: x, y, z as DWORD floats
;               (only V[0], V[1] and V[2] are read)
;   ptrgV    -> destination vertex, same x, y, z layout
; With a = V[0] - V[1] and b = V[1] - V[2] the result is n * (1 / |n|), n = a x b.
; Notation below: a = (xa, ya, za), b = (xb, yb, zb), n = (nx, ny, nz);
; stack pictures list st(0) first.
; ecx is saved and restored by the USES clause.
; All eight x87 registers are used, and finished components are parked in the
; free slots below the operands, so the FPU register stack has to be empty on
; entry. There is no guard for |n| = 0 (degenerate triangle).
asm_normal PROC uses ecx psrcVArr:DWORD, ptrgV:DWORD ; (source vertex array, destination vertex)

mov ecx, [psrcVArr] ; ecx -> V[0]
fld DWORD PTR [ecx] ; st(0) = V[0].x
fsub DWORD PTR [ecx + 12] ; st(0) = xa = V[0].x - V[1].x
fld DWORD PTR [ecx + 4] ; st(0) = V[0].y
fsub DWORD PTR [ecx + 12 + 4] ; st(0) = ya = V[0].y - V[1].y
fld DWORD PTR [ecx + 8] ; st(0) = V[0].z
fsub DWORD PTR [ecx + 12 + 8] ; st(0) = za = V[0].z - V[1].z
add ecx, 12 ; ecx -> V[1]
fld DWORD PTR [ecx] ; st(0) = V[1].x
fsub DWORD PTR [ecx + 12] ; st(0) = xb = V[1].x - V[2].x
fld DWORD PTR [ecx + 4] ; st(0) = V[1].y
fsub DWORD PTR [ecx + 12 + 4] ; st(0) = yb = V[1].y - V[2].y
fld DWORD PTR [ecx + 8] ; st(0) = V[1].z
fsub DWORD PTR [ecx + 12 + 8] ; st(0) = zb = V[1].z - V[2].z
; stack: zb yb xb za ya xa

fld st(4) ; st(0) = ya
fmul st, st(1) ; st(0) = ya*zb
fld st(4) ; st(0) = za (all eight registers are in use here)
fmul st, st(3) ; st(0) = za*yb
fsubp st(1), st ; st(0) = nx = ya*zb - za*yb; stack: nx zb yb xb za ya xa
fstp st(7) ; copy nx into the free register below xa, then pop; stack: zb yb xb za ya xa nx

fxch ; stack: yb zb xb za ya xa nx
fmul st, st(5) ; st(0) = yb*xa
fxch ; stack: zb yb*xa xb za ya xa nx
fxch st(4) ; swap zb with ya; stack: ya yb*xa xb za zb xa nx
fmul st, st(2) ; st(0) = ya*xb
fsubp st(1), st ; st(0) = nz = xa*yb - ya*xb; stack: nz xb za zb xa nx
fstp st(6) ; park nz below nx, then pop; stack: xb za zb xa nx nz

fmulp st(1), st ; st(1) = za*xb, then pop; stack: za*xb zb xa nx nz
fxch ; stack: zb za*xb xa nx nz
fmulp st(2), st ; st(2) = xa*zb, then pop; stack: za*xb xa*zb nx nz
fsubrp st(1), st ; st(1) = st(0) - st(1) = ny = za*xb - xa*zb, then pop; stack: ny nx nz

fld st(2) ; st(0) = nz
fmul st, st ; st(0) = nz^2
fld st(2) ; st(0) = nx
fmul st, st ; st(0) = nx^2
fld st(2) ; st(0) = ny
fmul st, st ; st(0) = ny^2
faddp st(1), st ; st(0) = nx^2 + ny^2
faddp st(1), st ; st(0) = (nx^2 + ny^2) + nz^2
fsqrt ; st(0) = l, the length; stack: l ny nx nz

fld1 ; stack: 1 l ny nx nz
fdivrp st(1), st ; st(1) = 1 / l, then pop; stack: 1/l ny nx nz
fmul st(3), st ; st(3) = nz / l
fmul st(2), st ; st(2) = nx / l
fmulp st(1), st ; st(1) = ny / l, then pop; stack: ny/l nx/l nz/l
fxch ; bring x to the top for in-order stores; stack: nx/l ny/l nz/l

mov ecx, [ptrgV] ; ecx -> destination vertex
fstp DWORD PTR [ecx] ; target.x
fstp DWORD PTR [ecx + 4] ; target.y
fstp DWORD PTR [ecx + 8] ; target.z; FPU stack is empty again

ret
asm_normal ENDP


maybe it's more useful now
Posted on 2002-03-24 17:15:20 by 08/15