hello!

i've written a procedure and i'd like to ask how you would improve it...?

it takes a pointer to an array of at least 3 source Vertices and a pointer to a single target vertex.

for example

asm_normal(&SourceV, &TargetV);

here's the procedure:

``````
;-----------------------------------------------------------------------
; asm_normal - unit normal of the triangle V0, V1, V2
;
; In:    psrcVArr = pointer to at least 3 vertices, each stored as three
;                   consecutive REAL4 values x, y, z (12 bytes/vertex)
;        ptrgV    = pointer to the vertex (3 x REAL4) receiving the result
; Out:   [ptrgV]  = (a x b) / |a x b|   with a = V0 - V1, b = V1 - V2
; Clobb: flags (ecx is preserved through USES)
; Notes: - Peak x87 usage is all 8 registers (6 edge components plus
;          2 temporaries), so the x87 stack must be empty on entry.
;          It is empty again on return.
;        - A degenerate (zero-area) triangle is not guarded against: the
;          cross product is 0 and the final divisions become 0/0.
;-----------------------------------------------------------------------
asm_normal PROC uses ecx psrcVArr:DWORD, ptrgV:DWORD          ;  gets source VertexArray and dest Vertex

        mov     ecx, [psrcVArr]

        ; a = V0 - V1
        fld     DWORD PTR [ecx]                 ; V0.x
        fsub    DWORD PTR [ecx + 12]            ; xa = V0.x - V1.x
        fld     DWORD PTR [ecx + 4]             ; V0.y
        fsub    DWORD PTR [ecx + 12 + 4]        ; ya = V0.y - V1.y
        fld     DWORD PTR [ecx + 8]             ; V0.z
        fsub    DWORD PTR [ecx + 12 + 8]        ; za = V0.z - V1.z

        ; b = V1 - V2.  The second edge has to involve the third vertex:
        ; using V0 - V1 for both edges makes a x b zero for every input.
        fld     DWORD PTR [ecx + 12]            ; V1.x
        fsub    DWORD PTR [ecx + 24]            ; xb = V1.x - V2.x
        fld     DWORD PTR [ecx + 12 + 4]        ; V1.y
        fsub    DWORD PTR [ecx + 24 + 4]        ; yb = V1.y - V2.y
        fld     DWORD PTR [ecx + 12 + 8]        ; V1.z
        fsub    DWORD PTR [ecx + 24 + 8]        ; zb = V1.z - V2.z
        ; x87 stack: st0=zb st1=yb st2=xb st3=za st4=ya st5=xa

        mov     ecx, [ptrgV]

        ; N.x = ya*zb - za*yb
        fld     st(4)                           ; ya
        fmul    st, st(1)                       ; ya*zb
        fld     st(4)                           ; za
        fmul    st, st(3)                       ; za*yb
        fsubp   st(1), st                       ; ya*zb - za*yb
        fstp    DWORD PTR [ecx]

        ; N.y = za*xb - xa*zb
        fld     st(3)                           ; za
        fmul    st, st(3)                       ; za*xb
        fld     st(6)                           ; xa
        fmul    st, st(2)                       ; xa*zb
        fsubp   st(1), st                       ; za*xb - xa*zb
        fstp    DWORD PTR [ecx + 4]

        ; N.z = xa*yb - ya*xb
        fld     st(5)                           ; xa
        fmul    st, st(2)                       ; xa*yb
        fld     st(5)                           ; ya
        fmul    st, st(4)                       ; ya*xb
        fsubp   st(1), st                       ; xa*yb - ya*xb
        fstp    DWORD PTR [ecx + 8]

        ; Discard the six edge components still on the x87 stack.
        mov     ecx, 6                  ;  maybe FINIT would be better?
@@:
        fstp    st(0)
        dec     ecx
        jnz     @B

        ; Normalise the stored vector in place.
        mov     ecx, [ptrgV]                    ; ecx was reused as loop counter
        fld     DWORD PTR [ecx]                 ; N.x
        fld     DWORD PTR [ecx + 4]             ; N.y
        fld     DWORD PTR [ecx + 8]             ; N.z  -> st0=z st1=y st2=x
        fld     st(2)
        fmul    st, st                          ; x^2
        fld     st(2)
        fmul    st, st                          ; y^2
        fld     st(2)
        fmul    st, st                          ; z^2
        faddp   st(1), st                       ; z^2 + y^2
        faddp   st(1), st                       ; (z^2 + y^2) + x^2
        fsqrt                                   ; l = length; st1=z st2=y st3=x

        fdiv    st(3), st                       ; x / l
        fdiv    st(2), st                       ; y / l
        fdivp   st(1), st                       ; z / l, and drop l
        fstp    DWORD PTR [ecx + 8]
        fstp    DWORD PTR [ecx + 4]
        fstp    DWORD PTR [ecx]

        ret
asm_normal ENDP
``````

there are some points where i'm not sure what's the best to do, one is the loop that pops the FPU-registers away.
the algorithm itself isn't optimized as you will probably see...well.

Posted on 2002-03-17 13:59:22 by 08/15
; Variant 1: FFREEP (described in the linked references as FFREE st(i)
; followed by a stack pop). With six values on the stack, each
; FFREEP st(3) marks st(3) empty and then pops st(0), so two registers
; are emptied per instruction.
ffreep st(3)
ffreep st(3)
ffreep st(3)
; three values popped from FPU stack,
; and top three FPU values are empty

; Variant 2: FCOMPP compares st(0) with st(1) and pops both; only the
; two pops matter here, the comparison result is ignored.
fcompp
fcompp
fcompp
; this pops all six values, but the above is faster
Posted on 2002-03-17 15:45:46 by bitRAKE
thank you bitrake...

one problem remains: can't find FFREEP in the docs and MASM gives an error when i try to assemble it.;)
it just sets the register to empty, it doesn't affect the TOS-pointer.
i tried using FFREE, but it ends up just like FSTP, i have to do it 6 times.
Posted on 2002-03-17 16:48:52 by 08/15
FFREEP instruction is supported on Pentium processors and above, but MASM doesn't support it and it is not documented by Intel. It works. :)

http://www.geocities.com/SiliconValley/Heights/1295/nasmdoca.htm#section-A.55
http://fatphil.org/x86/pentopt/29.html

HEX: DF C0+reg
Posted on 2002-03-17 17:03:09 by bitRAKE
You can improve the final bit by first dividing 1 by the length, sqrt((z^2+y^2)+x^2), and then multiplying x, y and z by this reciprocal. It's faster because division is very slow compared to multiplication. I always use the following MACRO to normalise the top three values on the FPU stack:
``````Normalise Macro
; Normalise, in place, the 3-component vector held in st(0), st(1), st(2).
; Needs three free x87 registers for the squares; on exit the three
; scaled components are in their original stack positions.
; fpc(1.0) is a macro defined elsewhere - presumably it yields a REAL4
; memory constant 1.0 (TODO confirm against its definition).
fld st              ; c0           (component at st(0))
fmul st, st         ; c0^2
fld st(2)           ; c1           (originally st(1))
fmul st, st         ; c1^2
fld st(4)           ; c2           (originally st(2); st(3) would reload c1)
fmul st, st         ; c2^2
fadd                ; c2^2 + c1^2  (no-operand form = faddp st(1), st)
fadd                ; + c0^2 -> st0 = squared length, st1..st3 = c0..c2
fsqrt               ; length
fdivr fpc(1.0)      ; st0 = 1.0 / length
fmul st(3), st      ; c2 * (1/length)
fmul st(2), st      ; c1 * (1/length)
fmul                ; c0 * (1/length), pops the reciprocal
EndM``````
eg:
``````     mov    ecx, [ptrgV]
fld    DWORD PTR [ecx]          ; x
fld    DWORD PTR [ecx + 4]      ; y
fld    DWORD PTR [ecx + 8]      ; z  -> st0=z st1=y st2=x
Normalise                       ; macro invocation: scales st0..st2 to unit length
fstp   DWORD PTR [ecx + 8]      ; z
fstp   DWORD PTR [ecx + 4]      ; y
fstp   DWORD PTR [ecx]          ; x
``````
Posted on 2002-03-17 17:09:46 by Eóin
hey :grin:, thank you.

seems like you really know what you're doing

fpc(1.0) is another macro, right?
Posted on 2002-03-17 17:14:42 by 08/15
works perfectly, eoin! thx

i do a FLD1 before FDIVRP to go around the fpc(1.0)

bitrake,

FFREE st(3) becomes DD C3, and as stated on the website you gave, FFREEP st(3) should be DF C3.

so, i assembled, opened the *.obj and fixed the three DD C3 values (simply used FFREE to make space for FFREEP in the binary) to DF C3.

when i run ollydbg and the first DF C3 is executed, it says "unknown instruction".
now i'm really confused...

do you know why that is?
Posted on 2002-03-17 17:54:16 by 08/15
wow, 3 in a row;)

here's the faster FDIVR/FMUL version:
(without macro, though)

FFREEP doesn't work yet.

``````
;-----------------------------------------------------------------------
; asm_normal - unit normal of the triangle V0, V1, V2
;              (variant that multiplies by 1/length instead of dividing)
;
; In:    psrcVArr = pointer to at least 3 vertices, each stored as three
;                   consecutive REAL4 values x, y, z (12 bytes/vertex)
;        ptrgV    = pointer to the vertex (3 x REAL4) receiving the result
; Out:   [ptrgV]  = (a x b) * (1 / |a x b|)   with a = V0 - V1, b = V1 - V2
; Clobb: flags (ecx is preserved through USES)
; Notes: - Peak x87 usage is all 8 registers (6 edge components plus
;          2 temporaries), so the x87 stack must be empty on entry.
;          It is empty again on return.
;        - A degenerate (zero-area) triangle is not guarded against: the
;          length is 0, so the reciprocal is 1/0.
;-----------------------------------------------------------------------
asm_normal PROC uses ecx psrcVArr:DWORD, ptrgV:DWORD          ;  gets source VertexArray and dest Vertex

        mov     ecx, [psrcVArr]

        ; a = V0 - V1
        fld     DWORD PTR [ecx]                 ; V0.x
        fsub    DWORD PTR [ecx + 12]            ; xa = V0.x - V1.x
        fld     DWORD PTR [ecx + 4]             ; V0.y
        fsub    DWORD PTR [ecx + 12 + 4]        ; ya = V0.y - V1.y
        fld     DWORD PTR [ecx + 8]             ; V0.z
        fsub    DWORD PTR [ecx + 12 + 8]        ; za = V0.z - V1.z

        ; b = V1 - V2.  The second edge has to involve the third vertex:
        ; using V0 - V1 for both edges makes a x b zero for every input.
        fld     DWORD PTR [ecx + 12]            ; V1.x
        fsub    DWORD PTR [ecx + 24]            ; xb = V1.x - V2.x
        fld     DWORD PTR [ecx + 12 + 4]        ; V1.y
        fsub    DWORD PTR [ecx + 24 + 4]        ; yb = V1.y - V2.y
        fld     DWORD PTR [ecx + 12 + 8]        ; V1.z
        fsub    DWORD PTR [ecx + 24 + 8]        ; zb = V1.z - V2.z
        ; x87 stack: st0=zb st1=yb st2=xb st3=za st4=ya st5=xa

        mov     ecx, [ptrgV]

        ; N.x = ya*zb - za*yb
        fld     st(4)                           ; ya
        fmul    st, st(1)                       ; ya*zb
        fld     st(4)                           ; za
        fmul    st, st(3)                       ; za*yb
        fsubp   st(1), st                       ; ya*zb - za*yb
        fstp    DWORD PTR [ecx]

        ; N.y = za*xb - xa*zb
        fld     st(3)                           ; za
        fmul    st, st(3)                       ; za*xb
        fld     st(6)                           ; xa
        fmul    st, st(2)                       ; xa*zb
        fsubp   st(1), st                       ; za*xb - xa*zb
        fstp    DWORD PTR [ecx + 4]

        ; N.z = xa*yb - ya*xb
        fld     st(5)                           ; xa
        fmul    st, st(2)                       ; xa*yb
        fld     st(5)                           ; ya
        fmul    st, st(4)                       ; ya*xb
        fsubp   st(1), st                       ; xa*yb - ya*xb
        fstp    DWORD PTR [ecx + 8]

        ; Discard the six edge components still on the x87 stack.
        mov     ecx, 6     ; replace with 3x FFREEP ST(3) (in HEX!!! MASM won't do it - PII and above)
@@:
        fstp    st(0)
        dec     ecx
        jnz     @B

;     ffree st(3)           <-- hex: DD C3; when changed to DF C3 (FFREEP st(3))
;     ffree st(3)                           it doesn't recognize the instruction
;     ffree st(3)

        ; Normalise the stored vector in place.
        mov     ecx, [ptrgV]                    ; ecx was reused as loop counter
        fld     DWORD PTR [ecx]                 ; N.x
        fld     DWORD PTR [ecx + 4]             ; N.y
        fld     DWORD PTR [ecx + 8]             ; N.z  -> st0=z st1=y st2=x
        fld     st(2)
        fmul    st, st                          ; x^2
        fld     st(2)
        fmul    st, st                          ; y^2
        fld     st(2)
        fmul    st, st                          ; z^2
        faddp   st(1), st                       ; z^2 + y^2
        faddp   st(1), st                       ; (z^2 + y^2) + x^2
        fsqrt                                   ; l = length

        fld1
        fdivrp  st(1), st                       ; st0 = 1 / l; st1=z st2=y st3=x
        fmul    st(3), st                       ; x * (1/l)
        fmul    st(2), st                       ; y * (1/l)
        fmulp   st(1), st                       ; z * (1/l), and drop 1/l

        fstp    DWORD PTR [ecx + 8]
        fstp    DWORD PTR [ecx + 4]
        fstp    DWORD PTR [ecx]

        ret
asm_normal ENDP
``````
Posted on 2002-03-18 03:33:38 by 08/15
First bitRAKE, that ffreep command is interesting, I've never heard of it before. I always went by Agner Fog, who said fstp st is the fastest way to free the TOS, while fcompp is the fastest way to free two values. But this ffreep seems more useful altogether.

Second 08/15, you can avoid freeing the stack by using the values in it throughout the middle calculation. From what I gather the stack at the start of the calculation starts at
0: z1-z2, call it zb
1: y1-y2, call it yb
2: x1-x2, call it xb
3: z0-z1, call it za
4: y0-y1, call it ya
5: x0-x1, call it xa

And you want to calculate:
ya*zb-za*yb
za*xb-zb*xa
xa*yb-xb*ya

Since all the values in the first equation are needed by the other two your code is the fastest way to work it:
``````mov    ecx, [ptrgV]
; entry stack: st0=zb st1=yb st2=xb st3=za st4=ya st5=xa
fld    st(4)                            ; st(0) = ya
fmul   st, st(1)                        ; st(0) = ya*zb
fld    st(4)                            ; st(0) = za
fmul   st, st(3)                        ; st(0) = za*yb
fsubp  st(1), st                        ; st(0) = ya*zb - za*yb
fstp   DWORD PTR [ecx]                  ; store first component; the six inputs remain on the stack
``````
Now however the third equation doesn't use za/zb at all, so we might as well overwrite them in the calculation of the second equation.
``````fmul st,st(5)  ; st0 = zb*xa   (entry: st0=zb st1=yb st2=xb st3=za st4=ya st5=xa)
fxch st(3)  ; st0 = za, st3 = zb*xa
fmul st,st(2)  ;  st0 = za*xb
fsubrp st(3),st  ; st(3) = st0 - st(3), then pop -> st(2) = za*xb-zb*xa``````
Now at this stage the result we want is stuck at st(2) so we cant store it, but never mind. Continue on calculating the third equation and the second value will be freed eventually.
``````fmulp st(4),st  ; entry: st0=yb st4=xa; after the pop st(3) = xa*yb
fmulp st(2),st  ; entry: st0=xb st2=ya; after the pop st(1) = xb*ya``````

Now the result to the second equation is at st(0) so store it
``fstp   DWORD PTR [ecx + 4]``

And finish off the third equation
``````fsub ; no-operand form subtracts st(0) from st(1) and pops: st(0) = xa*yb-ya*xb
fstp   DWORD PTR [ecx + 8]  ; store third component; this pop leaves the FPU stack empty
``````
And you can forget about freeing the values now, as there's nothing left on the stack. Note I haven't tested this, so sorry if it doesn't work.
Posted on 2002-03-18 07:05:09 by Eóin
thank you very much eoin, i haven't tried yet because i was very busy.

btw, did you actually try to use this FFREEP instructions?
did it work?

just curious...

:confused:
Posted on 2002-03-19 12:50:04 by 08/15
No I haven't had a chance yet, I won't get to my computer till thursday. :(
Posted on 2002-03-20 03:17:25 by Eóin

> btw, did you actually try to use this FFREEP instructions?
> did it work?

What processor are you using?
Did you try using it outside the debugger?
Encode the instruction inline:

db 0DFh, 0C3h ; FFREEP st(3)
db 0DFh, 0C3h ; FFREEP st(3)
db 0DFh, 0C3h ; FFREEP st(3)

MASM will allow this in a code section.
Posted on 2002-03-20 10:12:24 by bitRAKE
333 celeron...

i'll try to do it inline like you suggested

thanx for helping again, bitrake
:)
Posted on 2002-03-20 10:51:29 by 08/15
:grin:
how could i doubt that

you're certainly right, this command exists and it works very well.
i was really confused about the message in OllyDebug, it felt like walking on thin ice without being able to test in the debugger first, but it worx.

now i'll go and look for an olly-update

you made my day with the inline stuff.
i guess the hex-edit works too. too dumb i never tried to just run it...well, have a nice day!

just dl'ed OllyDebug v1.06 (had 1.05 step 1 before), no difference.
undocumented and obviously unrecognized.;)

eoins method of optimizing the algo itself is of course the better solution, just to make it clear.
though, FFREEP may be useful in other cases, and i really appreciate to know it now. cheers
Posted on 2002-03-20 11:22:19 by 08/15
finally, i sat down and did some optimizing

there's actually no need to free the stack as i did before, and i removed the save to memory/load from memory stuff which was really dumb.

this is the result:

``````

;-----------------------------------------------------------------------
; asm_normal - unit normal of the triangle V0, V1, V2
;              (everything stays on the x87 stack until the final stores)
;
; In:    psrcVArr = pointer to at least 3 vertices, each stored as three
;                   consecutive REAL4 values x, y, z (12 bytes/vertex)
;        ptrgV    = pointer to the vertex (3 x REAL4) receiving the result
; Out:   [ptrgV]  = (a x b) * (1 / |a x b|)   with a = V0 - V1, b = V1 - V2
; Clobb: none besides x87 state (ecx is preserved through USES)
; Notes: - Peak x87 usage is all 8 registers, and the code parks results
;          in the free slots below the inputs, so the x87 stack must be
;          empty on entry.  It is empty again on return.
;        - A degenerate (zero-area) triangle is not guarded against: the
;          length is 0, so the reciprocal is 1/0.
;-----------------------------------------------------------------------
asm_normal PROC uses ecx psrcVArr:DWORD, ptrgV:DWORD          ;  gets source VertexArray and dest Vertex

        mov     ecx, [psrcVArr]

        ; a = V0 - V1
        fld     DWORD PTR [ecx]                 ; V0.x
        fsub    DWORD PTR [ecx + 12]            ; xa = V0.x - V1.x
        fld     DWORD PTR [ecx + 4]             ; V0.y
        fsub    DWORD PTR [ecx + 12 + 4]        ; ya = V0.y - V1.y
        fld     DWORD PTR [ecx + 8]             ; V0.z
        fsub    DWORD PTR [ecx + 12 + 8]        ; za = V0.z - V1.z

        ; b = V1 - V2.  The second edge has to involve the third vertex:
        ; using V0 - V1 for both edges makes a x b zero for every input.
        fld     DWORD PTR [ecx + 12]            ; V1.x
        fsub    DWORD PTR [ecx + 24]            ; xb = V1.x - V2.x
        fld     DWORD PTR [ecx + 12 + 4]        ; V1.y
        fsub    DWORD PTR [ecx + 24 + 4]        ; yb = V1.y - V2.y
        fld     DWORD PTR [ecx + 12 + 8]        ; V1.z
        fsub    DWORD PTR [ecx + 24 + 8]        ; zb = V1.z - V2.z
        ; x87 stack: st0=zb st1=yb st2=xb st3=za st4=ya st5=xa

        ; N.x = ya*zb - za*yb  (all six inputs are still needed afterwards)
        fld     st(4)                           ; ya
        fmul    st, st(1)                       ; ya*zb
        fld     st(4)                           ; za
        fmul    st, st(3)                       ; za*yb
        fsubp   st(1), st                       ; st0 = N.x, seven registers in use
        fstp    st(7)                           ; park N.x in the free slot below xa
        ; st0=zb st1=yb st2=xb st3=za st4=ya st5=xa st6=N.x

        ; N.z = xa*yb - ya*xb  (consumes yb and ya)
        fxch                                    ; st0=yb st1=zb
        fmul    st, st(5)                       ; st0 = xa*yb
        fxch                                    ; st0=zb st1=xa*yb
        fxch    st(4)                           ; st0=ya st4=zb
        fmul    st, st(2)                       ; st0 = ya*xb
        fsubp   st(1), st                       ; st0 = xa*yb - ya*xb = N.z
        fstp    st(6)                           ; park N.z below N.x
        ; st0=xb st1=za st2=zb st3=xa st4=N.x st5=N.z

        ; N.y = za*xb - xa*zb  (consumes the remaining four inputs)
        fmulp   st(1), st                       ; st0 = za*xb
        fxch                                    ; st0 = zb, st1 = za*xb
        fmulp   st(2), st                       ; st0 = za*xb, st1 = xa*zb
        fsubrp  st(1), st                       ; st0 = za*xb - xa*zb = N.y
        ; st0=N.y st1=N.x st2=N.z

        ; squared length
        fld     st(2)
        fmul    st, st                          ; z^2
        fld     st(2)
        fmul    st, st                          ; x^2
        fld     st(2)
        fmul    st, st                          ; y^2
        faddp   st(1), st                       ; x^2 + y^2
        faddp   st(1), st                       ; (x^2 + y^2) + z^2
        fsqrt                                   ; st0 = l

        ; scale by the reciprocal: one division instead of three
        fld1
        fdivrp  st(1), st                       ; st0 = 1/l; st1=N.y st2=N.x st3=N.z
        fmul    st(3), st                       ; z * (1/l)
        fmul    st(2), st                       ; x * (1/l)
        fmulp   st(1), st                       ; y * (1/l), and drop 1/l
        fxch                                    ; st0=x st1=y st2=z: store order

        mov     ecx, [ptrgV]
        fstp    DWORD PTR [ecx]
        fstp    DWORD PTR [ecx + 4]
        fstp    DWORD PTR [ecx + 8]

        ret
asm_normal ENDP
``````

maybe it's more useful now
Posted on 2002-03-24 17:15:20 by 08/15