I know I'm being lazy, but my workload is just crazy right now, I'd appreciate not wasting my time rewriting the wheel.
I want it for my bone animation demo, I'm ready to render dancing joint hierarchies sans skin.

TIA, Homer.

footnote: here's my NON WORKING version (anyone see a problem? I checked the output by multiplying an arbitrary matrix with an Identity matrix, result SHOULD be original arb matrix, and it aint)



Matrix4By4Product proc uses esi edi ebx pOut, pMat1, pMat2
local lLoop1, lLoop2
mov esi, pMat1
mov ebx, pOut
mov ecx, 4

QLoop1:
mov edx, 3
mov edi, pMat2 ;<-- reset Source2 pointer
fld dword ptr [esi] ;<-- load the next Source1 Row
fld dword ptr [esi+4]
fld dword ptr [esi+8]
fld dword ptr [esi+12]

QLoop2:
fld dword ptr [edi]
fmul st, st(4)
fld dword ptr [edi+16]
fmul st, st(4)
faddp st(1), st
fld dword ptr [edi+32]
fmul st, st(3)
faddp st(1), st
fld dword ptr [edi+48]
fmul st, st(2)
faddp st(1), st
fstp dword ptr [ebx]
add edi, 4 ;<-- increment Source2 pointer
add ebx, 4 ;<-- increment Target pointer
dec edx
jne QLoop2

fld dword ptr [edi]
fmulp st(4), st
fld dword ptr [edi+16]
fmulp st(3), st
fld dword ptr [edi+32]
fmulp st(2), st
fld dword ptr [edi+48]
fmulp st(1), st
faddp st(2), st
faddp st(2), st
faddp st(1), st
fstp dword ptr [ebx] ;<-- Store partial result
add esi, 16 ;<-- increment Source1 Row
dec ecx
jne QLoop1
ret
Matrix4By4Product endp
Posted on 2004-12-12 21:55:38 by Homer
;Kessel 3D math routines

;Created by Bas Fagginger Auer, 2001
;

TempMat DD 16 DUP(0.0)

;Matrix vector multiplication
;
;b = a * matrix
;
;User passes:
;
; ESI = Address of the source vector (3x1)
; EAX = Address of the matrix (4x4)
; EDI = Address of the target vector (3x1)
;
;User receives:
;
; [EDI] = the multiplied vector
;
MVMul PROC

;dummy.x = a.d(0, 0) * v.x + a.d(1, 0) * v.y + a.d(2, 0) * v.z
;dummy.y = a.d(0, 1) * v.x + a.d(1, 1) * v.y + a.d(2, 1) * v.z
;dummy.z = a.d(0, 2) * v.x + a.d(1, 2) * v.y + a.d(2, 2) * v.z

fld dword ptr [esi] ;ST0 = a.x
fmul dword ptr [eax] ;ST0 = a.x * m(0,0)

fld dword ptr [esi+4] ;ST0 = a.y
fmul dword ptr [eax+16] ;ST0 = a.y * m(1,0)

fld dword ptr [esi+8] ;ST0 = a.z
fmul dword ptr [eax+32] ;ST0 = a.z * m(2,0)

fadd ;ST0 = a.z * m(2,0) + a.y * m(1,0)
fadd ;ST0 = a.z * m(2,0) + a.y * m(1,0) + a.x * m(0,0)
fadd dword ptr [eax+48] ;ST0 = a.z * m(2,0) + a.y * m(1,0) + a.x * m(0,0) + m(3,0)

fstp dword ptr [edi] ;b.x = a.z * m(2,0) + a.y * m(1,0) + a.x * m(0,0) + m(3,0)


fld dword ptr [esi] ;ST0 = a.x
fmul dword ptr [eax+4] ;ST0 = a.x * m(0,1)

fld dword ptr [esi+4] ;ST0 = a.y
fmul dword ptr [eax+20] ;ST0 = a.y * m(1,1)

fld dword ptr [esi+8] ;ST0 = a.z
fmul dword ptr [eax+36] ;ST0 = a.z * m(2,1)

fadd ;ST0 = a.z * m(2,1) + a.y * m(1,1)
fadd ;ST0 = a.z * m(2,1) + a.y * m(1,1) + a.x * m(0,1)
fadd dword ptr [eax+52] ;ST0 = a.z * m(2,1) + a.y * m(1,1) + a.x * m(0,1) + m(3,1)

fstp dword ptr [edi+4] ;b.y = a.z * m(2,1) + a.y * m(1,1) + a.x * m(0,1) + m(3,1)


fld dword ptr [esi] ;ST0 = a.x
fmul dword ptr [eax+8] ;ST0 = a.x * m(0,2)

fld dword ptr [esi+4] ;ST0 = a.y
fmul dword ptr [eax+24] ;ST0 = a.y * m(1,2)

fld dword ptr [esi+8] ;ST0 = a.z
fmul dword ptr [eax+40] ;ST0 = a.z * m(2,2)

fadd ;ST0 = a.z * m(2,2) + a.y * m(1,2)
fadd ;ST0 = a.z * m(2,2) + a.y * m(1,2) + a.x * m(0,2)
fadd dword ptr [eax+56] ;ST0 = a.z * m(2,2) + a.y * m(1,2) + a.x * m(0,2) + m(3,2)

fstp dword ptr [edi+8] ;b.z = a.z * m(2,2) + a.y * m(1,2) + a.x * m(0,2) + m(3,2)

ret
MVMul ENDP


;Create unit matrix
;
;matrix = [1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]
;
;User passes:
;
; EDI = Address of the target matrix
;
;User receives:
;
; [EDI] = A unit matrix
;
MUnit PROC
mov ecx, 16
fldz
fstp dword ptr [TempI]
mov eax, [TempI]
rep stosd

;mov ECX, 16
;LoopjeStoreZeroes:
; fst dword ptr [EDI]
; add EDI, 4
; dec ECX
;jnz LoopjeStoreZeroes


sub edi, 64
fld1
fst dword ptr [edi]
fst dword ptr [edi+20]
fst dword ptr [edi+40]
fstp dword ptr [edi+60]
ret
MUnit ENDP


;Matrix matrix multiplication
;
;a = a * b
;
;User passes:
;
; [ESI] = Matrix B
; [EDI] = Matrix A
;
;User receives:
;
; [EDI] = Result matrix A
;
MMul PROC
;FOR i = 0 TO 2
; FOR j = 0 TO 2
; tm.d(i, j) = a.d(i, 0) * b.d(0, j) + a.d(i, 1) * b.d(1, j) + a.d(i, 2) * b.d(2, j)
; NEXT j
;NEXT i
lea eax, [TempMat]
xor ecx, ecx
ILoop:
xor cl, cl
JLoop:
xor edx, edx
xor ebx, ebx
mov dl, ch ;DL = i
mov bl, cl ;BL = j
lea edx, [edx*8]
lea ebx, [ebx*4]
lea edx, [edx*2]

fld dword ptr [edi+edx] ;ST0 = a(i,0)
fmul dword ptr [esi+ebx] ;ST0 = a(i,0) * b(0,j)

fld dword ptr [edi+edx+4] ;ST0 = a(i,1)
fmul dword ptr [esi+ebx+16] ;ST0 = a(i,1) * b(1,j)

fld dword ptr [edi+edx+8] ;ST0 = a(i,2)
fmul dword ptr [esi+ebx+32] ;ST0 = a(i,2) * b(2,j)

fld dword ptr [edi+edx+12] ;ST0 = a(i,3)
fmul dword ptr [esi+ebx+48] ;ST0 = a(i,3) * b(3,j)

add ebx, edx

fadd ;ST0 = a(i,3) * b(3,j) + a(i,2) * b(2,j)
fadd ;ST0 = a(i,3) * b(3,j) + a(i,2) * b(2,j) + a(i,1) * b(1,j)
fadd ;ST0 = a(i,3) * b(3,j) + a(i,2) * b(2,j) + a(i,1) * b(1,j) + a(i,0) * b(0,j)

fstp dword ptr [eax+ebx] ;t(i,j) = a(i,3) * b(3,j) + a(i,2) * b(2,j) + a(i,1) * b(1,j) + a(i,0) * b(0,j)
inc cl
cmp cl, 4
jne JLoop
inc ch
cmp ch, 4
jne ILoop
mov ecx, 16
mov esi, eax
rep movsd
ret
MMul ENDP

;-----------> MRot procedure
;
;Created by Bas Fagginger Auer
;
;Created a 4x4 rotation matrix based on the given rX, rY and rZ angles.
;
;
;Caller must pass:
;
; ESI = Offset of rotation vector (vector.x = rotation around the X angle, etc.)
; EDI = Offset of matrix A
;
;Caller receives:
;
; -
;
;Register(s) that are modified:
;
; -
;


MRot PROC
fldz
fstp [TempD]
mov eax, [TempD]

mov cx, 16 ;We want to store 4*4 matrix elements
rep stosd ;Store 64 zeros

sub EDI, 64

fld1
fstp dword ptr [EDI+60]


;Matrix now looks like this:
;
;0 0 0 0
;0 0 0 0
;0 0 0 0
;0 0 0 1


;Now insert the 'rotation elements'

fld DWORD PTR [ESI] ;ST0 = rX
fsincos ;ST0 = Cos(rX)
;ST1 = Sin(rX)

fld DWORD PTR [ESI+4] ;ST0 = rY
fsincos ;ST0 = Cos(rY)
;ST1 = Sin(rY)

fld DWORD PTR [ESI+8] ;ST0 = rZ
fsincos ;ST0 = Cos(rZ)
;ST1 = Sin(rZ)


fstp DWORD PTR [cZ] ;cZ = Cos(rZ)
fstp DWORD PTR [sZ] ;sZ = Sin(rZ)

fstp DWORD PTR [cY] ;cY = Cos(rY)
fstp DWORD PTR [sY] ;sY = Sin(rY)

fstp DWORD PTR [acX] ;acX = Cos(rX)
fstp DWORD PTR [sX] ;sX = Sin(rX)


fld DWORD PTR [cY]
fmul DWORD PTR [cZ] ;ST0 = cY * cZ
fstp DWORD PTR [EDI] ;A(0,0) = cY * cZ




fld DWORD PTR [cY]
fmul DWORD PTR [sZ] ;ST0 = cY * sZ
fstp DWORD PTR [EDI+4] ;A(0,1) = cY * sZ

fld DWORD PTR [sY]
fchs ;ST0 = -sY
fstp DWORD PTR [EDI+8] ;A(0,2) = -sY


fld DWORD PTR [acX]
fmul DWORD PTR [sZ] ;ST0 = acX * sZ

fld DWORD PTR [sX]
fmul DWORD PTR [sY]
fmul DWORD PTR [cZ] ;ST0 = sX * sY * cZ

fsub ST(0), ST(1) ;ST0 = sX * sY * cZ - acX * sZ

fstp DWORD PTR [EDI+16] ;A(1,0) = sX * sY * cZ - acX * sZ


fld DWORD PTR [acX]
fmul DWORD PTR [cZ] ;ST0 = acX * cZ

fld DWORD PTR [sX]
fmul DWORD PTR [sY]
fmul DWORD PTR [sZ] ;ST0 = sX * sY * sZ

fadd ;ST0 = sX * sY * sZ + acX * cZ

fstp DWORD PTR [EDI+20] ;A(1,1) = sX * sY * sZ + acX * cZ

fld DWORD PTR [sX]
fmul DWORD PTR [cY] ;ST0 = sX * cY
fstp DWORD PTR [EDI+24] ;A(1,2) = sX * cY


fld DWORD PTR [sX]
fmul DWORD PTR [sZ] ;ST0 = sX * sZ

fld DWORD PTR [acX]
fmul DWORD PTR [sY]
fmul DWORD PTR [cZ] ;ST0 = acX * sY * cZ

fadd ;ST0 = acX * sY * cZ + sX * sZ

fstp DWORD PTR [EDI+32] ;A(2,0) = acX * sY * cZ + sX * sZ

fld DWORD PTR [sX]
fmul DWORD PTR [cZ] ;ST0 = sX * cZ

fld DWORD PTR [acX]
fmul DWORD PTR [sY]
fmul DWORD PTR [sZ] ;ST0 = acX * sY * sZ

fsub ST(0), ST(1) ;ST0 = acX * sY * sZ - sX * cZ

fstp DWORD PTR [EDI+36] ;A(2,1) = acX * sY * sZ - sX * cZ

fld DWORD PTR [acX]
fmul DWORD PTR [cY] ;ST0 = acX * cY
fstp DWORD PTR [EDI+40] ;A(2,2) = acX * cY

ret
MRot ENDP
http://www.phys.uu.nl/~0307467/
Posted on 2004-12-13 15:13:58 by bitRAKE
much obliged, I'll give it a whirl when I get home from work :)
Posted on 2004-12-13 16:51:32 by Homer
Sweet, thanks a bunch - it passed my simplistic multiply-by-Identity test.
I should be able to get my joint hierarchy to render correctly now.
Posted on 2004-12-13 22:37:17 by Homer
Your code is good, but you forgot to advance EBX when you store the last value in a row.
Matrix4By4Product proc uses esi edi ebx pOut, pMat1, pMat2 

local lLoop1, lLoop2
mov esi, pMat1
mov ebx, pOut
mov ecx, 4

QLoop1:
mov edx, 3
mov edi, pMat2 ;<-- reset Source2 pointer
fld dword ptr [esi] ;<-- load the next Source1 Row
fld dword ptr [esi+4]
fld dword ptr [esi+8]
fld dword ptr [esi+12]

QLoop2:
fld dword ptr [edi]
fmul st, st(4)
fld dword ptr [edi+16]
fmul st, st(4)
faddp st(1), st
fld dword ptr [edi+32]
fmul st, st(3)
faddp st(1), st
fld dword ptr [edi+48]
fmul st, st(2)
faddp st(1), st
fstp dword ptr [ebx]
add edi, 4 ;<-- increment Source2 pointer
add ebx, 4 ;<-- increment Target pointer
dec edx
jne QLoop2

fld dword ptr [edi]
fmulp st(4), st
fld dword ptr [edi+16]
fmulp st(3), st
fld dword ptr [edi+32]
fmulp st(2), st
fld dword ptr [edi+48]
fmulp st(1), st
faddp st(2), st
faddp st(2), st
faddp st(1), st
fstp dword ptr [ebx] ;<-- Store partial result

; MISSING INSTRUCTION
add ebx, 4 ;<-- increment Target pointer

add esi, 16 ;<-- increment Source1 Row
dec ecx
jne QLoop1
ret
Matrix4By4Product endp
Can't wait to see what you got cookin'. :)
Posted on 2004-12-14 01:55:02 by bitRAKE
Hello Homer,

Maybe you missed my msg, but here's something for you

http://www.asmcommunity.net/board/viewtopic.php?t=19782

I did try coding a SSE version but it was only half done. I could post it here if there are any interested parties to complete it.
Posted on 2004-12-14 06:03:16 by roticv
bitrake - d'oh ! My bad :) ty
victor - Yeah I got no msg, I'll take a look at that SSE stuff if you care to share :)
Posted on 2004-12-14 09:34:44 by Homer


;Transpose matrix2
;Store transposed matrix in xmm4...xmm7
movdqa xmm5, [matrix2]
movdqa xmm1, [matrix2+128]
movdqa xmm2, [matrix2+128+128]
movdqa xmm3, [matrix2+128+128+128]

;For later use
movdqa xmm7, xmm5
movdqa xmm0, xmm1

unpcklps xmm5, xmm2
unpcklps xmm1, xmm3
movdqa xmm4, xmm5
unpcklps xmm4, xmm1

;First column transposed

unpckhps xmm5, xmm1

;Second column transposed

unpckhps xmm7, xmm2
unpckhps xmm0, xmm3
movqda xmm6, xmm7
unpcklps xmm6, xmm1

;Third column transposed

unpckhps xmm7, xmm1

;Fourth column transposed

;Multiply now
;Only 16 multiplies needed

movqda xmm0, [matrix1]
movqda xmm1, xmm0
mulps xmm1, xmm4
movqda xmm2, xmm1
add

;movqda xmm2, xmm0
;movqda xmm3, xmm1
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7

;Add the 4 real in xmm0..xmm3
;You get the the first row of the matrix

...
Posted on 2004-12-14 10:39:30 by roticv
The existing 3d engine supports managed instances of two basic 3d primitive objects (textured spheres and cubes). I've had a couple of tries under DX to get bone-animated models implemented with little success.
This time under OGL, I decided to implement it myself from the ground up.
I have written an oop class which implements a Joint node hierarchy, and a container class to hold the root node, skin geometry and other required stuff. I have implemented my own text-based file format for describing just the joint hierarchy (loosely based on X file format), as well as a loader which creates a joint hierarchy based on the text file data.
I've written code which should display all the joints in the hierarchy using a single instance of my Sphere class. Once I get the joints to render, I'll write a new geometric primitive class for Pyramids and use that to render the Bones inbetween the Joints. When I have done that I will introduce another custom file format containing animation data (keyframed matrices to be applied at the joints, similar to the static joint hierarchy).
After I have implemented dancing skeletons, I will figure out how to go about deforming the skin vertices using bone weights, probably using a vertexshader. Later on I will attempt to blend animations, then it will be time to add code to export the loaded model to a custom BINARY file format, and code to load it back in again from the binfile(s).
Wish me luck :)
Posted on 2004-12-14 22:54:31 by Homer