i am new to asm programming,the code below is to use SSE to compute the product of two 4*4 matrix,i use the visual studio.net 2003 to built it,it have runtime error,but it works well when i change
float m2[4][4],m1[4][4],m3[4][4];
to
float m1[4][4],m2[4][4],m3[4][4];
is it a bug in the complier?can try it in other environment.



/////////////////////////////

#include<stdio.h>

#define FILE_IN "input.txt"
#define FILE_OUT "output.txt"

void MultiMatrix(float dest[4][4],float src1[4][4],float src2[4][4])
{
_asm
{
mov ecx,src1;
mov edx,src2;
mov eax,dest;
movss xmm0,[ecx];
shufps xmm0,xmm0,00h;
movss xmm1,[ecx+4];
shufps xmm1,xmm1,00h;
movss xmm2,[ecx+8];
shufps xmm2,xmm2,00h;
movss xmm3,[ecx+12];
shufps xmm3,xmm3,00h;
movaps xmm4,[edx];
movaps xmm5,[edx+16];
movaps xmm6,[edx+32];
movaps xmm7,[edx+48];
mulps xmm0,xmm4;
mulps xmm1,xmm5;
mulps xmm2,xmm6;
mulps xmm3,xmm7;
addps xmm0,xmm1;
addps xmm0,xmm2;
addps xmm0,xmm3;
movups [eax],xmm0;

movss xmm0,[ecx+16];
shufps xmm0,xmm0,00h;
movss xmm1,[ecx+20];
shufps xmm1,xmm1,00h;
movss xmm2,[ecx+24];
shufps xmm2,xmm2,00h;
movss xmm3,[ecx+28];
shufps xmm3,xmm3,00h;
mulps xmm0,xmm4;
mulps xmm1,xmm5;
mulps xmm2,xmm6;
mulps xmm3,xmm7;
addps xmm0,xmm1;
addps xmm0,xmm2;
addps xmm0,xmm3;
movups [eax+16],xmm0;

movss xmm0,[ecx+32];
shufps xmm0,xmm0,00h;
movss xmm1,[ecx+36];
shufps xmm1,xmm1,00h;
movss xmm2,[ecx+40];
shufps xmm2,xmm2,00h;
movss xmm3,[ecx+44];
shufps xmm3,xmm3,00h;
mulps xmm0,xmm4;
mulps xmm1,xmm5;
mulps xmm2,xmm6;
mulps xmm3,xmm7;
addps xmm0,xmm1;
addps xmm0,xmm2;
addps xmm0,xmm3;
movups [eax+32],xmm0;

movss xmm0,[ecx+48];
shufps xmm0,xmm0,00h;
movss xmm1,[ecx+52];
shufps xmm1,xmm1,00h;
movss xmm2,[ecx+56];
shufps xmm2,xmm2,00h;
movss xmm3,[ecx+60];
shufps xmm3,xmm3,00h;
mulps xmm0,xmm4;
mulps xmm1,xmm5;
mulps xmm2,xmm6;
mulps xmm3,xmm7;
addps xmm0,xmm1;
addps xmm0,xmm2;
addps xmm0,xmm3;
movups [eax+48],xmm0;
}
}

int main()
{
int i,j;
FILE *fin,*fout;
float m2[4][4],m1[4][4],m3[4][4];

fin = fopen(FILE_IN,"r");
fout = fopen(FILE_OUT,"w");

for(i=0;i<4;++i)
for(j=0;j<4;++j)
fscanf(fin,"%f",&m1[i][j]);

for(i=0;i<4;++i)
for(j=0;j<4;++j)
fscanf(fin,"%f",&m2[i][j]);

MultiMatrix(m3,m1,m2);

for(i=0;i<4;++i)
{
for(j=0;j<4;++j)
fprintf(fout,"%f ",m3[i][j]);
fprintf(fout,"\n");
}

fclose(fin);
fclose(fout);

return 0;

}

//////////////////////
input.txt

1.2 2.1 3.2 0
1.0 1.0 1.0 1.0
3.0 4.1 5.2 192.1
2.3 4.3 5.8 6.0

1.2 2.1 1.0 0
11.0 1.0 1.0 1.0
3.0 24.1 1.0 192.1
22.3 4.35 1.0 6.01

/////////////////////
Posted on 2003-11-15 07:32:26 by fairstar
The problem is that movaps needs a pointer aligned on a 16-byte boundary and the local arguments are aligned on 4-byte boundary. Use movups or align the locals manually.

__declspec(align(16)) will force 16-byte alignment for the local variables.
Posted on 2003-11-16 10:57:02 by Dr. Manhattan