Hello, a noob with a dumb question  :D

I'm writing some optimized matrix code, and for whatever dumb reason, I can't understand why the following doesn't work. I am using SSE intrinsics (rather than hand-written inline assembly) in Visual C++, btw ;)

Matrix4 matResult;

__m128 row0, row1, row2, row3;
__m128 base0, base1, base2, base3;
__m128 result0, result1, result2, result3;
__m128 tmp1;

tmp1.m128_i64[0] = 0;
tmp1.m128_i64[1] = 0;

row0 = _mm_load_ps(m_fMatrix16); // first chance exception here
row1 = _mm_load_ps(m_fMatrix16+4);
row2 = _mm_load_ps(m_fMatrix16+8);
row3 = _mm_load_ps(m_fMatrix16+12);

base0 = _mm_load_ps(mat.m_fMatrix16);
base1 = _mm_load_ps(mat.m_fMatrix16+4);
base2 = _mm_load_ps(mat.m_fMatrix16+8);
base3 = _mm_load_ps(mat.m_fMatrix16+12);

result0 = _mm_add_ps(row0, base0);
result1 = _mm_add_ps(row1, base1);
result2 = _mm_add_ps(row2, base2);
result3 = _mm_add_ps(row3, base3);

_mm_store_ps(matResult.m_fMatrix16, result0);
_mm_store_ps(matResult.m_fMatrix16+4, result1);
_mm_store_ps(matResult.m_fMatrix16+8, result2);
_mm_store_ps(matResult.m_fMatrix16+12, result3);

return matResult;

Here is the disassembly that VC++ generates for the above:

Matrix4 matResult;
004129CF  lea        ecx,
004129D2  call        SkepWorks::Math::Matrix4::Matrix4 (411267h)

__m128 row0, row1, row2, row3;
__m128 base0, base1, base2, base3;
__m128 result0, result1, result2, result3;
__m128 tmp1;

tmp1.m128_i64[0] = 0;
004129D7  mov        dword ptr ,0
004129E1  mov        dword ptr ,0
tmp1.m128_i64[1] = 0;
004129EB  mov        dword ptr ,0
004129F5  mov        dword ptr ,0

row0 = _mm_load_ps(m_fMatrix16);
004129FF  mov        eax,dword ptr
00412A02  movaps      xmm0,xmmword ptr ;error here
00412A05  movaps      xmmword ptr ,xmm0
00412A0C  movaps      xmm0,xmmword ptr
00412A13  movaps      xmmword ptr ,xmm0
row1 = _mm_load_ps(m_fMatrix16+4);
00412A17  mov        eax,dword ptr
00412A1A  movaps      xmm0,xmmword ptr
00412A1E  movaps      xmmword ptr ,xmm0
00412A25  movaps      xmm0,xmmword ptr
00412A2C  movaps      xmmword ptr ,xmm0
row2 = _mm_load_ps(m_fMatrix16+8);
00412A33  mov        eax,dword ptr
00412A36  movaps      xmm0,xmmword ptr
00412A3A  movaps      xmmword ptr ,xmm0
00412A41  movaps      xmm0,xmmword ptr
00412A48  movaps      xmmword ptr ,xmm0
row3 = _mm_load_ps(m_fMatrix16+12);
00412A4F  mov        eax,dword ptr
00412A52  movaps      xmm0,xmmword ptr
00412A56  movaps      xmmword ptr ,xmm0
00412A5D  movaps      xmm0,xmmword ptr
00412A64  movaps      xmmword ptr ,xmm0

base0 = _mm_load_ps(mat.m_fMatrix16);
00412A6B  mov        eax,dword ptr
00412A6E  movaps      xmm0,xmmword ptr
00412A71  movaps      xmmword ptr ,xmm0
00412A78  movaps      xmm0,xmmword ptr
00412A7F  movaps      xmmword ptr ,xmm0
base1 = _mm_load_ps(mat.m_fMatrix16+4);
00412A86  mov        eax,dword ptr
00412A89  movaps      xmm0,xmmword ptr
00412A8D  movaps      xmmword ptr ,xmm0
00412A94  movaps      xmm0,xmmword ptr
00412A9B  movaps      xmmword ptr ,xmm0
base2 = _mm_load_ps(mat.m_fMatrix16+8);
00412AA2  mov        eax,dword ptr
00412AA5  movaps      xmm0,xmmword ptr
00412AA9  movaps      xmmword ptr ,xmm0
00412AB0  movaps      xmm0,xmmword ptr
00412AB7  movaps      xmmword ptr ,xmm0
base3 = _mm_load_ps(mat.m_fMatrix16+12);
00412ABE  mov        eax,dword ptr
00412AC1  movaps      xmm0,xmmword ptr
00412AC5  movaps      xmmword ptr ,xmm0
00412ACC  movaps      xmm0,xmmword ptr
00412AD3  movaps      xmmword ptr ,xmm0

result0 = _mm_add_ps(row0, base0);
00412ADA  movaps      xmm0,xmmword ptr
00412AE1  movaps      xmm1,xmmword ptr
00412AE5  addps      xmm1,xmm0
00412AE8  movaps      xmmword ptr ,xmm1
00412AEF  movaps      xmm0,xmmword ptr
00412AF6  movaps      xmmword ptr ,xmm0
result1 = _mm_add_ps(row1, base1);
00412AFD  movaps      xmm0,xmmword ptr
00412B04  movaps      xmm1,xmmword ptr
00412B0B  addps      xmm1,xmm0
00412B0E  movaps      xmmword ptr ,xmm1
00412B15  movaps      xmm0,xmmword ptr
00412B1C  movaps      xmmword ptr ,xmm0
result2 = _mm_add_ps(row2, base2);
00412B23  movaps      xmm0,xmmword ptr
00412B2A  movaps      xmm1,xmmword ptr
00412B31  addps      xmm1,xmm0
00412B34  movaps      xmmword ptr ,xmm1
00412B3B  movaps      xmm0,xmmword ptr
00412B42  movaps      xmmword ptr ,xmm0
result3 = _mm_add_ps(row3, base3);
00412B49  movaps      xmm0,xmmword ptr
00412B50  movaps      xmm1,xmmword ptr
00412B57  addps      xmm1,xmm0
00412B5A  movaps      xmmword ptr ,xmm1
00412B61  movaps      xmm0,xmmword ptr
00412B68  movaps      xmmword ptr ,xmm0

_mm_store_ps(matResult.m_fMatrix16, result0);
00412B6F  movaps      xmm0,xmmword ptr
00412B76  movaps      xmmword ptr ,xmm0
_mm_store_ps(matResult.m_fMatrix16+4, result1);
00412B7A  movaps      xmm0,xmmword ptr
00412B81  movaps      xmmword ptr ,xmm0
_mm_store_ps(matResult.m_fMatrix16+8, result2);
00412B85  movaps      xmm0,xmmword ptr
00412B8C  movaps      xmmword ptr ,xmm0
_mm_store_ps(matResult.m_fMatrix16+12, result3);
00412B90  movaps      xmm0,xmmword ptr
00412B97  movaps      xmmword ptr ,xmm0

return matResult;

The instruction marked as the error above raises the following first-chance exception:

Unhandled exception at 0x00412a02 in Test.exe: 0xC0000005: Access violation reading location 0x00000000.

The variable m_fMatrix16 is indeed valid, and is an array of sixteen floats.

I'm new to assembly programming, so please tell me if I'm missing something  :mrgreen:

Thanks for your assistance,
exorcist_bob
Posted on 2006-06-24 20:09:58 by exorcist_bob
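Dang, I feel stupid: I had to declare the variable with __declspec(align(16)), because _mm_load_ps (which compiles to movaps) requires its memory operand to be 16-byte aligned.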
Posted on 2006-06-24 20:35:00 by exorcist_bob