Friend of mine ask me to speed up one of his function, written in C. After scratching a head (I never playing with FPU before) I've managed to translate it. Here is C source:

``````double	cur_discr = 0.0;	//current temperature difference
double	max_cur_discr = 0.0;//maximum cur_discr in iter.
double	max_discr = 1.0;	//-=-, after 1000 iter.
double	t0;					//temperature before recalc
long	needCountDiscr = false;//do we need to calc discrepancy

do
{
for (int i = 1; i < height-1; i++)
{
//if (0 == i || (height - 1) == i) continue;
for (int j = 0; j < width; j++)
{
left = j - 1;
top = i - 1;
right = j + 1;
bottom = i + 1;
if (0 == j)
left = width - 1;
if ((width - 1) == j)
right = 0;

t0 = Tmp(i, j);
double test1 = Cleft (i, j);
double test2 = Ctop (i, j);
double test3 = Cright (i, j);
double test4 = Cbottom (i, j);
Tmp(i, j) =
Tmp(i, left)	/ Cleft (i, j)		+
Tmp(top, j)		/ Ctop (i, j)		+
Tmp(i, right)	/ Cright (i, j)		+
Tmp(bottom, j)	/ Cbottom (i, j);

if (needCountDiscr)
{
cur_discr = Tmp(i, j) - t0;
if (cur_discr < 0.0) cur_discr *= -1.0;
if (cur_discr > max_cur_discr)
max_cur_discr = cur_discr;
}
}
}
if (needCountDiscr)
{
max_discr = max_cur_discr;
max_cur_discr = 0.0;
}
m_IterCount++;
if(!(m_IterCount % 1000))
needCountDiscr = true;
else
needCountDiscr = false;
} while (max_discr > m_Props.discrepancy);``````

And here is assembly:
``````_asm {
//int 3
mov esi, Tmp
mov esi, [esi+10h]
mov esi, [esi+4] ; esi is a start of doubles

__do:
xor ecx, ecx
__next_inner:
inc ecx
cmp ecx, total
jae __end_outer
emms
//int 3
mov edi, Cbottom+10h
mov edi, [edi+4]
lea edi, [edi+ecx*8]
fld qword ptr [edi] ; st3
mov edi, Ctop+10h
mov edi, [edi+4]
lea edi, [edi+ecx*8]
fld qword ptr [edi] ; st2
mov edi, Cright+10h
mov edi, [edi+4]
lea edi, [edi+ecx*8]
fld qword ptr [edi] ; st1
mov edi, Cleft+10h
mov edi, [edi+4]
lea edi, [edi+ecx*8]
fld qword ptr [edi] ; st0

lea edi, [esi+ecx*8-8] // left
fld qword ptr [edi]
fxch
fdivp st(1), st(0) // Tmp/left

lea edi, [esi+ecx*8+8] ; right
fld qword ptr [edi]
fdiv st(0), st(2) ; Tmp/right

mov edx, width
shl edx, 3 ; *8
mov eax, edx
neg eax
lea eax, [ecx*8+eax]
lea edi, [esi+eax] ; top
fld qword ptr [edi]
fdiv st(0), st(3) ; Tmp/top

lea eax, [ecx*8+edx]
lea edi, [esi+eax] ; bottom
fld qword ptr [edi]
fdiv st(0), st(4) ; Tmp/bottom

lea edi, [esi+ecx*8]
fld qword ptr [edi] ; t0

fxch

fst qword ptr [edi] ; save current Tmp(i, j)

mov eax, needCountDiscr
test eax, eax
jz __next_inner
//int 3
fsub st(0), st(1)
fldz
fcomip st(0), st(1)
jz __nextcmp
jc __nextcmp
fchs ; *-1
__nextcmp:
lea eax, cur_discr
lea edx, max_cur_discr
fst qword ptr [eax]
fld qword ptr [edx]
fcomip st(0), st(1)
jnc __next_inner
fst qword ptr [edx]

jmp __next_inner
__end_outer:
mov eax, needCountDiscr
test eax, eax
jz __inc_iter
//int 3
emms
lea eax, max_discr
lea edx, max_cur_discr
fld qword ptr [edx]
fstp qword ptr [eax]
fldz
fstp qword ptr [edx]
__inc_iter:
inc IterCount
mov eax, IterCount
mov ebx, 1000
cdq
idiv ebx
xor eax, eax
test edx, edx
cmp edx, 1
mov needCountDiscr, eax

lea eax, max_discr
fld qword ptr [eax]
lea eax, discr
fld qword ptr [eax]
; st1 = max_discr, st0 = discr
fcomip st(0), st(1)
jc __do
jz __do
}
``````

Quite a big piece of code :)
This code is executed about 200 000 of iteration and each iteration include running through every Tmp(i, j) element, which is a REAL8 (or double in C), length of Tmp field is a little more than 20 000 element, Cxxxx all have the same size. The problem is Athlon execute this code much more faster than P4.
Approximately, Athlon XP 2200+ spends about 33 minutes, P4 2 GHz with non-optimized C code spends about 50 minites, but with optimization more than 5 times slower. I know about Athlon executes FPU operation a little faster than P4, but difference is to huge... All data aligned properly.
Any suggestions?
Posted on 2003-06-08 09:43:09 by masquer
Lame low level analysis:
``````jc __?
jz __?

[b]Same as:[/b]

jna __?``````

``````
lea	edi, [edi+ecx*8]
fld	qword ptr [edi]

[b]Same as:[/b]

fld	qword ptr [edi+ecx*8]``````

FXCH is virtually free, but...
``````
fld qword ptr [edi]
fxch
fdivp st(1), st(0) // Tmp/left

[b]Same as:[/b]

fld qword ptr [edi]
fdivrp // Tmp/left``````

``````
mov edx, width
shl edx, 3 ; *8
mov eax, edx
neg eax
lea eax, [ecx*8+eax]
lea edi, [esi+eax] ; top

[b]Same as:[/b]

mov edx, width
mov eax, ecx
sub eax, edx
shl edx, 3
fld qword ptr [esi + eax*8]
``````

No branch needed here!
``````
if (cur_discr < 0.0) cur_discr *= -1.0;

[b]Same as:[/b]

fabs cur_discr``````
I have not looked at the algorithm at a higher level, just instruction usage. There is much work to be done here, imho. Intel has given up on improving the FPU -- SSE2 should be used if possible.
Posted on 2003-06-08 10:53:30 by bitRAKE
Thanks bitRAKE,

Lame low level analysis

Absolutely agree :)

But is P4 so slow with FPU? And what is your forecast about using SSE2 here?
Posted on 2003-06-08 11:23:59 by masquer
SSE2 should be much faster on P4.
Posted on 2003-06-08 11:41:13 by Eóin
Originally posted by masquer
But is P4 so slow with FPU? And what is your forecast about using SSE2 here?
In comparison to a similarly clocked Athon, yes - the P4 is only half the machine when comes to FPU. The problems in the code above are the result of other factors as well - there is much more branching than needed (the nasty unpredicatable type, too). P4's really suffer for random branching - give them a straight piece of code and they'll plow right through it.

Using SSE2 here should help to remove all of the bad branching.

Yes, the "5 times slower" is not what I would expect, but I'd have to see the code generated by the compiler to comment on that.
Posted on 2003-06-08 11:45:26 by bitRAKE
Hate to think about SSE2, since I have no P4 around, only Athlons :)
I can remove some branching here but seems it does nothing on P4 :(
OK, here is compiler's output in attachment.
Posted on 2003-06-09 09:16:01 by masquer