Hi all. I was just hoping to get some hints/critiques/criticism on my first MMX function (which converts a 32-bit image to grayscale). I'm a complete newbie when it comes to MMX, so please be gentle, but all comments are appreciated.



void toGrayscaleMMX( void *color32, DWORD pixelCount, void *gray )
{
// Y = 0.299 * R + 0.587 * G + 0.114 * B
// Y = ( 0x4D * R + 0x96 * G + 0x1D * B ) >> 8
static __int64 MULTIPLIERS = 0x0000004D0096001D;

__asm
{
mov ecx, pixelCount
mov esi, color32
mov edi, gray
pxor mm2, mm2
movq mm1, MULTIPLIERS
NEXT_PIXEL:
movd mm0, [esi] // mm0 = 0x0000000000RRGGBB
punpcklbw mm0, mm2 // mm0 = 0x000000RR00GG00BB
pmaddwd mm0, mm1 // 0 + R * MULT | G * MULT + B * MULT
movd ebx, mm0 // ebx = G * MULT + B * MULT
psrlq mm0, 32 // mm0 = 0x00000000 | R * MULT
movd eax, mm0 // eax = 0 + R * MULT

add eax, ebx // eax == R * MULT + G * MULT + B * MULT
shr eax, 8
mov [edi], al

add esi, 4
add edi, 1
loop NEXT_PIXEL
emms
}
}
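
For reference, the same fixed-point math in plain C looks roughly like this (a scalar sketch for comparison, not part of the routine above; the name toGrayscaleC is just illustrative):

#include <windows.h>   /* DWORD, as in the routine above */

/* Scalar model: one 0x00RRGGBB pixel in, one grayscale byte out,
   using the same 8-bit fixed-point weights (0x4D + 0x96 + 0x1D = 0x100). */
void toGrayscaleC( void *color32, DWORD pixelCount, void *gray )
{
    unsigned char *src = (unsigned char *)color32;
    unsigned char *dst = (unsigned char *)gray;

    while ( pixelCount-- )
    {
        unsigned b = src[0], g = src[1], r = src[2];  /* little-endian 0x00RRGGBB */
        *dst++ = (unsigned char)(( 0x4D * r + 0x96 * g + 0x1D * b ) >> 8);
        src += 4;                                     /* skip the unused high byte */
    }
}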


Spara
Posted on 2004-10-19 18:39:15 by Sparafusile
That is a good start, but it would be much faster if you unroll it to process two pixels at once; MOVD is slow compared to MOVQ. Then unroll further and process eight pixels, using MOVQ/MOVNTQ to store eight destination pixels at once.
Posted on 2004-10-19 18:54:35 by bitRAKE
I thought of that, but then I realized that I might go out of bounds of my color data if the pixel count of the image isn't divisible by 2 or 8, respectively. Is there an easy way around this problem, or do I just have to make that a constraint of calling the function?

Spara
Posted on 2004-10-19 18:57:09 by Sparafusile
Process the portion that is divisible, and use a simpler single-pixel routine for the remainder.
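
In C terms the usual shape is something like this (just a sketch of the structure; the MMX bodies go where the comments are):

void toGray( void *color32, DWORD pixelCount, void *gray )
{
    DWORD pairs = pixelCount >> 1;   /* pixels handled two at a time */
    DWORD rest  = pixelCount & 1;    /* 0 or 1 leftover pixel        */

    while ( pairs-- ) { /* ... unrolled MMX path, two pixels per pass ... */ }
    while ( rest--  ) { /* ... simple single-pixel path ...               */ }
}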
Posted on 2004-10-19 19:50:42 by comrade
Here's the second version of my function (I even managed to find some optimizations myself, go me).



void toGrayscaleMMX_Ex( void *color32, DWORD pixelCount, void *gray )
{
// Y = 0.299 * R + 0.587 * G + 0.114 * B
// Y = ( 0x4D * R + 0x96 * G + 0x1D * B ) >> 8
static __int64 MULTIPLIERS = 0x0000004D0096001D;

__asm
{
mov esi, color32
mov edi, gray
movq mm7, MULTIPLIERS
pxor mm6, mm6

// Unrolled 2x loop
mov ecx, pixelCount
shr ecx, 1
jz CONTINUE_1
LOOP_2X:
movq mm0, [esi] // mm0 = 0x00R2G2B200R1G1B1
movq mm1, mm0 // mm1 = mm0
punpckhbw mm1, mm6 // mm1 = 0x000000R200G200B2
pmaddwd mm1, mm7 // mm1 = 0 + R2 * MULT | G2 * MULT + B2 * MULT
punpcklbw mm0, mm6 // mm0 = 0x000000R100G100B1
pmaddwd mm0, mm7 // mm0 = 0 + R1 * MULT | G1 * MULT + B1 * MULT
movq mm2, mm0 // mm2 = mm0
punpckhdq mm0, mm1 // mm0 = 0 + R2 * MULT | 0 + R1 * MULT
punpckldq mm2, mm1 // mm2 = G2 * MULT + B2 * MULT | G1 * MULT + B1 * MULT
paddd mm0, mm2 // mm0 = Y2 << 8 | Y1 << 8
movd eax, mm0 // eax = Y1 << 8
mov [edi], ah // "shr eax, 8", "mov [edi], al"
psrlq mm0, 32 // mm0 = 0x00000000 | Y2 << 8
movd eax, mm0 // eax = Y2 << 8
mov [edi+1], ah // "shr eax, 8", "mov [edi+1], al"
add esi, 8
add edi, 2
loop LOOP_2X

CONTINUE_1:
test pixelCount, 1
jz EXIT_PROC

// Single pixel (only the last one)
movd mm0, [esi] // mm0 = 0x0000000000RRGGBB
punpcklbw mm0, mm6 // mm0 = 0x000000RR00GG00BB (mm6 holds the zero)
pmaddwd mm0, mm7 // 0 + R * MULT | G * MULT + B * MULT
movd ebx, mm0 // ebx = G * MULT + B * MULT
psrlq mm0, 32 // mm0 = 0x00000000 | R * MULT
movd eax, mm0 // eax = 0 + R * MULT

add eax, ebx // eax = R * MULT + G * MULT + B * MULT = Y << 8
mov [edi], ah // "shr eax, 8", "mov [edi], al"
EXIT_PROC:

emms
}
}


I'll work on an 8x unrolled loop next. I see how that could really speed things up. Thanks for all the tips!

Spara
Posted on 2004-10-19 20:51:43 by Sparafusile
movq mm0, [esi]
movq mm1, mm0

psrlw mm0, 8
pand mm1, mm5       ; 00FF

pmaddwd mm0, mm7    ; 0000 00GG
pmaddwd mm1, mm6    ; 00RR 00BB

paddd mm0, mm1

-3 instructions :)
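
In plain C the same split looks like this (a model for illustration only, with mm7 holding the G multiplier and mm6 the R/B pair; grayOnePixel is just an illustrative name):

/* One 0x00RRGGBB pixel; the MMX version does two at once. */
unsigned grayOnePixel( unsigned px )
{
    unsigned g  = ( px >> 8 ) & 0xFF;        /* psrlw mm0, 8 leaves G per word   */
    unsigned rb = px & 0x00FF00FF;           /* pand mm1, mm5 keeps R and B      */
    return ( 0x96 * g                        /* pmaddwd mm0, mm7                 */
           + 0x4D * ( rb >> 16 )             /* pmaddwd mm1, mm6:                */
           + 0x1D * ( rb & 0xFF ) ) >> 8;    /*   R * 0x4D + B * 0x1D, paddd     */
}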
Posted on 2004-10-19 21:19:39 by bitRAKE
Took a few minutes of head scratching, but I figured out what you were doing:



void toGrayscaleMMX_Ex( void *color32, DWORD pixelCount, void *gray )
{
// Y = 0.299 * R + 0.587 * G + 0.114 * B
// Y = ( 0x4D * R + 0x96 * G + 0x1D * B ) >> 8
static __int64 G_MULTS = 0x0000009600000096;
static __int64 RB_MULTS = 0x004D001D004D001D;
static __int64 LOWBYTE_MASK = 0x00FF00FF00FF00FF;

__asm
{
mov esi, color32
mov edi, gray
movq mm7, G_MULTS
movq mm6, RB_MULTS
movq mm5, LOWBYTE_MASK

// Unrolled 2x loop
mov ecx, pixelCount
shr ecx, 1
jz LOOP_1X
LOOP_2X:
movq mm0, [esi] // mm0 = 0x00R2G2B200R1G1B1
movq mm1, mm0 // mm1 = mm0
psrlw mm0, 8 // mm0 = 0x000000G2000000G1
pand mm1, mm5 // mm1 = 0x00R200B200R100B1
pmaddwd mm0, mm7 // mm0 = 0 + G2 * MULT | 0 + G1 * MULT
pmaddwd mm1, mm6 // mm1 = R2 * MULT + B2 * MULT | R1 * MULT + B1 * MULT
paddd mm0, mm1 // mm0 = Y2 << 8 | Y1 << 8
movd eax, mm0 // eax = Y1 << 8
mov [edi], ah // "shr eax, 8", "mov [edi], al"
psrlq mm0, 32 // mm0 = 0x00000000 | Y2 << 8
movd eax, mm0 // eax = Y2 << 8
mov [edi+1], ah // "shr eax, 8", "mov [edi+1], al"
add esi, 8
add edi, 2
loop LOOP_2X

LOOP_1X:
test pixelCount, 1
jz EXIT_PROC

// Single pixel (only the last one)
movd mm0, [esi] // mm0 = 0x0000000000RRGGBB
movq mm1, mm0 // mm1 = mm0
psrlw mm0, 8 // mm0 = 0x00000000000000GG
pand mm1, mm5 // mm1 = 0x0000000000RR00BB
pmaddwd mm0, mm7 // mm0 = 0 | G * MULT
pmaddwd mm1, mm6 // mm1 = 0 | R * MULT + B * MULT
paddd mm0, mm1 // mm0 = 0 | Y << 8
movd eax, mm0 // eax = Y << 8
mov [edi], ah // "shr eax, 8", "mov [edi], al"
EXIT_PROC:

emms
}
}


That might make unrolling the loop 6 more times kinda hard with 3 registers tied up, but I'll keep trying. Thanks for your help, I appreciate it.

Spara
Posted on 2004-10-19 22:50:18 by Sparafusile
Another thing that needs to be mentioned is memory speed. Leaving it out of the discussion implies it takes no time, but we know in reality that is not true. Additionally, a routine like this plows through memory by design, and 24-bit pictures are usually larger than the cache.

The conclusion is that the routine will be memory bound for real-life picture sizes. Only when testing on data already in the cache will we see the speed we are gaining (or when testing on a slow computer).

For these reasons I suggested the use of MOVNTQ above. Even though the destination buffer is only 1/5th of the data accessed, the writes can happen almost for free when bypassing the cache. This results in a ~20% gain on a memory-bound routine! We will see this increase for anything larger than the cache.

The downside is that MOVNTQ will be slower on small data already in the cache, so timing it requires care. When you know the dataset is larger than the cache, always test with data larger than the cache and ignore the small-dataset timings. MOVNTQ will also be slower when the data is needed in the cache afterward.

Imagine we are creating an image processing tool to process 1 TB of data: SOHO satellite data from the sun, or whatever your flavor. If we are just doing this one function, then MOVNTQ is the best the x86 has to offer. But two or more functions can be done in the same amount of time on a fast CPU by keeping the data in the cache until it is no longer needed! The data would be broken into cache-sized chunks, with the final function storing with MOVNTQ (or another cache-bypassing instruction).
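
A sketch of that chunking idea in C (adjustBrightness is a hypothetical first pass, and CHUNK_PIXELS is a guess to be sized against the actual L2):

#include <windows.h>   /* DWORD */

void adjustBrightness( void *color32, DWORD pixelCount );              /* hypothetical pass 1 */
void toGrayscaleMMX_Ex( void *color32, DWORD pixelCount, void *gray ); /* the routine above   */

#define CHUNK_PIXELS (32 * 1024)   /* ~128 KB of source; assume it fits in L2 */

void processImage( void *color32, DWORD pixelCount, void *gray )
{
    unsigned char *src = (unsigned char *)color32;
    unsigned char *dst = (unsigned char *)gray;

    while ( pixelCount )
    {
        DWORD n = pixelCount < CHUNK_PIXELS ? pixelCount : CHUNK_PIXELS;
        adjustBrightness( src, n );        /* pass 1 works on cached data   */
        toGrayscaleMMX_Ex( src, n, dst );  /* last pass stores with MOVNTQ  */
        src += n * 4;
        dst += n;
        pixelCount -= n;
    }
}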

Can you see the metaphor here to parallel/serial circuits? We require both types of processing to work efficiently. This is a scalable concept that permeates computer architectures.
Posted on 2004-10-20 00:27:46 by bitRAKE
As a general rule of thumb, I use MOVNTQ and its variants to write to memory when I get into the hundreds of KB of data.
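
For anyone using intrinsics instead of inline asm, MOVNTQ is exposed as _mm_stream_pi in MSVC's xmmintrin.h. A minimal sketch, assuming len is a byte count that is a multiple of 8 and dst is 8-byte aligned for best speed:

#include <xmmintrin.h>   /* __m64, _mm_stream_pi, _mm_sfence */
#include <mmintrin.h>    /* _mm_empty */

void streamStore( __m64 *dst, const __m64 *src, unsigned len )
{
    unsigned i;
    for ( i = 0; i < len / 8; i++ )
        _mm_stream_pi( dst + i, src[i] );  /* MOVNTQ: the write bypasses the cache */
    _mm_sfence();                          /* flush the write-combining buffers    */
    _mm_empty();                           /* EMMS: leave the FPU/MMX state clean  */
}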
Posted on 2004-10-20 10:18:19 by mark_larson
I've been working on my 8x unrolled loop, but have run into a small problem. Here's the code:


mov ecx, pixelCount
shr ecx, 3
jz SKIP_8X
LOOP_8X:
movq mm0, [esi] // mm0 = 0x00R2G2B200R1G1B1
movq mm1, mm0 // mm1 = mm0
psrlw mm0, 8 // mm0 = 0x000000G2000000G1
pand mm1, mm5 // mm1 = 0x00R200B200R100B1
pmaddwd mm0, mm7 // mm0 = 0 + G2 * MULT | 0 + G1 * MULT
pmaddwd mm1, mm6 // mm1 = R2 * MULT + B2 * MULT | R1 * MULT + B1 * MULT
paddd mm0, mm1 // mm0 = Y2 << 8 | Y1 << 8

movq mm1, [esi+8] // mm1 = 0x00R4G4B400R3G3B3
movq mm2, mm1 // mm2 = mm1
psrlw mm1, 8 // mm1 = 0x000000G4000000G3
pand mm2, mm5 // mm2 = 0x00R400B400R300B3
pmaddwd mm1, mm7 // mm1 = 0 + G4 * MULT | 0 + G3 * MULT
pmaddwd mm2, mm6 // mm2 = R4 * MULT + B4 * MULT | R3 * MULT + B3 * MULT
paddd mm1, mm2 // mm1 = Y4 << 8 | Y3 << 8

packssdw mm0, mm1 // mm0 = Y4 | Y3 | Y2 | Y1

movq mm1, [esi+16] // mm1 = 0x00R6G6B600R5G5B5
movq mm2, mm1 // mm2 = mm1
psrlw mm1, 8 // mm1 = 0x000000G6000000G5
pand mm2, mm5 // mm2 = 0x00R600B600R500B5
pmaddwd mm1, mm7 // mm1 = 0 + G6 * MULT | 0 + G5 * MULT
pmaddwd mm2, mm6 // mm2 = R6 * MULT + B6 * MULT | R5 * MULT + B5 * MULT
paddd mm1, mm2 // mm1 = Y6 << 8 | Y5 << 8

movq mm2, [esi+24] // mm2 = 0x00R8G8B800R7G7B7
movq mm3, mm2 // mm3 = mm2
psrlw mm2, 8 // mm2 = 0x000000G8000000G7
pand mm3, mm5 // mm3 = 0x00R800B800R700B7
pmaddwd mm2, mm7 // mm2 = 0 + G8 * MULT | 0 + G7 * MULT
pmaddwd mm3, mm6 // mm3 = R8 * MULT + B8 * MULT | R7 * MULT + B7 * MULT
paddd mm2, mm3 // mm2 = Y8 << 8 | Y7 << 8

packssdw mm1, mm2 // mm1 = Y8 | Y7 | Y6 | Y5

psrlw mm0, 8
psrlw mm1, 8
packuswb mm0, mm1 // mm0 = 0xY8Y7Y6Y5Y4Y3Y2Y1
movntq [edi], mm0

add esi, 32
add edi, 8
loop LOOP_8X

SKIP_8X:

The problem, I believe, lies in the


packssdw mm0, mm1 // mm0 = Y4 | Y3 | Y2 | Y1
packssdw mm1, mm2 // mm1 = Y8 | Y7 | Y6 | Y5

lines. When the function returns, all the values in my grayscale image are set to 127 (0x7F): PACKSSDW saturates large values to 0x7FFF, and 0x7FFF >> 8 is 0x7F. Is there such a thing as PACKUSDW? Or is there some way to get around this problem?

I tried shifting the DWORD-oriented registers right while they still held only two Y values each, before packing them together, but then all my grayscale values were set to 255 (0xFF).
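
To see the saturation concretely, here is a small C model of what PACKSSDW does to one of those dwords (my own illustration, not code from the routine):

#include <stdio.h>

/* packssdw clamps each signed 32-bit value into a signed 16-bit word */
short packssdw_one( int v )
{
    if ( v > 32767 )  return 32767;
    if ( v < -32768 ) return -32768;
    return (short)v;
}

int main( void )
{
    int y8 = 0x4D * 255 + 0x96 * 255 + 0x1D * 255;  /* white pixel: Y << 8 = 0xFF00 */
    unsigned short p = (unsigned short)packssdw_one( y8 );
    printf( "0x%X -> 0x%X -> 0x%X\n", y8, p, p >> 8 );  /* 0xFF00 -> 0x7FFF -> 0x7F */
    return 0;
}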

Spara
Posted on 2004-10-20 14:14:08 by Sparafusile
I haven't tested any code in this thread.
	movq mm0, [esi][8*0]	; ..r1g1b1..r0g0b0

movq mm1, mm0
psrlw mm0, 8 ; ......g1......g0
pand mm1, mm5 ; ..r1..b1..r0..b0
pmaddwd mm0, mm7
pmaddwd mm1, mm6
paddd mm0, mm1 ; ....y1xx....y0xx

movq mm2, [esi][8*1] ; ..r3g3b3..r2g2b2
movq mm3, mm2
psrlw mm2, 8 ; ......g3......g2
pand mm3, mm5 ; ..r3..b3..r2..b2
pmaddwd mm2, mm7
pmaddwd mm3, mm6
paddd mm2, mm3 ; ....y3xx....y2xx

psrlw mm0, 8 ; ......y1......y0
psrlw mm2, 8 ; ......y3......y2
packuswb mm0, mm2 ; ..y3..y2..y1..y0


movq mm4, [esi][8*2] ; ..r5g5b5..r4g4b4
movq mm1, mm4
psrlw mm4, 8 ; ......g5......g4
pand mm1, mm5 ; ..r5..b5..r4..b4
pmaddwd mm4, mm7
pmaddwd mm1, mm6
paddd mm4, mm1 ; ....y5xx....y4xx

movq mm2, [esi][8*3] ; ..r7g7b7..r6g6b6
movq mm3, mm2
psrlw mm2, 8 ; ......g7......g6
pand mm3, mm5 ; ..r7..b7..r6..b6
pmaddwd mm2, mm7
pmaddwd mm3, mm6
paddd mm2, mm3 ; ....y7xx....y6xx

psrlw mm4, 8 ; ......y5......y4
psrlw mm2, 8 ; ......y7......y6
packuswb mm4, mm2 ; ..y7..y6..y5..y4

packuswb mm0, mm4 ; y7y6y5y4y3y2y1y0

movntq [edi], mm0
I think it would be nice to tweak the constants used for the conversion and use DWORD shifts prior to saturation to allow greater flexibility; the code would execute at the same speed, of course.
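
For example (my arithmetic, not tested in the routine): scale the weights by 2^15 instead of 2^8, so they still fit PMADDWD's signed words, and shift the dwords right by 15 before packing. In scalar C terms:

/* 0.299 * 32768 ~ 9798, 0.587 * 32768 ~ 19235, 0.114 * 32768 ~ 3736;
   all fit in a signed 16-bit word, and the sum of the maxima still
   packs to 255 after the shift. */
unsigned grayHiPrec( unsigned px )   /* px = 0x00RRGGBB */
{
    unsigned r = ( px >> 16 ) & 0xFF, g = ( px >> 8 ) & 0xFF, b = px & 0xFF;
    return ( 9798 * r + 19235 * g + 3736 * b ) >> 15;
}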

If we are lucky maybe it runs at ~3 cycles per pixel on data in L1 cache. :)
Posted on 2004-10-21 23:34:44 by bitRAKE
That's what I got too. I realized my problem was typing the wrong number for one of the MMX registers. All my timing runs say it runs at about 4.5 cycles per pixel, but I don't know how to tell whether the original color data is in the cache or not.
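
One way to tell is to time the routine at two working-set sizes (a sketch; the sizes are assumptions for a CPU with ~512 KB of L2, and it links against the routine from this thread):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <windows.h>   /* DWORD */

void toGrayscaleMMX_Ex( void *color32, DWORD pixelCount, void *gray );

void timeAt( DWORD pixels, const char *label )
{
    void *src = malloc( pixels * 4 );
    void *dst = malloc( pixels );
    int i, runs = 100;
    clock_t t0;

    memset( src, 0x80, pixels * 4 );   /* touch once so run 1 isn't special */
    t0 = clock();
    for ( i = 0; i < runs; i++ )
        toGrayscaleMMX_Ex( src, pixels, dst );
    printf( "%s: %.2f ns/pixel\n", label,
            1e9 * ( clock() - t0 ) / CLOCKS_PER_SEC / ( (double)runs * pixels ) );
    free( src );
    free( dst );
}

int main( void )
{
    timeAt( 16 * 1024, "hot: 64 KB source, stays cached" );
    timeAt( 4 * 1024 * 1024, "cold: 16 MB source, always misses" );
    return 0;
}

If the per-pixel time jumps between the two, the small case was running from cache.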

I noticed that you paired the PSRLW instructions together. Did you do this on purpose? It seems it would be faster to separate them, since they use the same resources on the processor.

Spara
Posted on 2004-10-22 00:17:05 by Sparafusile
I noticed that you paired the PSRLW instructions together. Did you do this on purpose? It seems it would be faster to separate them, since they use the same resources on the processor.
I assume you are using an Intel CPU; sometimes AMDs can execute three MMX instructions in one cycle. The pairing is just for readability. As with mark_larson's suggestion, the instruction order is CPU dependent. Intel will be releasing CPUs with very large caches for the general PC audience next year.

By 'touching' a cache line it is loaded into the L1 cache, so prefetch instructions are not required to do anything; newer chips will auto-fetch and re-arrange instructions to some degree. :)
Posted on 2004-10-22 11:03:46 by bitRAKE
I noticed that you paired the PSRLW instructions together. Did you do this on purpose? It seems it would be faster to separate them, since they use the same resources on the processor.
I assume you are using an Intel CPU; sometimes AMDs can execute three MMX instructions in one cycle. The pairing is just for readability. As with mark_larson's suggestion, the instruction order is CPU dependent.

On Intel you can't pair MMX instructions (there are a few exceptions). Generally you can get a speedup by using different execution units. If you download the P4 optimization manual, it lists all the execution units that different instructions use.



Intel will be releasing CPUs with very large caches for the general PC audience next year.

By 'touching' a cache line it is loaded into the L1 cache, so prefetch instructions are not required to do anything; newer chips will auto-fetch and re-arrange instructions to some degree. :)


Yep, the P4 has a hardware prefetcher that prefetches 512 bytes ahead automatically, without program intervention. Even with the hardware prefetcher, carefully placed "prefetch" instructions in your code can gain another 10%-30% in speed.
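
In intrinsic form that looks something like this (a sketch; the 512-byte distance and the NTA hint are only starting points to tune, and the per-line processing body is elided):

#include <xmmintrin.h>   /* _mm_prefetch, _MM_HINT_NTA */

void convertWithPrefetch( const unsigned char *src, unsigned bytes )
{
    unsigned i;
    for ( i = 0; i < bytes; i += 64 )   /* one 64-byte cache line per step */
    {
        _mm_prefetch( (const char *)( src + i + 512 ), _MM_HINT_NTA );
        /* ... convert the 16 pixels in the line at src + i here ... */
    }
}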
Posted on 2004-10-23 18:01:54 by mark_larson