I'm currently implementing a contrast routine to be applied in real time.. the basic calculation is this, applied to every color channel (R,G,B)

color = ((color - 128) * contrast ) + 128

where a contrast of 0.0 - 1.0 creates contrast between grey and the original, and a contrast greater than 1.0 creates extreme contrast away from the original.. I usually max it at about 4.0

however i decided to make 0.0 to 1.0 to be 0 to 255 , and thus 4.0 would be 1023 or something..

so I unpack the bytes into words, and multiply them

however i got 2 problems..

if the contrast is greater than 1.0 the result goes over 16 bits (actually, really only if it is greater than 5.0, since it's signed math)

and MMX multiplications are signed only and also return only the upper or the lower 16 bits.. (when really I need them both before I can do my shift right by 8 to divide by 256 and bring the result back down to a NORMAL amount)..

so to really do a MMX multiply 16 bits into 32 bits i have to do it twice, once high and once low.. this seems kinda silly

pmullw mm0,mm6; ;// lr0,lb0,lR0,lB0

pmullw mm1,mm6; // la0,lg0,lA0,lG0

pmulhw mm2,mm6; ;// hr0,hb0,hR0,hB0

pmulhw mm3,mm6; // ha0,hg0,hA0,hG0

then i have to pack them correctly as well..

is that slow? it means for these 2 pixels I am doing 4 multiplications (even though it's a double-up of the same multiplication anyway)

then i have to pack them around .. i suppose in normal i'd have to do 8 muls anyway for the red,green and blue ,alpha of both pixels

anyhow here is my algorithm mostly

movq mm0, ;

movq mm1, ;

//mm0 = a0,r0,g0,b0,A0,R0,G0,B0

//SEPERATE INTO WORDS

punpcklbw mm0,mm5 ;//mm1=w( r0,b0,R0,b0)

punpckhbw mm1,mm5 ;//m2 =w( a0,g0,A0,G0)

//Subtrast midpoint

psubw mm0,mm7; ;// amount - 128

psubw mm1,mm7;

movq mm2,mm0

movq mm3,mm1

//multiply (l means low, h means high)

pmullw mm0,mm6; ;// lr0,lb0,lR0,lB0

pmullw mm1,mm6; // la0,lg0,lA0,lG0

pmulhw mm2,mm6; ;// hr0,hb0,hR0,hB0

pmulhw mm3,mm6; // ha0,hg0,hA0,hG0

movq mm4,mm0

//lr0,lb0,lR0,lB0

//hr0,hb0,hR0,hB0

//pack high and low results back together

punpckhwd mm0,mm2 ;//mm0 = (hr0 lr0) (hb0 lb0)

punpcklwd mm4,mm2 ;//mm4 = (hRO lR0) (hBO LBO)

// la0,lg0,lA0,lG0

// ha0,hg0,hA0,hG0

movq mm5,mm1

punpckhwd mm1,mm3 ; //(ha0 la0) (hg0 lg0)

punpcklwd mm5,mm3 ; //(hA0 la0 )(hGO LGO)

//divide by 256, thus because our multiplier is never too large, the upper 16 bits of each DWORD will be

//zero after this.

psrld mm0,8 ;// 00 r0 00 b0

psrld mm4,8 ;// 00 R0 00 B0

psrld mm1,8 ;// 00 a0 00 g0

psrld mm5,8 ;// 00 A0 00 G0

is this still more efficient than, say, a non-MMX method?

now after this though i am having problems getting everything back in order..

basically each of those values may contain something from maybe -512 to +512 or something.. and I will pack with saturation to clamp it to the range 0 to 255

which is easy.. but its just getting it back into correct order

i can do something like this

packssdw mm0,mm1 ;//a0 r0 g0 b0

packssdw mm4,mm5 ;//A0 R0 G0 B0

packssdw mm4,mm1 ;//a0 R0 g0 B0

packssdw mm0,mm5; //A0 r0 G0 b0

to get them into correct 16 bit order (in this case I tried interlacing the different pixels' values)

but then there is no way i can find to pack them into right order a0 r0 g0 b0 A0 R0 G0 B0

like here i want to pack with unsigned saturation say

packssusb mm4,mm0 ;//a0 a0 r0 R0 g0 G0 b0 B0

which isn't in the right order as i want..

pulling my hair out over this.. I know others probably have experience and would know how to order things upstream a bit differently so that they could be

packed down into the right order.

Karl

color = ((color - 128) * contrast ) + 128

where a contrast of 0.0 - 1.0 creates contrast between grey and the original, and a contrast greater than 1.0 creates extreme contrast away from the original.. I usually max it at about 4.0

however i decided to make 0.0 to 1.0 to be 0 to 255 , and thus 4.0 would be 1023 or something..

so I unpack the bytes into words, and multiply them

however i got 2 problems..

if the contrast is greater than 1.0 the result goes over 16 bits (actually, really only if it is greater than 5.0, since it's signed math)

and MMX multiplications are signed only and also return only the upper or the lower 16 bits.. (when really I need them both before I can do my shift right by 8 to divide by 256 and bring the result back down to a NORMAL amount)..

so to really do a MMX multiply 16 bits into 32 bits i have to do it twice, once high and once low.. this seems kinda silly

pmullw mm0,mm6; ;// lr0,lb0,lR0,lB0

pmullw mm1,mm6; // la0,lg0,lA0,lG0

pmulhw mm2,mm6; ;// hr0,hb0,hR0,hB0

pmulhw mm3,mm6; // ha0,hg0,hA0,hG0

then i have to pack them correctly as well..

is that slow? it means for these 2 pixels I am doing 4 multiplications (even though it's a double-up of the same multiplication anyway)

then i have to pack them around .. i suppose in normal i'd have to do 8 muls anyway for the red,green and blue ,alpha of both pixels

anyhow here is my algorithm mostly

movq mm0, ;

movq mm1, ;

//mm0 = a0,r0,g0,b0,A0,R0,G0,B0

//SEPERATE INTO WORDS

punpcklbw mm0,mm5 ;//mm1=w( r0,b0,R0,b0)

punpckhbw mm1,mm5 ;//m2 =w( a0,g0,A0,G0)

//Subtrast midpoint

psubw mm0,mm7; ;// amount - 128

psubw mm1,mm7;

movq mm2,mm0

movq mm3,mm1

//multiply (l means low, h means high)

pmullw mm0,mm6; ;// lr0,lb0,lR0,lB0

pmullw mm1,mm6; // la0,lg0,lA0,lG0

pmulhw mm2,mm6; ;// hr0,hb0,hR0,hB0

pmulhw mm3,mm6; // ha0,hg0,hA0,hG0

movq mm4,mm0

//lr0,lb0,lR0,lB0

//hr0,hb0,hR0,hB0

//pack high and low results back together

punpckhwd mm0,mm2 ;//mm0 = (hr0 lr0) (hb0 lb0)

punpcklwd mm4,mm2 ;//mm4 = (hRO lR0) (hBO LBO)

// la0,lg0,lA0,lG0

// ha0,hg0,hA0,hG0

movq mm5,mm1

punpckhwd mm1,mm3 ; //(ha0 la0) (hg0 lg0)

punpcklwd mm5,mm3 ; //(hA0 la0 )(hGO LGO)

//divide by 256, thus because our multiplier is never too large, the upper 16 bits of each DWORD will be

//zero after this.

psrld mm0,8 ;// 00 r0 00 b0

psrld mm4,8 ;// 00 R0 00 B0

psrld mm1,8 ;// 00 a0 00 g0

psrld mm5,8 ;// 00 A0 00 G0

is this still more efficient than, say, a non-MMX method?

now after this though i am having problems getting everything back in order..

basically each of those values may contain something from maybe -512 to +512 or something.. and I will pack with saturation to clamp it to the range 0 to 255

which is easy.. but its just getting it back into correct order

i can do something like this

packssdw mm0,mm1 ;//a0 r0 g0 b0

packssdw mm4,mm5 ;//A0 R0 G0 B0

packssdw mm4,mm1 ;//a0 R0 g0 B0

packssdw mm0,mm5; //A0 r0 G0 b0

to get them into correct 16 bit order (in this case I tried interlacing the different pixels' values)

but then there is no way i can find to pack them into right order a0 r0 g0 b0 A0 R0 G0 B0

like here i want to pack with unsigned saturation say

packssusb mm4,mm0 ;//a0 a0 r0 R0 g0 G0 b0 B0

which isn't in the right order as i want..

pulling my hair out over this.. I know others probably have experience and would know how to order things upstream a bit differently so that they could be

packed down into the right order.

Karl