Hi all,

I have a situation where I need to transpose a 6x4 bit matrix, stored in six 32-bit variables. So I start with this:

--------------------------a1a2a3a4
--------------------------b1b2b3b4
--------------------------c1c2c3c4
--------------------------d1d2d3d4
--------------------------e1e2e3e4
--------------------------f1f2f3f4


And I'd like to end up with:

------------------------a1b1c1d1e1f1
------------------------a2b2c2d2e2f2
------------------------a3b3c3d3e3f3
------------------------a4b4c4d4e4f4


Of course I could do it one bit at a time, but I really want to do it as fast as possible. All tricks allowed. 8)

Thanks!
Posted on 2005-08-27 14:38:56 by C0D1F1ED
Here's what I came up with.
It's pretty long, but seems to be efficient.
The only other way I would think about doing it would be to load all the data into
XMM0 and XMM1 and then use PEXTRW to get 2 bytes out at a time and move them to the correct locations. If the following solution is too slow I'd suggest implementing the SSE one described above.


6x4:
d1 dd ? ;a1-4
d2 dd ? ;b1-4
d3 dd ? ;c1-4
d4 dd ? ;d1-4
d5 dd ? ;e1-4
d6 dd ? ;f1-4
4x6:
nd1 rb 8 ;abcdef1 0 0 ;2zero bytes as buffer for alignement
nd2 rb 8 ;abcdef2 0 0 ; and easier write
nd3 rb 8 ;abcdef3 0 0
nd4 rb 8 ;abcdef4 0 0

;;;;;;;;;;;;;;ALL REGISTERS ARE KILLED
mov eax,
movzx ebx,al
movzx ecx,ah
shr eax,16
movzx edx,al
movzx esi,ah
shl ebx,8
shl ecx,8
shl edx,8
shl esi,8
;;;;;;;;;d1-4 in place
;;;;;;;;;working backwards to avoid a 8bswap's
mov eax,
movzx ebp,al
movzx edi,ah
or ebx,ebp
or ecx,edi
shr eax,16
movzx ebp,al
movzx edi,ah
or edx,ebp
or esi,edi
shl ebx,8
shl ecx,8
shl edx,8
shl esi,8
;;;;;;;;;c1-4 in place
mov eax,
movzx ebp,al
movzx edi,ah
or ebx,ebp
or ecx,edi
shr eax,16
movzx ebp,al
movzx edi,ah
or edx,ebp
or esi,edi
shl ebx,8
shl ecx,8
shl edx,8
shl esi,8
;;;;;;;;;;b1-4 in place
mov eax,
movzx ebp,al
movzx edi,ah
or ebx,ebp
or ecx,edi
shr eax,16
movzx ebp,al
movzx edi,ah
or edx,ebp
or esi,edi
;;;;;;;;;;;a1-4 in place
;;;;;;;;;;;write and repeat for e1-4 & f1-4
mov ,ebx
mov ,ecx
mov ,edx
mov ,esi
;;;;;;;;;; last 2 columns
mov eax,
movzx ebx,al
movzx ecx,ah
shr eax,16
movzx edx,al
movzx esi,ah
shl ebx,8
shl ecx,8
shl edx,8
shl esi,8
;;;;;;;;;f1-4 in place now do e1-4
mov eax,
movzx ebp,al
movzx edi,ah
or ebx,ebp
or ecx,edi
shr eax,16
movzx ebp,al
movzx edi,ah
or edx,ebp
or esi,edi
;;;;;;;;;;write the last 2 columns
mov ,ebx
mov ,ecx
mov ,edx
mov ,esi
Posted on 2005-08-27 18:44:54 by r22
Of course I could do it one bit at a time,


This is probably the way you would have intended to proceed.

.data
    A6x4  dd  6 dup(?)  ;original 6x4 bit matrix
    B4x6  dd  4 dup(?)  ;converted 4x6 bit matrix

.code
    mov  edi,offset A6x4
    mov  bl,      ;a1a2a3a4
    add  edi,4
    mov  bh,      ;b1b2b3b4
    add  edi,4
    mov  cl,      ;c1c2c3c4
    add  edi,4
    mov  ch,      ;d1d2d3d4
    add  edi,4
    mov  dl,      ;e1e2e3e4
    add  edi,4
    mov  dh,      ;f1f2f3f4

    mov  edi,offset B4x6 + 16
    mov  esi,4
  @@:
    xor  eax,eax
    shr  bl,1
    rcl  al,1
    shr  bh,1
    rcl  al,1
    shr  cl,1
    rcl  al,1
    shr  ch,1
    rcl  al,1
    shr  dl,1
    rcl  al,1
    shr  dh,1
    rcl  al,1
    sub  edi,4
    mov  ,eax
    dec  esi
    jnz  @B


Raymond
Posted on 2005-08-27 20:55:53 by Raymond
That's awesome, thanks!

It already works great now (it's for encryption by the way), but I noticed that I actually only need to transpose 3x4 bits to make it work the same way. So it gets reduced to:

--------------------------a1a2a3a4
--------------------------b1b2b3b4
--------------------------c1c2c3c4

---------------------------a1b1c1
---------------------------a2b2c2
---------------------------a3b3c3
---------------------------a4b4c4


This enables the use of a lookup table, using the 12 input bits as offset into the table. This takes at least 8 kB of precious memory though, so I'm wondering if an arithmetic approach could be equally fast? This operation is really time critical to the whole application...

Here's my first attempt (in C code):


unsigned int a = (a1a2a3a4 >> 1) | (a1a2a3a4 << 4) | (a1a2a3a4 << 9) | (a1a2a3a4 << 14);
unsigned int b = (b1b2b3b4 >> 2) | (b1b2b3b4 << 3) | (b1b2b3b4 << 8) | (b1b2b3b4 << 13);
unsigned int c = (c1c2c3c4 >> 3) | (c1c2c3c4 << 2) | (c1c2c3c4 << 7) | (c1c2c3c4 << 12);

a = a & 0x00004444;
b = b & 0x00002222;
c = c & 0x00001111;

a = a | b | c;

unsigned int r0 = (a & 0x00000007) >> 0;
unsigned int r1 = (a & 0x00000070) >> 4;
unsigned int r2 = (a & 0x00000700) >> 8;
unsigned int r3 = (a & 0x00007000) >> 12;


With the 4-bit registers in assembler this can probably be done a lot faster...
Posted on 2005-08-27 21:49:44 by C0D1F1ED
Actually I can let three small lookup tables do all the work:


unsigned short lut0[16] =
{
    0x0000,
    0x0001,
    0x0010,
    0x0011,
    0x0100,
  ...
};

unsigned short lut1[16] =
{
    0x0000,
    0x0002,
    0x0020,
    0x0022,
    0x0200,
  ...
};

unsigned short lut2[16] =
{
    0x0000,
    0x0004,
    0x0040,
    0x0044,
    0x0400,
  ...
};

unsigned short _a4b4c4_a3b3c3_a2b2c2_a1b1c1 = lut2 | lut1 | lut0;

unsigned short _a1b1c1 = (_a4b4c4_a3b3c3_a2b2c2_a1b1c1 & 0x0007) >> 0;
unsigned short _a2b2c2 = (_a4b4c4_a3b3c3_a2b2c2_a1b1c1 & 0x0070) >> 4;
unsigned short _a3b3c3 = (_a4b4c4_a3b3c3_a2b2c2_a1b1c1 & 0x0700) >> 8;
unsigned short _a4b4c4 = (_a4b4c4_a3b3c3_a2b2c2_a1b1c1 & 0x7000) >> 12;


So essentially it's three lookups and two or operations. 8) The tables take only 3 x 16 x 2 = 96 bytes.
Posted on 2005-08-27 22:24:16 by C0D1F1ED
i was bored...  :roll:

.data
align 8
_6x4 label dword
db 0a1h,0a2h,0a3h,0a4h
db 0b1h,0b2h,0b3h,0b4h
db 0c1h,0c2h,0c3h,0c4h
db 0d1h,0d2h,0d3h,0d4h
db 0e1h,0e2h,0e3h,0e4h
db 0f1h,0f2h,0f3h,0f4h
_4x6 label qword
db 8 dup(0)
db 8 dup(0)
db 8 dup(0)
db 8 dup(0)
.code
Transponse:
IFNDEF .MMX
pushad
xor edx,edx
xor ebp,ebp
mov esi,offset _6x4
mov edi,offset _4x6
@@: mov al,
mov bl,
mov cl,
mov ah,
mov bh,
mov ch,
mov ,ax
mov ,bx
mov ,cx
add edx,8
inc ebp
cmp edx,8*4
jb @B
popad
ELSE
mov eax,offset _6x4
mov edx,offset _4x6
MOVD mm0,
PUNPCKLBW mm0,
MOVD mm7,
PUNPCKLBW mm7,
MOVQ mm1,mm0
PUNPCKLWD mm0,mm7
PUNPCKHWD mm1,mm7
MOVD mm6,
PUNPCKLBW mm6,
PUNPCKLWD mm2,mm6
PUNPCKHWD mm3,mm6
MOVQ mm4,mm2
PUNPCKLDQ mm4,mm0
PUNPCKHDQ mm2,mm0
MOVQ mm5,mm3
PUNPCKLDQ mm5,mm1
PUNPCKHDQ mm3,mm1
PSRLQ mm4,16
PSRLQ mm2,16
PSRLQ mm5,16
PSRLQ mm3,16
MOVQ ,mm4
MOVQ ,mm2
MOVQ ,mm5
MOVQ ,mm3
ENDIF
retn


EDIT: didn't read thoroughly... :(
Posted on 2005-08-27 23:08:51 by drizz
ok.. second try... bits this time

pushad
i = 0
for reg,<eax,ebx,ecx,edx,edi,esi>
%mov &reg&,_6x4
i = i +1
endm
xor ebp,ebp
for reg,<eax,ebx,ecx,edx,edi,esi>
shl &reg&,(32-4)+1
adc ebp,ebp
endm
mov _4x6[0*4],ebp
i = 1
rept 3
xor ebp,ebp
for reg,<eax,ebx,ecx,edx,edi,esi>
add &reg&,&reg&
adc ebp,ebp
;; shld ebp,&reg&,1
endm
mov _4x6,ebp
i = i + 1
endm
popad

most of the code should pair
Posted on 2005-08-28 00:31:24 by drizz
Oh it's 6x4 BITS, look up table is fastest.

As for "precious memory" pff an exe with 10ish KB of size from a look up table is nothing.
Posted on 2005-08-28 01:09:06 by r22

As for "precious memory" pff an exe with 10ish KB of size from a look up table is nothing.

It's meant to be used on an embedded device (like mobile phone). So for this component it's hardly acceptable to use more than a couple kBs.
Posted on 2005-08-28 07:30:04 by C0D1F1ED
even if it's useless, i enjoy these kind of tasks,  hope no-one minds my posts

__asm {
mov eax,[_3x4+0*4]
xor edx,edx
shl eax,(32-4)+1 
mov ebx,[_3x4+1*4]
adc edx,edx
mov ecx,[_3x4+2*4]
shl ebx,(32-4)+1
mov edi,0
adc edx,edx
mov esi,0
shl ecx,(32-4)+1
mov ebp,0
adc edx,edx
add eax,eax
adc edi,edi
add ebx,ebx
adc edi,edi
add ecx,ecx
adc edi,edi
add eax,eax
adc esi,esi
add ebx,ebx
adc esi,esi
add ecx,ecx
adc esi,esi
add eax,eax
adc ebp,ebp
add ebx,ebx
adc ebp,ebp
add ecx,ecx
adc ebp,ebp
mov [_4x3+0*4],edx
mov [_4x3+1*4],edi
mov [_4x3+2*4],esi
mov [_4x3+3*4],ebp
}

35 lines / 2
Posted on 2005-08-28 07:53:16 by drizz
an embedded device/mobile phone with x86 and sse o_O !!!

?

Posted on 2005-08-28 22:20:39 by HeLLoWorld