;-----------------------------------------------------------------------
; popcount - count the set bits of a 32-bit value (parallel bit sum).
; In:    eax = value to count bits.
; Out:   eax = count of bits (0..32).
; Clobb: ebx, flags.
;
; Every step treats eax as a row of fields: AND keeps the even-numbered
; fields, XOR against the copy recovers the odd-numbered ones, which are
; shifted down and added to their even neighbours. The field width
; doubles each step: 1 -> 2 -> 4 -> 8 -> 16 -> 32 bits.
; The reduction must start at 1-bit fields (mask 55555555h); starting at
; 33333333h would add 2-bit *values* instead of counting bits.
;-----------------------------------------------------------------------
popcount:
        mov     ebx,eax
        and     eax,55555555h           ; even bits
        xor     ebx,eax                 ; odd bits
        shr     ebx,1
        add     eax,ebx                 ; 16 fields of 2 bits, each 0..2

        mov     ebx,eax
        and     eax,33333333h
        xor     ebx,eax
        shr     ebx,2
        add     eax,ebx                 ; 8 fields of 4 bits, each 0..4

        mov     ebx,eax
        and     eax,0f0f0f0fh
        xor     ebx,eax
        shr     ebx,4
        add     eax,ebx                 ; 4 fields of 8 bits, each 0..8

        mov     ebx,eax
        and     eax,00ff00ffh           ; bytes 0 and 2
        xor     ebx,eax                 ; bytes 1 and 3
        shr     ebx,8
        add     eax,ebx                 ; 2 fields of 16 bits, each 0..16

        mov     ebx,eax
        and     eax,0000ffffh           ; low half
        xor     ebx,eax                 ; high half
        shr     ebx,16
        add     eax,ebx                 ; eax = total, 0..32
        ret


As you can see, the fast asm method is not so fast,
but it is faster than a loop.
Posted on 2010-07-09 06:43:29 by edfed

As you can see, the fast asm method is not so fast,
but it is faster than a loop.


Yes, you are right, but where did you find this code?
The links I proposed include SSE code, such as:


; Per-byte bit count of a 16-byte vector using a 4-bit lookup table.
; Operands are in AT&T order (source first, destination last).
; xmm0 - input (16 bytes)
; xmm7 - POPCOUNT_4bit  -- lookup table
;        (presumably byte i = number of set bits in i, for i = 0..15 -- confirm)
; xmm6 - MASK_bits03 = packed_byte(0x0f) -- mask 4 lower bits
; Result is left in xmm2; xmm0, xmm1 and xmm3 are overwritten.

movdqa  %%xmm0, %%xmm1
psrlw      $4, %%xmm1  ; word-wise shift: high nibbles move into the low-nibble positions

pand    %%xmm6, %%xmm0  ; xmm0 - lower nibbles
pand    %%xmm6, %%xmm1  ; xmm1 - higher nibbles (mask also drops bits shifted across bytes)

movdqa  %%xmm7, %%xmm2  ; pshufb overwrites its destination (the LUT copy),
movdqa  %%xmm7, %%xmm3  ; so work on copies and keep xmm7 intact for further use

pshufb  %%xmm0, %%xmm2  ; xmm2 = vector of popcount for lower nibbles
pshufb  %%xmm1, %%xmm3  ; xmm3 = vector of popcount for higher nibbles

paddb  %%xmm3, %%xmm2  ; xmm2 += xmm3 -- vector of popcount for bytes

Posted on 2010-07-10 17:06:54 by MCoder
I now have a 64-bit popcount, but it needs more optimization (most notably, reducing the number of registers used).

Code for SSE2:

; Bit masks for the parallel-sum steps, replicated across 16 bytes.
; They are used as movdqa / pand memory operands, which require 16-byte
; alignment, hence the ALIGN.
ALIGN 16
Mask0x55 DD 055555555H,055555555H,055555555H,055555555H
Mask0x33 DD 033333333H,033333333H,033333333H,033333333H
Mask0x0F DD 00f0f0f0fH,00f0f0f0fH,00f0f0f0fH,00f0f0f0fH

;-----------------------------------------------------------------------
; POPCNT_SSE2 - count the set bits of a 64-bit value using SSE2.
; In:    64-bit value at [esp+4].
;        NOTE(review): the memory operands were missing from the posted
;        listing. The three mask operands are restored from the dataflow;
;        the input location [esp+4] ASSUMES the value is passed on the
;        stack of a 32-bit caller -- confirm against the real caller.
; Out:   eax = count of bits (0..64).
; Clobb: xmm0-xmm6.
;-----------------------------------------------------------------------
POPCNT_SSE2:
        movdqa    xmm0, XMMWORD PTR [Mask0x55]
        movdqa    xmm2, XMMWORD PTR [Mask0x33]
        movdqa    xmm6, xmm2
        movq      xmm1, QWORD PTR [esp+4]       ; input; movq zeroes the upper 64 bits
        movdqa    xmm3, xmm1
        psrlw     xmm1, 1
        pand      xmm1, xmm0                    ; odd bits, moved down
        pand      xmm3, xmm0                    ; even bits
        paddb     xmm3, xmm1                    ; 2-bit sums
        pand      xmm6, xmm3                    ; even 2-bit fields
        psrlw     xmm3, 2
        pand      xmm3, xmm2                    ; odd 2-bit fields, moved down
        paddb     xmm6, xmm3                    ; 4-bit sums
        movdqa    xmm4, xmm6
        pxor      xmm5, xmm5                    ; zero vector for psadbw
        psrlw     xmm4, 4
        paddb     xmm6, xmm4                    ; low nibble + high nibble of each byte
        pand      xmm6, XMMWORD PTR [Mask0x0F]  ; keep the per-byte counts
        psadbw    xmm6, xmm5                    ; sum the 8 byte counts horizontally
        movd      eax, xmm6
        ret


Code for SSSE3 (pshufb is an SSSE3 instruction, not SSE3):

; Mask0x0F selects the low nibble of every byte.
; BitCount is a 16-entry byte table: entry i = number of set bits in i.
; Both are used as movdqa memory operands, which require 16-byte
; alignment, hence the ALIGN.
ALIGN 16
Mask0x0F DD 00f0f0f0fH,00f0f0f0fH,00f0f0f0fH,00f0f0f0fH
BitCount DD 002010100H,003020201H,003020201H,004030302H

;-----------------------------------------------------------------------
; POPCNT_SSE3 - count the set bits of a 64-bit value by nibble lookup.
; Despite the label, pshufb requires SSSE3 (not just SSE3).
; In:    64-bit value at [esp+4].
;        NOTE(review): the memory operands were missing from the posted
;        listing. The table operands are restored from the dataflow;
;        the input location [esp+4] ASSUMES the value is passed on the
;        stack of a 32-bit caller -- confirm against the real caller.
; Out:   eax = count of bits (0..64).
; Clobb: xmm0-xmm5.
;-----------------------------------------------------------------------
POPCNT_SSE3:
        movdqa    xmm1, XMMWORD PTR [Mask0x0F]
        movdqa    xmm5, XMMWORD PTR [BitCount]
        movq      xmm2, QWORD PTR [esp+4]       ; input; movq zeroes the upper 64 bits
        movdqa    xmm0, xmm2
        movdqa    xmm3, xmm5                    ; second table copy: pshufb overwrites its destination
        psrlw     xmm2, 4
        pand      xmm0, xmm1                    ; low nibbles
        pand      xmm2, xmm1                    ; high nibbles
        pxor      xmm4, xmm4                    ; zero vector for psadbw
        pshufb    xmm5, xmm0                    ; bit counts of the low nibbles
        pshufb    xmm3, xmm2                    ; bit counts of the high nibbles
        paddb     xmm5, xmm3                    ; per-byte bit counts
        psadbw    xmm5, xmm4                    ; sum the byte counts horizontally
        movd      eax, xmm5
        ret

Posted on 2010-07-18 20:12:39 by MCoder

I now have a 64-bit popcount, but it needs more optimization (most notably, reducing the number of registers used).

	; Bit count of the 64-bit value `val`, after the nibble-subtraction
	; method in Hacker's Delight. Result in eax; xmm0 and xmm1 are used.
.data
align 16
msk77 dd 4 dup (77777777h)
msk0F dd 4 dup (0F0F0F0Fh)
.code
        movq    xmm0,val
        movdqa  xmm1,xmm0

        ; Per nibble n: n - (n>>1) - (n>>2) - (n>>3) = number of set bits.
        ; The mask clears the bit that each shift drags in from the
        ; neighbouring nibble.
        psrld   xmm1,1
        pand    xmm1,oword ptr msk77
        psubd   xmm0,xmm1               ; subtract n>>1

        psrld   xmm1,1
        pand    xmm1,oword ptr msk77
        psubd   xmm0,xmm1               ; subtract n>>2

        psrld   xmm1,1
        pand    xmm1,oword ptr msk77
        psubd   xmm0,xmm1               ; subtract n>>3

        ; Fold the two nibble counts of every byte together.
        movdqa  xmm1,xmm0
        psrld   xmm1,4
        paddd   xmm0,xmm1
        pand    xmm0,oword ptr msk0F

        ; Horizontal sum of the byte counts against a zero vector.
        pxor    xmm1,xmm1
        psadbw  xmm0,xmm1
        movd    eax,xmm0
Posted on 2010-07-26 16:41:11 by drizz
Nice, and I confirm that this works !
Posted on 2010-07-26 18:25:09 by MCoder