In the same way as with the adler32 thread, here's a CRC32 algo in asm and a test program. Just put a 2MB+ file as file.dat in the dirctory, run file2obj, run make and run test.exe.
Here's what I got so far:


;Calculate table:

Thomas1_ct proc uses ebx esi edi
mov edi, offset CRCtable - 4
xor ecx, ecx

_ml:
mov eax, ecx
mov ebx, 0EDB88320h
add edi, 4
mov esi, 8
@@:
shr eax, 1
sbb edx, edx
and edx, ebx
xor eax, edx
dec esi
jnz @B
inc ecx
mov [edi], eax
cmp ecx, 256
jb _ml
ret
Thomas1_ct endp

; Calculate CRC
; (Uses Thomas1_ct for table generation)
Thomas2 proc uses ebx esi edi buf:DWORD, len:DWORD
mov esi, buf
mov edi, offset CRCtable
mov edx, len
shr edx, 1
or ecx, -1
xor eax, eax
@@:
mov al, [esi]
xor al, cl
shr ecx, 8
mov ebx, [edi+4*eax]
xor ecx, ebx

mov al, [esi+1]
xor al, cl
shr ecx, 8
mov ebx, [edi+4*eax]
add esi,2
xor ecx, ebx

dec edx
jnz @B

test len, 1
jz @F
mov al, [esi]
xor al, cl
inc esi
shr ecx, 8
mov ebx, [edi+4*eax]
xor ecx, ebx
@@:

mov eax, ecx
not eax
ret
Thomas2 endp


I found another one by tE, one from Nico and one from Huh but I couldn't get the last one working..

Results:


AMD Thunderbird 1.4GHz
------------------------------------------------------------------
eax = tE : [B2033B77], 1672 ms [10x2MB], 11.96 MB/s
eax = 592 ticks(!) for table.
eax = Thomas1 : [B2033B77], 131 ms [10x2MB], 152.67 MB/s
eax = 112855 ticks(!) for table.
eax = Nico : [B2033B77], 180 ms [10x2MB], 111.11 MB/s
eax = 222965 ticks(!) for table.
eax = Thomas2 : [B2033B77], 130 ms [10x2MB], 153.84 MB/s
eax = 81628 ticks(!) for table.

Pentium III - 550 MHz
------------------------------------------------------------------
eax = tE : [B2033B77], 3966 ms [10x2MB], 5.04 MB/s
eax = 314 ticks(!) for table.
eax = Thomas1 : [B2033B77], 341 ms [10x2MB], 58.65 MB/s
eax = 158263 ticks(!) for table.
eax = Nico : [B2033B77], 570 ms [10x2MB], 35.08 MB/s
eax = 202842 ticks(!) for table.
eax = Thomas2 : [B2033B77], 591 ms [10x2MB], 33.84 MB/s
eax = 463400 ticks(!) for table.


Thomas
Posted on 2002-04-05 04:23:21 by Thomas
Thomas, that is great as I wanted to work on a crc-32 algo implementation this week-end... so I guess I will try to participate to the "optimizing process" ;)
Posted on 2002-04-05 08:36:25 by JCP
Some crc-32 assembly routines can be found on this site too : http://www.geocities.com/SiliconValley/Heights/7394/
Posted on 2002-04-05 09:05:27 by JCP
I added the one from the site Readiosys mentioned (by Adam Stanislav):



AMD Thunderbird 1.4Ghz:

eax = tE : [B2033B77], 1592 ms [10x2MB], 12.56 MB/s
eax = 804 ticks(!) for table.
eax = Thomas1 : [B2033B77], 130 ms [10x2MB], 153.84 MB/s
eax = 125213 ticks(!) for table.
eax = Nico : [B2033B77], 180 ms [10x2MB], 111.11 MB/s
eax = 221046 ticks(!) for table.
eax = Thomas2 : [B2033B77], 131 ms [10x2MB], 152.67 MB/s
eax = 82106 ticks(!) for table.
eax = Adam Stanislav : [B2033B77], 190 ms [10x2MB], 105.26 MB/s
eax = 478 ticks(!) for table.

(Can't test on the other machine right now)


Updated files are in the attachment.

Thomas
Posted on 2002-04-05 09:38:17 by Thomas
Sorry Thomas, I would have added it by myself if I wasn't at work and had my assembler/time to do it right now...

I found another one, from Jay (NASM), if you want to add it :



bits 32

global init_crc
global update_crc

segment code public align=4 use32 class=code

; Initializes the crc_table, call this first
init_crc:
xor eax, eax
xor edx, edx

.loop:
mov ecx, eax
mov edx, 8
.inloop:
test ecx, 1
jz .clear
shr ecx, 1
xor ecx, 0xedb88320
jmp short .set
.clear:
shr ecx, 1
.set:
dec edx
jnz .inloop

mov [crc_table + eax*4], ecx
inc eax
cmp eax, 256
jl .loop

retn

; update_crc: Updates a running crc with bytes buf[0..len-1] and returns the crc
; Takes 3 parameters, (DWORD crc,LPBYTE buf,DWORD len)
; crc: The current crc value
; buf: Pointer to the buffer to update crc
; len: length of the buffer
align 4
update_crc:
push ebp
mov ebp, esp

; PARAMS
%define crc ebp + 8
%define buf ebp + 12
%define len ebp + 16

push edi

mov eax, [crc]
xor eax, 0xFFFFFFFF

mov edi, [buf]
xor ecx, ecx

cmp [len], dword 0
je .done
.loop:
mov edx, eax
xor edx, [edi + ecx]
and edx, 0xFF
shr eax, 8
xor eax, [crc_table + edx*4]

inc ecx
cmp ecx, [len]
jl .loop
.done:
xor eax, 0xFFFFFFFF

pop edi

leave
retn 12

segment bss public align=4 use32 class=bss

crc_table resd 256
Posted on 2002-04-05 10:00:37 by JCP


------------------------------------------------------------------
eax = tE : [B2033B77], 1633 ms [10x2MB], 12.24 MB/s
eax = 379 ticks(!) for table.
eax = Thomas1 : [B2033B77], 140 ms [10x2MB], 142.85 MB/s
eax = 115930 ticks(!) for table.
eax = Nico : [B2033B77], 190 ms [10x2MB], 105.26 MB/s
eax = 223476 ticks(!) for table.
eax = Thomas2 : [B2033B77], 140 ms [10x2MB], 142.85 MB/s
eax = 81703 ticks(!) for table.
eax = Adam Stanislav : [B2033B77], 191 ms [10x2MB], 104.71 MB/s
eax = 409 ticks(!) for table.
eax = Jay : [B2033B77], 180 ms [10x2MB], 111.11 MB/s
eax = 467825 ticks(!) for table.


Thomas
Posted on 2002-04-05 10:54:23 by Thomas


proc Nexo_ct uses ebx esi edi
lea edi,[CRCtable]
xor ebx,ebx
mov edx,0EDB88320h
@@lp: mov eax,ebx
mov ecx,8
@@: xor esi,esi
shr eax,1
cmovc esi,edx
xor eax,esi
dec ecx
jne @B
mov [edi],eax
add edi,4
inc bl
jne @@lp
ret
endp

macro CRC
movzx edx,al
shr eax,8
xor eax,[edi+4*edx]
endm

proc Nexo uses esi edi, lpData:DWORD,DataSize:DWORD
mov ecx,[DataSize]
or eax,-1
mov esi,[lpData]
mov edi,offset CRCtable
sub ecx,4
jl @@b
@@: xor eax,[esi]
add esi,4
CRC
CRC
CRC
CRC
sub ecx,4
jge @B
@@b: mov edx,[tblmsk+4*(ecx+4)]
and edx,[esi]
xor eax,edx
lea esi,[esi+4+ecx]
jmp [tblunr+4*(ecx+4)]
align 8
tblmsk dd 0,0FFh,0FFFFh,0FFFFFFh
u_delta = unr_d-unr
tblunr dd unr+3*u_delta,unr+2*u_delta,unr+1*u_delta,unr

unr: CRC
unr_d: CRC
CRC
not eax
ret
endp


I have magnified a block size up to 10 Mbytes because very small numbers of a time were gained.

Pentium II 400MHz


Thomas1 : [87DD3D07], 2443 ms [10x10MB], 40.93 MB/s 138876 ticks(!) for table.
Nico : [87DD3D07], 3966 ms [10x10MB], 25.21 MB/s 232933 ticks(!) for table.
Thomas2 : [87DD3D07], 3956 ms [10x10MB], 25.27 MB/s 138909 ticks(!) for table.
Nexo : [87DD3D07], 2263 ms [10x10MB], 44.18 MB/s 133792 ticks(!) for table.


Athlon XP 1400MHz


Nico : [87DD3D07], 704 ms [10x10MB], 142.04 MB/s 247010 ticks(!) for table.
Thomas1 : [87DD3D07], 406 ms [10x10MB], 246.30 MB/s 87947 ticks(!) for table.
Thomas2 : [87DD3D07], 453 ms [10x10MB], 220.75 MB/s 87758 ticks(!) for table.
Adam Stanislav : [87DD3D07], 719 ms [10x10MB], 139.08 MB/s 327 ticks(!) for table.
Jay : [87DD3D07], 562 ms [10x10MB], 177.93 MB/s 254086 ticks(!) for table.
Nexo : [87DD3D07], 406 ms [10x10MB], 246.30 MB/s 87559 ticks(!) for table.
Posted on 2002-04-05 11:43:41 by Nexo
Hi !

I tried to optimize a bit the Thomas' table generation proc...

Here's my new version :



ThomasReadiosys_ct proc uses ebx esi edi
mov edi, offset CRCtable - 4
xor ecx, ecx

_ml:
mov eax, ecx
add edi, 4
mov esi, 8
@@:
shr eax, 1
sbb edx, edx
and edx, 0EDB88320h
xor eax, edx
dec esi
jnz @B
inc ecx
mov [edi], eax
cmp ecx, 256
jb _ml
ret
ThomasReadiosys_ct endp


As you can see, the only change is that edx is "anded" with an immediate value instead of a register... so the line "mov ebx, 0EDB88320h" from the original Thomas' version disappears...
It seems to be a little faster because of that...



Thomas1_ct (test.asm, 18)
PROFILECYCLES = 8731 (test.asm, 20)
ThomasReadiosys_ct (test.asm, 22)
PROFILECYCLES = 7709 (test.asm, 24)
Make finished.


More than 1000 cycles less.

I also tested a stosd based version to fill the table but it is slower than the Thomas' original version.
I will try to have a closer look after some sleep... ;)

PS : The results are from my 1.2 GHz Athlon TB and from the excellent Maverick's PROFILEr... I will try to test it tomorrow on my P4 1.6 GHz Notebook...

*EDIT* : It seems to be slower on the P4 than the original Thomas version (~1000 cycles) but faster on my 450 MHz Celeron (~250 cycles).

I attach a test package, as the Maverick's tool seems to be more adapted to benchmark the table generation routine.

Thanks for your attention and remarks.
Posted on 2002-04-05 17:13:19 by JCP
Nice code Nexo, but:

- u can remove add esi,4 from main loop and work with:
xor eax, ; ecx-> negative value esi->end of buffer

- remove lea esi,
Posted on 2002-04-05 17:20:50 by buliaNaza
buliaNaza, I dont remove add esi,4, becouse it will be wrong CRC32 & burst mode off:
Posted on 2002-04-06 00:51:22 by Nexo
Nice little boost in speed with prefetch.
Don't waste your Athlon - use prefetch. ;)
Thanks for the algo, Nexo.
CRC MACRO

movzx edx,al
shr eax,8
xor eax,[edi+4*edx]
ENDM

CACHELINE EQU 64
DO_LINES EQU 2

NexoBit PROC uses esi edi ebx, lpData:DWORD,DataSize:DWORD
mov esi,[lpData]
mov edi,offset CRCtable
mov ebx,[DataSize]
and [DataSize],3
and ebx,-4
or eax,-1
sub ebx,CACHELINE * DO_LINES
jl j9

j1:
add esi,CACHELINE * DO_LINES ; bytes ahead
mov ecx,0-(CACHELINE*DO_LINES)/4 ; DWORDS

i=0
WHILE i NE DO_LINES
i=i+1
prefetchnta [esi + CACHELINE * (i-DO_LINES+2)]
ENDM

j2: xor eax,[esi+ecx*4]
CRC
CRC
CRC
CRC
inc ecx
jne j2

sub ebx,CACHELINE * DO_LINES
jge j1
j9:
add ebx,CACHELINE * DO_LINES
add esi,ebx
sub ecx,ebx
and ebx,0
sar ecx,2
jne j2

mov ecx,[DataSize]
mov edx,[tblmsk + 4*ecx]
and edx,[esi]
xor eax,edx
lea esi,[esi+4+ecx]
jmp [tblunr + 4*ecx]
align 8 ; should be align 64
unr::
CRC
unr_d::
CRC
CRC
not eax
ret
.data
align 8 ; should be align 32
tblmsk dd 0,0FFh,0FFFFh,0FFFFFFh
u_delta = unr_d-unr
tblunr dd unr+3*u_delta,unr+2*u_delta,unr+1*u_delta,unr
.code
NexoBit ENDP
Posted on 2002-04-06 03:13:57 by bitRAKE
BitRAKE, ?refetch here it is useless owing to specificity of looping. You can be convinced of it under the test ;)
How many you have received a scoring from prefetch?
Posted on 2002-04-06 04:53:48 by Nexo
Hi Nexo,
I say over you can..


mov ebx,[DataSize]
or eax,-1
mov esi,[lpData]
mov edi,offset CRCtable
cmp ebx,4
jl @@b
mov ecx,ebx
and ecx,-4
add esi,ecx
sub ebx,ecx
neg ecx
@@Loop:
xor eax,[esi+ecx]
CRC
CRC
CRC
CRC
add ecx,4
jl @@Loop
@@b:
;test ebx, ebx
;jz exit
mov edx,[tblmsk+4*ebx]
and edx,[esi]
xor eax,edx
jmp [tblunr+4*ebx]
align 8
tblmsk dd 0, 0FFh, 0FFFFh, 0FFFFFFh
u_delta = unr_d-unr
tblunr dd unr+3*u_delta, unr+2*u_delta, unr+1*u_delta, unr

unr: CRC
unr_d: CRC
CRC
;exit:
not eax
ret
Posted on 2002-04-06 10:10:30 by buliaNaza
Pentium 4 1400MHz
[size=9]

Nico : [F3032BB4], 1021 ms [10x10MB], 97.94 MB/s 493776 ticks(!) for table.
Thomas1 : [F3032BB4], 501 ms [10x10MB], 199.60 MB/s 239124 ticks(!) for table.
Thomas2 : [F3032BB4], 470 ms [10x10MB], 212.76 MB/s 238784 ticks(!) for table.
Adam Stanislav : [F3032BB4], 761 ms [10x10MB], 131.40 MB/s 268 ticks(!) for table.
Jay : [F3032BB4], 711 ms [10x10MB], 140.64 MB/s 472528 ticks(!) for table.
Nexo : [F3032BB4], 501 ms [10x10MB], 199.60 MB/s 182564 ticks(!) for table.
NexoBit : [F3032BB4], 511 ms [10x10MB], 195.69 MB/s 183100 ticks(!) for table.
[/SIZE]
Posted on 2002-04-06 11:11:31 by Nexo
Maybe, there is something wrong with my machine?
eax = Nexo           : [88ABBE8A],  842 ms [64x2MB],  152.01 MB/s

eax = NexoBit : [88ABBE8A], 571 ms [64x2MB], 224.16 MB/s

eax = Nexo : [88ABBE8A], 3294 ms [256x2MB], 155.43 MB/s
eax = NexoBit : [88ABBE8A], 2203 ms [256x2MB], 232.41 MB/s
eax = NexoBit : [88ABBE8A], 2203 ms [256x2MB], 232.41 MB/s
eax = NexoBit : [88ABBE8A], 2203 ms [256x2MB], 232.41 MB/s
Athlon TB 1333, P4 sucks :)
Posted on 2002-04-06 11:13:41 by bitRAKE
Hi, buliaNaza. The code became longer, speed the same.
Here even prefetch does not help.
Complete stupor of the processor on dependences.
Posted on 2002-04-06 11:58:27 by Nexo
BitRAKE, I have put same 64x2Mb parameters.


Thomas1 : [7E2CDFAB], 531 ms [64x2MB], 241.05 MB/s
Thomas2 : [7E2CDFAB], 594 ms [64x2MB], 215.48 MB/s
Adam Stanislav : [7E2CDFAB], 922 ms [64x2MB], 138.82 MB/s
Jay : [7E2CDFAB], 734 ms [64x2MB], 174.38 MB/s
Nexo : [7E2CDFAB], 531 ms [64x2MB], 241.05 MB/s
NexoBit : [7E2CDFAB], 532 ms [64x2MB], 240.60 MB/s




Thomas1 : [7E2CDFAB], 2172 ms [256x2MB], 235.72 MB/s
Thomas2 : [7E2CDFAB], 2328 ms [256x2MB], 219.93 MB/s
Adam Stanislav : [7E2CDFAB], 3735 ms [256x2MB], 137.08 MB/s
Jay : [7E2CDFAB], 2985 ms [256x2MB], 171.52 MB/s
Nexo : [7E2CDFAB], 2203 ms [256x2MB], 232.41 MB/s
NexoBit : [7E2CDFAB], 2203 ms [256x2MB], 232.41 MB/s
Posted on 2002-04-06 12:01:48 by Nexo
I understand what your say Nexo, but as the processor speed increases, this does not matter - the memory latency will put an upper limit on the algo. I agree that prefetch is not a big plus here, but it helps. Yes, the DDR memory matches the processor speed, but try it on a 2.2Ghz XP with DDR - the difference will be more appearant. My only tests have been on an Athlon, but I feel the same argument would hold for faster P$'s.
Posted on 2002-04-06 12:08:41 by bitRAKE
Why is CRC32 to wonderful anyhow? I've seen other checksum algos that work just fine and are only a few lines of code. CRC seems like overkill to me.

The point is to generate a unique signature from a block of data, right? I can think of about 20 different ways to do that that isn't so cumbersome.

Maybe I have the wrong ideas about CRC. Somebody enlighten me please :D
Posted on 2002-04-06 12:09:25 by iblis
I understand what your say Nexo, but as the processor speed increases, this does not matter - the memory latency will put an upper limit on the algo. I agree that prefetch is not a big plus here, but it helps.

Correctly you think :)
Yes, the DDR memory matches the processor speed, but try it on a 2.2Ghz XP with DDR - the difference will be more appearant.

While there is no possibility to test XP 2200+ (1800MHz). On the other hand, if the code of cycle worked more slowly the difference should be more? Try it:


CRC macro
push ebp
REPT 11
imul ebp,9999999
ENDM
pop ebp
movzx edx,al
shr eax,8
xor eax,[CRCtable+4*edx]
endm

Nexo : [7E2CDFAB], 16640 ms [256x2MB], 30.76 MB/s
NexoBit : [7E2CDFAB], 17329 ms [256x2MB], 29.54 MB/s

I somewhere wander? ;)


My only tests have been on an Athlon, but I feel the same argument would hold for faster P$'s.

The Pentium 4 has hardware prefetch. The tested Pentium 4 had DDR333 ;)
I tested on XP 1533MHz, but the trend has remained the same :(
Where here 10 GHz processers? :)
Posted on 2002-04-06 15:06:29 by Nexo