In the same way as with the adler32 thread, here's a CRC32 algo in asm and a test program. Just put a 2MB+ file as file.dat in the dirctory, run file2obj, run make and run test.exe.
Here's what I got so far:
I found another one by tE, one from Nico and one from Huh but I couldn't get the last one working..
Results:
Thomas
Here's what I got so far:
;Calculate table:
Thomas1_ct proc uses ebx esi edi
mov edi, offset CRCtable - 4
xor ecx, ecx
_ml:
mov eax, ecx
mov ebx, 0EDB88320h
add edi, 4
mov esi, 8
@@:
shr eax, 1
sbb edx, edx
and edx, ebx
xor eax, edx
dec esi
jnz @B
inc ecx
mov [edi], eax
cmp ecx, 256
jb _ml
ret
Thomas1_ct endp
; Calculate CRC
; (Uses Thomas1_ct for table generation)
Thomas2 proc uses ebx esi edi buf:DWORD, len:DWORD
mov esi, buf
mov edi, offset CRCtable
mov edx, len
shr edx, 1
or ecx, -1
xor eax, eax
@@:
mov al, [esi]
xor al, cl
shr ecx, 8
mov ebx, [edi+4*eax]
xor ecx, ebx
mov al, [esi+1]
xor al, cl
shr ecx, 8
mov ebx, [edi+4*eax]
add esi,2
xor ecx, ebx
dec edx
jnz @B
test len, 1
jz @F
mov al, [esi]
xor al, cl
inc esi
shr ecx, 8
mov ebx, [edi+4*eax]
xor ecx, ebx
@@:
mov eax, ecx
not eax
ret
Thomas2 endp
I found another one by tE, one from Nico and one from Huh but I couldn't get the last one working..
Results:
AMD Thunderbird 1.4GHz
------------------------------------------------------------------
eax = tE : [B2033B77], 1672 ms [10x2MB], 11.96 MB/s
eax = 592 ticks(!) for table.
eax = Thomas1 : [B2033B77], 131 ms [10x2MB], 152.67 MB/s
eax = 112855 ticks(!) for table.
eax = Nico : [B2033B77], 180 ms [10x2MB], 111.11 MB/s
eax = 222965 ticks(!) for table.
eax = Thomas2 : [B2033B77], 130 ms [10x2MB], 153.84 MB/s
eax = 81628 ticks(!) for table.
Pentium III - 550 MHz
------------------------------------------------------------------
eax = tE : [B2033B77], 3966 ms [10x2MB], 5.04 MB/s
eax = 314 ticks(!) for table.
eax = Thomas1 : [B2033B77], 341 ms [10x2MB], 58.65 MB/s
eax = 158263 ticks(!) for table.
eax = Nico : [B2033B77], 570 ms [10x2MB], 35.08 MB/s
eax = 202842 ticks(!) for table.
eax = Thomas2 : [B2033B77], 591 ms [10x2MB], 33.84 MB/s
eax = 463400 ticks(!) for table.
Thomas
Thomas, that is great as I wanted to work on a crc-32 algo implementation this week-end... so I guess I will try to participate to the "optimizing process" ;)
Some crc-32 assembly routines can be found on this site too : http://www.geocities.com/SiliconValley/Heights/7394/
I added the one from the site Readiosys mentioned (by Adam Stanislav):
Updated files are in the attachment.
Thomas
AMD Thunderbird 1.4Ghz:
eax = tE : [B2033B77], 1592 ms [10x2MB], 12.56 MB/s
eax = 804 ticks(!) for table.
eax = Thomas1 : [B2033B77], 130 ms [10x2MB], 153.84 MB/s
eax = 125213 ticks(!) for table.
eax = Nico : [B2033B77], 180 ms [10x2MB], 111.11 MB/s
eax = 221046 ticks(!) for table.
eax = Thomas2 : [B2033B77], 131 ms [10x2MB], 152.67 MB/s
eax = 82106 ticks(!) for table.
eax = Adam Stanislav : [B2033B77], 190 ms [10x2MB], 105.26 MB/s
eax = 478 ticks(!) for table.
(Can't test on the other machine right now)
Updated files are in the attachment.
Thomas
Sorry Thomas, I would have added it by myself if I wasn't at work and had my assembler/time to do it right now...
I found another one, from Jay (NASM), if you want to add it :
I found another one, from Jay (NASM), if you want to add it :
bits 32
global init_crc
global update_crc
segment code public align=4 use32 class=code
; Initializes the crc_table, call this first
init_crc:
xor eax, eax
xor edx, edx
.loop:
mov ecx, eax
mov edx, 8
.inloop:
test ecx, 1
jz .clear
shr ecx, 1
xor ecx, 0xedb88320
jmp short .set
.clear:
shr ecx, 1
.set:
dec edx
jnz .inloop
mov [crc_table + eax*4], ecx
inc eax
cmp eax, 256
jl .loop
retn
; update_crc: Updates a running crc with bytes buf[0..len-1] and returns the crc
; Takes 3 parameters, (DWORD crc,LPBYTE buf,DWORD len)
; crc: The current crc value
; buf: Pointer to the buffer to update crc
; len: length of the buffer
align 4
update_crc:
push ebp
mov ebp, esp
; PARAMS
%define crc ebp + 8
%define buf ebp + 12
%define len ebp + 16
push edi
mov eax, [crc]
xor eax, 0xFFFFFFFF
mov edi, [buf]
xor ecx, ecx
cmp [len], dword 0
je .done
.loop:
mov edx, eax
xor edx, [edi + ecx]
and edx, 0xFF
shr eax, 8
xor eax, [crc_table + edx*4]
inc ecx
cmp ecx, [len]
jl .loop
.done:
xor eax, 0xFFFFFFFF
pop edi
leave
retn 12
segment bss public align=4 use32 class=bss
crc_table resd 256
------------------------------------------------------------------
eax = tE : [B2033B77], 1633 ms [10x2MB], 12.24 MB/s
eax = 379 ticks(!) for table.
eax = Thomas1 : [B2033B77], 140 ms [10x2MB], 142.85 MB/s
eax = 115930 ticks(!) for table.
eax = Nico : [B2033B77], 190 ms [10x2MB], 105.26 MB/s
eax = 223476 ticks(!) for table.
eax = Thomas2 : [B2033B77], 140 ms [10x2MB], 142.85 MB/s
eax = 81703 ticks(!) for table.
eax = Adam Stanislav : [B2033B77], 191 ms [10x2MB], 104.71 MB/s
eax = 409 ticks(!) for table.
eax = Jay : [B2033B77], 180 ms [10x2MB], 111.11 MB/s
eax = 467825 ticks(!) for table.
Thomas
proc Nexo_ct uses ebx esi edi
lea edi,[CRCtable]
xor ebx,ebx
mov edx,0EDB88320h
@@lp: mov eax,ebx
mov ecx,8
@@: xor esi,esi
shr eax,1
cmovc esi,edx
xor eax,esi
dec ecx
jne @B
mov [edi],eax
add edi,4
inc bl
jne @@lp
ret
endp
macro CRC
movzx edx,al
shr eax,8
xor eax,[edi+4*edx]
endm
proc Nexo uses esi edi, lpData:DWORD,DataSize:DWORD
mov ecx,[DataSize]
or eax,-1
mov esi,[lpData]
mov edi,offset CRCtable
sub ecx,4
jl @@b
@@: xor eax,[esi]
add esi,4
CRC
CRC
CRC
CRC
sub ecx,4
jge @B
@@b: mov edx,[tblmsk+4*(ecx+4)]
and edx,[esi]
xor eax,edx
lea esi,[esi+4+ecx]
jmp [tblunr+4*(ecx+4)]
align 8
tblmsk dd 0,0FFh,0FFFFh,0FFFFFFh
u_delta = unr_d-unr
tblunr dd unr+3*u_delta,unr+2*u_delta,unr+1*u_delta,unr
unr: CRC
unr_d: CRC
CRC
not eax
ret
endp
I have magnified a block size up to 10 Mbytes because very small numbers of a time were gained.
Pentium II 400MHz
Thomas1 : [87DD3D07], 2443 ms [10x10MB], 40.93 MB/s 138876 ticks(!) for table.
Nico : [87DD3D07], 3966 ms [10x10MB], 25.21 MB/s 232933 ticks(!) for table.
Thomas2 : [87DD3D07], 3956 ms [10x10MB], 25.27 MB/s 138909 ticks(!) for table.
Nexo : [87DD3D07], 2263 ms [10x10MB], 44.18 MB/s 133792 ticks(!) for table.
Athlon XP 1400MHz
Nico : [87DD3D07], 704 ms [10x10MB], 142.04 MB/s 247010 ticks(!) for table.
Thomas1 : [87DD3D07], 406 ms [10x10MB], 246.30 MB/s 87947 ticks(!) for table.
Thomas2 : [87DD3D07], 453 ms [10x10MB], 220.75 MB/s 87758 ticks(!) for table.
Adam Stanislav : [87DD3D07], 719 ms [10x10MB], 139.08 MB/s 327 ticks(!) for table.
Jay : [87DD3D07], 562 ms [10x10MB], 177.93 MB/s 254086 ticks(!) for table.
Nexo : [87DD3D07], 406 ms [10x10MB], 246.30 MB/s 87559 ticks(!) for table.
Hi !
I tried to optimize a bit the Thomas' table generation proc...
Here's my new version :
As you can see, the only change is that edx is "anded" with an immediate value instead of a register... so the line "mov ebx, 0EDB88320h" from the original Thomas' version disappears...
It seems to be a little faster because of that...
More than 1000 cycles less.
I also tested a stosd based version to fill the table but it is slower than the Thomas' original version.
I will try to have a closer look after some sleep... ;)
PS : The results are from my 1.2 GHz Athlon TB and from the excellent Maverick's PROFILEr... I will try to test it tomorrow on my P4 1.6 GHz Notebook...
*EDIT* : It seems to be slower on the P4 than the original Thomas version (~1000 cycles) but faster on my 450 MHz Celeron (~250 cycles).
I attach a test package, as the Maverick's tool seems to be more adapted to benchmark the table generation routine.
Thanks for your attention and remarks.
I tried to optimize a bit the Thomas' table generation proc...
Here's my new version :
ThomasReadiosys_ct proc uses ebx esi edi
mov edi, offset CRCtable - 4
xor ecx, ecx
_ml:
mov eax, ecx
add edi, 4
mov esi, 8
@@:
shr eax, 1
sbb edx, edx
and edx, 0EDB88320h
xor eax, edx
dec esi
jnz @B
inc ecx
mov [edi], eax
cmp ecx, 256
jb _ml
ret
ThomasReadiosys_ct endp
As you can see, the only change is that edx is "anded" with an immediate value instead of a register... so the line "mov ebx, 0EDB88320h" from the original Thomas' version disappears...
It seems to be a little faster because of that...
Thomas1_ct (test.asm, 18)
PROFILECYCLES = 8731 (test.asm, 20)
ThomasReadiosys_ct (test.asm, 22)
PROFILECYCLES = 7709 (test.asm, 24)
Make finished.
More than 1000 cycles less.
I also tested a stosd based version to fill the table but it is slower than the Thomas' original version.
I will try to have a closer look after some sleep... ;)
PS : The results are from my 1.2 GHz Athlon TB and from the excellent Maverick's PROFILEr... I will try to test it tomorrow on my P4 1.6 GHz Notebook...
*EDIT* : It seems to be slower on the P4 than the original Thomas version (~1000 cycles) but faster on my 450 MHz Celeron (~250 cycles).
I attach a test package, as the Maverick's tool seems to be more adapted to benchmark the table generation routine.
Thanks for your attention and remarks.
Nice code Nexo, but:
- u can remove add esi,4 from main loop and work with:
xor eax, ; ecx-> negative value esi->end of buffer
- remove lea esi,
- u can remove add esi,4 from main loop and work with:
xor eax, ; ecx-> negative value esi->end of buffer
- remove lea esi,
buliaNaza, I dont remove add esi,4, becouse it will be wrong CRC32 & burst mode off:
Posted on 2002-04-06 00:51:22 by Nexo
Posted on 2002-04-06 00:51:22 by Nexo
Nice little boost in speed with prefetch.
Don't waste your Athlon - use prefetch. ;)
Thanks for the algo, Nexo.
Don't waste your Athlon - use prefetch. ;)
Thanks for the algo, Nexo.
CRC MACRO
movzx edx,al
shr eax,8
xor eax,[edi+4*edx]
ENDM
CACHELINE EQU 64
DO_LINES EQU 2
NexoBit PROC uses esi edi ebx, lpData:DWORD,DataSize:DWORD
mov esi,[lpData]
mov edi,offset CRCtable
mov ebx,[DataSize]
and [DataSize],3
and ebx,-4
or eax,-1
sub ebx,CACHELINE * DO_LINES
jl j9
j1:
add esi,CACHELINE * DO_LINES ; bytes ahead
mov ecx,0-(CACHELINE*DO_LINES)/4 ; DWORDS
i=0
WHILE i NE DO_LINES
i=i+1
prefetchnta [esi + CACHELINE * (i-DO_LINES+2)]
ENDM
j2: xor eax,[esi+ecx*4]
CRC
CRC
CRC
CRC
inc ecx
jne j2
sub ebx,CACHELINE * DO_LINES
jge j1
j9:
add ebx,CACHELINE * DO_LINES
add esi,ebx
sub ecx,ebx
and ebx,0
sar ecx,2
jne j2
mov ecx,[DataSize]
mov edx,[tblmsk + 4*ecx]
and edx,[esi]
xor eax,edx
lea esi,[esi+4+ecx]
jmp [tblunr + 4*ecx]
align 8 ; should be align 64
unr::
CRC
unr_d::
CRC
CRC
not eax
ret
.data
align 8 ; should be align 32
tblmsk dd 0,0FFh,0FFFFh,0FFFFFFh
u_delta = unr_d-unr
tblunr dd unr+3*u_delta,unr+2*u_delta,unr+1*u_delta,unr
.code
NexoBit ENDP
BitRAKE, ?refetch here it is useless owing to specificity of looping. You can be convinced of it under the test ;)
How many you have received a scoring from prefetch?
How many you have received a scoring from prefetch?
Hi Nexo,
I say over you can..
I say over you can..
mov ebx,[DataSize]
or eax,-1
mov esi,[lpData]
mov edi,offset CRCtable
cmp ebx,4
jl @@b
mov ecx,ebx
and ecx,-4
add esi,ecx
sub ebx,ecx
neg ecx
@@Loop:
xor eax,[esi+ecx]
CRC
CRC
CRC
CRC
add ecx,4
jl @@Loop
@@b:
;test ebx, ebx
;jz exit
mov edx,[tblmsk+4*ebx]
and edx,[esi]
xor eax,edx
jmp [tblunr+4*ebx]
align 8
tblmsk dd 0, 0FFh, 0FFFFh, 0FFFFFFh
u_delta = unr_d-unr
tblunr dd unr+3*u_delta, unr+2*u_delta, unr+1*u_delta, unr
unr: CRC
unr_d: CRC
CRC
;exit:
not eax
ret
Pentium 4 1400MHz
[size=9]
Nico : [F3032BB4], 1021 ms [10x10MB], 97.94 MB/s 493776 ticks(!) for table.
Thomas1 : [F3032BB4], 501 ms [10x10MB], 199.60 MB/s 239124 ticks(!) for table.
Thomas2 : [F3032BB4], 470 ms [10x10MB], 212.76 MB/s 238784 ticks(!) for table.
Adam Stanislav : [F3032BB4], 761 ms [10x10MB], 131.40 MB/s 268 ticks(!) for table.
Jay : [F3032BB4], 711 ms [10x10MB], 140.64 MB/s 472528 ticks(!) for table.
Nexo : [F3032BB4], 501 ms [10x10MB], 199.60 MB/s 182564 ticks(!) for table.
NexoBit : [F3032BB4], 511 ms [10x10MB], 195.69 MB/s 183100 ticks(!) for table.
[/SIZE]
Maybe, there is something wrong with my machine?
eax = Nexo : [88ABBE8A], 842 ms [64x2MB], 152.01 MB/s
eax = NexoBit : [88ABBE8A], 571 ms [64x2MB], 224.16 MB/s
eax = Nexo : [88ABBE8A], 3294 ms [256x2MB], 155.43 MB/s
eax = NexoBit : [88ABBE8A], 2203 ms [256x2MB], 232.41 MB/s
eax = NexoBit : [88ABBE8A], 2203 ms [256x2MB], 232.41 MB/s
eax = NexoBit : [88ABBE8A], 2203 ms [256x2MB], 232.41 MB/s
Athlon TB 1333, P4 sucks :)Hi, buliaNaza. The code became longer, speed the same.
Here even prefetch does not help.
Complete stupor of the processor on dependences.
Here even prefetch does not help.
Complete stupor of the processor on dependences.
BitRAKE, I have put same 64x2Mb parameters.
Thomas1 : [7E2CDFAB], 531 ms [64x2MB], 241.05 MB/s
Thomas2 : [7E2CDFAB], 594 ms [64x2MB], 215.48 MB/s
Adam Stanislav : [7E2CDFAB], 922 ms [64x2MB], 138.82 MB/s
Jay : [7E2CDFAB], 734 ms [64x2MB], 174.38 MB/s
Nexo : [7E2CDFAB], 531 ms [64x2MB], 241.05 MB/s
NexoBit : [7E2CDFAB], 532 ms [64x2MB], 240.60 MB/s
Thomas1 : [7E2CDFAB], 2172 ms [256x2MB], 235.72 MB/s
Thomas2 : [7E2CDFAB], 2328 ms [256x2MB], 219.93 MB/s
Adam Stanislav : [7E2CDFAB], 3735 ms [256x2MB], 137.08 MB/s
Jay : [7E2CDFAB], 2985 ms [256x2MB], 171.52 MB/s
Nexo : [7E2CDFAB], 2203 ms [256x2MB], 232.41 MB/s
NexoBit : [7E2CDFAB], 2203 ms [256x2MB], 232.41 MB/s
I understand what your say Nexo, but as the processor speed increases, this does not matter - the memory latency will put an upper limit on the algo. I agree that prefetch is not a big plus here, but it helps. Yes, the DDR memory matches the processor speed, but try it on a 2.2Ghz XP with DDR - the difference will be more appearant. My only tests have been on an Athlon, but I feel the same argument would hold for faster P$'s.
Why is CRC32 to wonderful anyhow? I've seen other checksum algos that work just fine and are only a few lines of code. CRC seems like overkill to me.
The point is to generate a unique signature from a block of data, right? I can think of about 20 different ways to do that that isn't so cumbersome.
Maybe I have the wrong ideas about CRC. Somebody enlighten me please :D
The point is to generate a unique signature from a block of data, right? I can think of about 20 different ways to do that that isn't so cumbersome.
Maybe I have the wrong ideas about CRC. Somebody enlighten me please :D
I understand what your say Nexo, but as the processor speed increases, this does not matter - the memory latency will put an upper limit on the algo. I agree that prefetch is not a big plus here, but it helps.
Correctly you think :)
Yes, the DDR memory matches the processor speed, but try it on a 2.2Ghz XP with DDR - the difference will be more appearant.
While there is no possibility to test XP 2200+ (1800MHz). On the other hand, if the code of cycle worked more slowly the difference should be more? Try it:
CRC macro
push ebp
REPT 11
imul ebp,9999999
ENDM
pop ebp
movzx edx,al
shr eax,8
xor eax,[CRCtable+4*edx]
endm
Nexo : [7E2CDFAB], 16640 ms [256x2MB], 30.76 MB/s
NexoBit : [7E2CDFAB], 17329 ms [256x2MB], 29.54 MB/s
I somewhere wander? ;)
My only tests have been on an Athlon, but I feel the same argument would hold for faster P$'s.
The Pentium 4 has hardware prefetch. The tested Pentium 4 had DDR333 ;)
I tested on XP 1533MHz, but the trend has remained the same :(
Where here 10 GHz processers? :)