Howdy.
Just finished a large-buffer memcpy (with size and alignment restrictions). It's optimized for Athlon XPs and pretty fast: 2 to 3 times the throughput of VC7's code.
Any comments, thoughts, improvements? :)

BTW, what would you all use for a 'general' version? I'm willing to assume PII+.


#define CHUNK_SIZE 4096
#define CHUNK_SHIFT 12

/*
* block prefetch memcpy for large, uncached arrays
*
* src and len must be multiples of CHUNK_SIZE.
*/
__declspec(naked) void memcpy_nt(void* dst, void* src, int len)
{
__asm
{
push esi

mov edx, [esp+4+4]
mov esi, [esp+4+8]
mov ecx, [esp+4+12]
shr ecx, CHUNK_SHIFT ; # of chunks
; (counting chunks down with dec ecx is smaller code than sub ecx, CHUNK_SIZE per iteration)
main_loop:

; prefetch: touch each cache line in chunk
; (backwards to prevent hardware prefetches)
add esi, CHUNK_SIZE
prefetch_loop:
mov eax, [esi-64]
mov eax, [esi-128]
sub esi, 128
test esi, (CHUNK_SIZE-1)
jnz prefetch_loop

; copy the chunk 64 bytes at a time
write_loop:
movq mm0, [esi]
movq mm1, [esi+8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64
test esi, (CHUNK_SIZE-1)
movntq [edx], mm0
movntq [edx+8], mm1
movntq [edx+16], mm2
movntq [edx+24], mm3
movntq [edx+32], mm4
movntq [edx+40], mm5
movntq [edx+48], mm6
movntq [edx+56], mm7
lea edx, [edx+64] ; leave flags intact
jnz write_loop

dec ecx
jnz main_loop

sfence
emms

pop esi
ret
}
}
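
For anyone who wants to try it, here's a minimal calling sketch (my example only, not part of the routine; VirtualAlloc is used simply because it returns page-aligned memory, which satisfies the alignment requirement):

#include <windows.h>
#include <string.h>

void memcpy_nt(void* dst, void* src, int len);        /* the routine above */

int main(void)
{
    const int len = 64 * CHUNK_SIZE;                  /* len must be a multiple of CHUNK_SIZE */

    /* page-aligned allocations satisfy the alignment requirement */
    void* src = VirtualAlloc(0, len, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
    void* dst = VirtualAlloc(0, len, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);

    memset(src, 0x55, len);                           /* fill the source with a test pattern */
    memcpy_nt(dst, src, len);

    VirtualFree(src, 0, MEM_RELEASE);
    VirtualFree(dst, 0, MEM_RELEASE);
    return 0;
}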
Posted on 2003-04-24 20:08:41 by Jan Wassenberg
Jan,

It looks like a good technique for later processors. Have you benchmarked it against the older "rep movsd" style memory copy functions to see how much faster it is? It would be a useful reference for people who are interested in this type of algo.

Regards,

hutch@movsd.com
Posted on 2003-04-24 22:33:49 by hutch--
It's not really a fair contest - this code makes good use of the cache, and blows movsd out of the water for large arrays:

Measured: throughput. Test system: Athlon XP 1600+, 2/2/2 RAM.


size [KB]    128    160    192    256    320    384, 512, 1024, 2048
memcpy       1800   1100   300    350    350    350..400 (varies)
memcpy_nt    1450   1450   1450   1450   1350   950


(rep movsd performance is very similar to memcpy)
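
For reference, the rep movsd baseline is just the usual dword copy, something like this sketch (assuming len is a multiple of 4):

void memcpy_movsd(void* dst, void* src, int len)
{
    __asm
    {
        mov edi, dst
        mov esi, src
        mov ecx, len
        shr ecx, 2        ; dword count
        cld               ; copy forwards
        rep movsd
    }
}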

Note to self: turn off the fileserver before getting despondent over the newest benchmark results... :)

/* edit: fixed table, movsd = memcpy */
Posted on 2003-04-25 06:04:32 by Jan Wassenberg
Jan,

Compliments on the testing, these are very good results and it shows that your algorithm is very fast.

Thanks for posting the algo in here.

Regards,

hutch@movsd.com
Posted on 2003-04-25 10:10:31 by hutch--
Thanks, and no problem.

Can anyone comment on their general memcpy? This one is page-granular and only makes sense for large (> 128 KB) arrays. I don't know whether I should go with the FPU (8-byte alignment/increment is OK, and no emms needed), MMX (supports non-temporal moves, but I don't know if that is beneficial here), or a simple unrolled loop (maximum compatibility).
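
To make the FPU option concrete, it would look roughly like this (an untested sketch, assuming 8-byte aligned buffers and len a multiple of 64; fild/fistp round-trips any 64-bit pattern exactly, and no emms is needed):

void memcpy_fpu(void* dst, void* src, int len)
{
    __asm
    {
        mov esi, src
        mov edi, dst
        mov ecx, len
        shr ecx, 6                    ; 64 bytes per iteration
    copy_loop:
        fild qword ptr [esi]          ; push 8 qwords onto the FP stack...
        fild qword ptr [esi+8]
        fild qword ptr [esi+16]
        fild qword ptr [esi+24]
        fild qword ptr [esi+32]
        fild qword ptr [esi+40]
        fild qword ptr [esi+48]
        fild qword ptr [esi+56]
        fistp qword ptr [edi+56]      ; ...and pop them in reverse order
        fistp qword ptr [edi+48]
        fistp qword ptr [edi+40]
        fistp qword ptr [edi+32]
        fistp qword ptr [edi+24]
        fistp qword ptr [edi+16]
        fistp qword ptr [edi+8]
        fistp qword ptr [edi]
        add esi, 64
        add edi, 64
        dec ecx
        jnz copy_loop
    }
}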
Posted on 2003-04-25 12:54:13 by Jan Wassenberg
Looks good to me. I don't know that much about prefetching and even less about MMX instructions.

I'd be interested to see a similar implementation of memmove() though. Hint hint. :grin:
Posted on 2003-04-25 13:17:42 by iblis


MMX memcpy (95 MB/s)

_asm {
mov esi, src
mov edi, dest
mov ecx, nbytes
shr ecx, 6 // 64 bytes per iteration

loop1:
movq mm1, 0[ESI] // Read in source data
movq mm2, 8[ESI]
movq mm3, 16[ESI]
movq mm4, 24[ESI]
movq mm5, 32[ESI]
movq mm6, 40[ESI]
movq mm7, 48[ESI]
movq mm0, 56[ESI]

movq 0[EDI], mm1 // Write to destination
movq 8[EDI], mm2
movq 16[EDI], mm3
movq 24[EDI], mm4
movq 32[EDI], mm5
movq 40[EDI], mm6
movq 48[EDI], mm7
movq 56[EDI], mm0

add esi, 64
add edi, 64
dec ecx
jnz loop1

emms
}

MMX memcpy with non-temporal stores (135 MB/s)

_asm {
mov esi, src
mov edi, dest
mov ecx, nbytes
shr ecx, 6 // 64 bytes per iteration

loop1:
movq mm1, 0[ESI] // Read in source data
movq mm2, 8[ESI]
movq mm3, 16[ESI]
movq mm4, 24[ESI]
movq mm5, 32[ESI]
movq mm6, 40[ESI]
movq mm7, 48[ESI]
movq mm0, 56[ESI]

movntq 0[EDI], mm1 // Non-temporal stores
movntq 8[EDI], mm2
movntq 16[EDI], mm3
movntq 24[EDI], mm4
movntq 32[EDI], mm5
movntq 40[EDI], mm6
movntq 48[EDI], mm7
movntq 56[EDI], mm0

add esi, 64
add edi, 64
dec ecx
jnz loop1

emms
}

MMX memcpy with prefetch and non-temporal stores (165 MB/s)

_asm {
mov esi, src
mov edi, dest
mov ecx, nbytes
shr ecx, 6 // 64 bytes per iteration

loop1:
prefetchnta 64[ESI] // Prefetch next loop, non-temporal
prefetchnta 96[ESI]

movq mm1, 0[ESI] // Read in source data
movq mm2, 8[ESI]
movq mm3, 16[ESI]
movq mm4, 24[ESI]
movq mm5, 32[ESI]
movq mm6, 40[ESI]
movq mm7, 48[ESI]
movq mm0, 56[ESI]

movntq 0[EDI], mm1 // Non-temporal stores
movntq 8[EDI], mm2
movntq 16[EDI], mm3
movntq 24[EDI], mm4
movntq 32[EDI], mm5
movntq 40[EDI], mm6
movntq 48[EDI], mm7
movntq 56[EDI], mm0

add esi, 64
add edi, 64
dec ecx
jnz loop1

emms
}


MMX memcpy with prefetch, non-temporal stores, and streaming reads/writes (240 MB/s)

_asm {
mov esi, src
mov ecx, nbytes
mov ebx, ecx
shr ebx, 11 // 2048 bytes at a time
mov edi, dest

loop2k: // Copy 2k into temporary buffer
push edi
mov edi, tbuf
mov ecx, 2048
shr ecx, 6

loopMemToL1:
prefetchnta 64[ESI] // Prefetch next loop, non-temporal
prefetchnta 96[ESI]

movq mm1, 0[ESI] // Read in source data
movq mm2, 8[ESI]
movq mm3, 16[ESI]
movq mm4, 24[ESI]
movq mm5, 32[ESI]
movq mm6, 40[ESI]
movq mm7, 48[ESI]
movq mm0, 56[ESI]

movq 0[EDI], mm1 // Store into L1
movq 8[EDI], mm2
movq 16[EDI], mm3
movq 24[EDI], mm4
movq 32[EDI], mm5
movq 40[EDI], mm6
movq 48[EDI], mm7
movq 56[EDI], mm0
add esi, 64
add edi, 64
dec ecx
jnz loopMemToL1

pop edi // Now copy from L1 to system memory
push esi
mov esi, tbuf
mov ecx, 2048
shr ecx, 6

loopL1ToMem:
movq mm1, 0[ESI] // Read in source data from L1
movq mm2, 8[ESI]
movq mm3, 16[ESI]
movq mm4, 24[ESI]
movq mm5, 32[ESI]
movq mm6, 40[ESI]
movq mm7, 48[ESI]
movq mm0, 56[ESI]

movntq 0[EDI], mm1 // Non-temporal stores
movntq 8[EDI], mm2
movntq 16[EDI], mm3
movntq 24[EDI], mm4
movntq 32[EDI], mm5
movntq 40[EDI], mm6
movntq 48[EDI], mm7
movntq 56[EDI], mm0

add esi, 64
add edi, 64
dec ecx
jnz loopL1ToMem

pop esi // Do next 2k block
dec ebx
jnz loop2k
}
Posted on 2003-04-25 18:16:22 by lingo12
Random thoughts:

1. Jan, doesn't the Athlon support the prefetch family of instructions?

2. I was in a flame-war-like situation with this topic about 2 years ago elsewhere. I was more or less skeptical about the efficiency bound of MMX/SSE-enhanced memcpy/memmove, and that invited a whole lot of flame throwers. But I'm not convinced yet. How large does the third arg to memcpy() have to be in order for this routine to be dominantly faster?

3. I tried a similar thing after the aforementioned flamewar. My test showed that movntq and sfence cost more than simple movq on my Pentium III. Since then, I don't use movntq in my code. Is movntq much better on the Athlon or Pentium 4?

4. Maybe moderators want to set up a subforum about memcpy/memmove/memset/BitBlt/strlen/strcpy. This topic pops up in this board frequently. I think that is natural because they are a focal point of optimization. What about directing all those efforts to one subforum so that people do not re-invent wheels over and over again?
Posted on 2003-04-25 19:25:37 by Starless
Lingo,

Great stuff. :alright:

Regards,

hutch@movsd.com
Posted on 2003-04-25 22:25:16 by hutch--
Thanks for all replies.

Iblis:
> I'd be interested to see a similar implementation of memmove() though.
On first glance, looks difficult ;) Will give this some thought...

lingo12:
Looks like our methods are similar. Your last code (modified a tiny bit for VC7's masm) gets the following numbers in my benchmark:
128, 160, 192, 256: 1340
320: 980..1020
384, 512, 1024, 2048: 580
Why are you copying twice? To prefetch (load into cache), all you need to do is touch each cache line once (one read per line, as in the prefetch_loop in my code above).

Starless:
> 1. Jan, doesn't Athlon support prefetch family of instructions?
Yes. The problem with the prefetch instructions is that they can be ignored, and only 3 can be in flight at any time (but I want to start reading 4 KB).

>2. I was in a flame-war-like situation with this topic about 2 years ago elsewhere. I was more or less skeptical about the efficiency bound of MMX/SSE-enhanced memcpy/memmove, and that invited a whole lot of flame throwers. But I'm not convinced yet. How large does the third arg to memcpy() have to be in order for this routine to be dominantly faster?<
Convince yourself: check the numbers :) This method is significantly faster when the arrays don't fit in cache (transfer size > 128 KB).

>3. I tried a similar thing after the aforementioned flamewar. My test showed that movntq and sfence cost more than simple movq on my Pentium III. Since then, I don't use movntq in my code. Is movntq much better on the Athlon or Pentium 4?<
movntq is rated at 3 clocks (Athlon XP); I don't see any problems. What transfer sizes did you test? Were the buffers already in cache?
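
For reference, this is roughly how I'd measure it (a simplified single-pass sketch, so take any one number with a grain of salt; for len well above the L2 size the buffers do not stay cached):

#include <windows.h>
#include <string.h>

double measure_mb_per_s(void (*copy)(void*, void*, int), int len)
{
    LARGE_INTEGER freq, t0, t1;
    void* src = VirtualAlloc(0, len, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
    void* dst = VirtualAlloc(0, len, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);

    memset(src, 1, len);              /* touch the pages so they are actually committed */
    QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&t0);
    copy(dst, src, len);
    QueryPerformanceCounter(&t1);

    VirtualFree(src, 0, MEM_RELEASE);
    VirtualFree(dst, 0, MEM_RELEASE);
    /* bytes per second, converted to MB/s */
    return (double)len * freq.QuadPart / (double)(t1.QuadPart - t0.QuadPart) / (1024.0 * 1024.0);
}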

>4. Maybe moderators want to set up a subforum about memcpy/memmove/memset/BitBlt/strlen/ strcpy. This topic pops up in this board frequently. I think that is natural because they are a focal point of optimization. What about directing all those efforts to one subforum so that people do not re-invent wheels over and over again?<
Good idea, I'm all for it! :)
Posted on 2003-04-26 09:57:26 by Jan Wassenberg

4. Maybe moderators want to set up a subforum about memcpy/memmove/memset/BitBlt/strlen/strcpy. This topic pops up in this board frequently. I think that is natural because they are a focal point of optimization. What about directing all those efforts to one subforum so that people do not re-invent wheels over and over again?


Just pick suitable thread names (or ask a moderator to change them afterwards) and learn to use the search engine ;)
Posted on 2003-04-26 20:31:13 by bazik
Jan,

What is probably a good idea is your original offer of designing a block memory copy routine that does not have to run on a PIII or later, as many people are still using older boxes and will be for a long time to come.

The pretouching technique that you used looks like it will work on older hardware without assuming the PIII/P4 prefetch instructions, so it would be very useful there.

It is very easy to translate the code you have posted into masm format, and it could then be used by a far larger number of people.

Regards,

hutch@movsd.com
Posted on 2003-04-27 00:17:40 by hutch--
For wheel inventors:
"Using Block Prefetch for Optimized Memory Performance"
http://cdrom.amd.com/devconn/events/AMD_block_prefetch_paper.pdf
"AMD Athlon Processor x86 Code Optimization Guide TM"
22007.pdf
"Intel? Pentium? 4 Processor Optimization"
24896602.pdf
Posted on 2003-04-27 03:51:44 by Nexo
Nexo,

Thanks, its a good article.

Regards,

hutch@movsd.com
Posted on 2003-04-27 04:46:09 by hutch--
Hutch: the above code is in masm format, even if I've avoided the red tape :grin:

Nexo: yes, I'm aware of the block prefetch paper (hence the name ;)). This code is smaller+faster.
Posted on 2003-04-28 09:17:41 by Jan Wassenberg
Jan,

You will have to forgive me, but at the moment I only have about half a brain left, and it's working on parsing design, not high-speed block copy.

When you offered to develop a version that would work on a PII, I was interested because many people don't have the latest whizz-bang hardware, and your pretouching design looked like it could be used to get good speed increases on older machines over the standard REP MOVSD style of code.

If you do have the time to put together a version that would work on an old timer, I am sure it would be appreciated by many people who don't own late model hardware.

Regards,

hutch@movsd.com
Posted on 2003-04-28 23:34:02 by hutch--
Starless, movNTq is great under certain circumstances, though I was skeptical that I would ever get to use it.

That day came... In my code I first init a huge Transposition Table (64 bytes * 1 million items).
I reinit this TT several times in my program: I zero a few bytes for each item before the TT can be reused.

I simply changed the movq stores to movNTq stores... the copy now takes 0.05 s instead of 0.5 s on my PIII 650 MHz (256 KB L2). I could hardly believe it!

So when it is huge and out of cache, do not pollute it...
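
For illustration, the kind of loop I mean looks roughly like this (a simplified full-block clear, not my actual TT code, which only zeroes a few bytes per item):

void clear_nt(void* dst, int len)     /* assumes len is a multiple of 64 and dst is 8-byte aligned */
{
    __asm
    {
        mov edi, dst
        mov ecx, len
        shr ecx, 6                    ; 64 bytes per iteration
        pxor mm0, mm0                 ; mm0 = 0
    clear_loop:
        movntq [edi], mm0             ; was: movq [edi], mm0
        movntq [edi+8], mm0
        movntq [edi+16], mm0
        movntq [edi+24], mm0
        movntq [edi+32], mm0
        movntq [edi+40], mm0
        movntq [edi+48], mm0
        movntq [edi+56], mm0
        add edi, 64
        dec ecx
        jnz clear_loop
        sfence
        emms
    }
}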

Thx for the code, Jan&lingo12
Posted on 2003-05-05 06:00:09 by valy
Sorry for the late reply; my internet connection was down :(

hutch--: my wording was probably poor. I was asking for advice on the general version, which should work on older (K6, PII) CPUs and assume less.

Yeah, I suppose block prefetching is applicable to older processors.

> If you do have the time to put together a version that would work on an old timer, I am sure it would be appreciated by many people who don't own late model hardware. <
Working on it, very busy at the moment.
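
In the meantime, the rough direction is the same structure with plain MMX only, so it should run on anything from a K6/PII up. This is an untested sketch (the 32-byte touch stride and the name memcpy_block are mine; alignment assumptions are the same as for memcpy_nt):

#define CHUNK_SIZE  4096              /* as in the first post */
#define CHUNK_SHIFT 12

/* block-pretouch copy for pre-SSE CPUs: same idea as memcpy_nt, but with plain
   movq stores instead of movntq (src CHUNK_SIZE-aligned, len a multiple of CHUNK_SIZE) */
void memcpy_block(void* dst, void* src, int len)
{
    __asm
    {
        mov edi, dst
        mov esi, src
        mov ecx, len
        shr ecx, CHUNK_SHIFT          ; # of chunks

    chunk_loop:
        ; pretouch the chunk: one read per 32-byte line (P6/K6 line size), backwards
        add esi, CHUNK_SIZE
    touch_loop:
        mov eax, [esi-32]
        mov eax, [esi-64]
        sub esi, 64
        test esi, (CHUNK_SIZE-1)
        jnz touch_loop

        ; copy the chunk, 64 bytes at a time
    copy_loop:
        movq mm0, [esi]
        movq mm1, [esi+8]
        movq mm2, [esi+16]
        movq mm3, [esi+24]
        movq mm4, [esi+32]
        movq mm5, [esi+40]
        movq mm6, [esi+48]
        movq mm7, [esi+56]
        movq [edi], mm0
        movq [edi+8], mm1
        movq [edi+16], mm2
        movq [edi+24], mm3
        movq [edi+32], mm4
        movq [edi+40], mm5
        movq [edi+48], mm6
        movq [edi+56], mm7
        add esi, 64
        add edi, 64
        test esi, (CHUNK_SIZE-1)
        jnz copy_loop

        dec ecx
        jnz chunk_loop

        emms
    }
}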

valy: hehe, nice :)
Glad to.
Posted on 2003-05-06 05:50:30 by Jan Wassenberg