prev thread:
memset?



CopyMemD proc uses esi edi source:DWORD, dest:DWORD, len:DWORD

; ---------------------------------------------------------
; Copy len bytes from source to dest (forward copy).
;
; In:    source = address of the first byte to read
;        dest   = address of the first byte to write
;        len    = number of bytes to copy (unsigned, may be 0)
; Clobb: ecx, edx, flags (DF is left clear)
; Saved: esi, edi (via USES)
;
; The pointers are loaded on every path, so lengths below 4
; no longer reach REP MOVSB with undefined ESI/EDI.
; ---------------------------------------------------------

    cld                         ; string ops must walk upwards
    mov     esi, source
    mov     edi, dest
    mov     ecx, len
    mov     edx, ecx            ; keep the full byte count for the tail

    shr     ecx, 2              ; ecx = number of whole DWORDs
    rep     movsd               ; no-op when ecx = 0

    mov     ecx, edx
    and     ecx, 3              ; ecx = len mod 4 = remaining bytes
    rep     movsb               ; no-op when ecx = 0

    ret

CopyMemD endp


At least 3 problems in the above proc (apart from the commented-out ;shr ecx):
1. source & dest can be zero or otherwise invalid.
But that's OK. Assume they are valid pointers.
2. The direction flag is never set or cleared. In which direction do we copy?
3. if (len < 4) -> CRASH! Because esi & edi are never loaded on that path.

Too many bugs. And this proc is in masm32 package!
\masm32\PROCS\Memcopy.asm

ZeroMemD in the attach of thread above has the same bugs.

The procs below are smaller and a bit faster on medium-size blocks.



fCopyMemory proc pSour:LPVOID, pDest:LPVOID, cbLen:UINT

; (c) Four-F, 2002
; [email]four-f@mail.ru[/email]
;
; Copies cbLen bytes from pSour to pDest, DWORDs first and then
; the 0..3 leftover bytes. pSour and pDest are assumed valid.
;
; No frame is generated, so the arguments are read relative to ESP.
; After the two pushes below the stack holds:
;   [esp+0] saved edi   [esp+4]  saved esi   [esp+8]  return address
;   [esp+12] pSour      [esp+16] pDest       [esp+20] cbLen
; Clobbers ecx, edx, flags; preserves esi, edi; leaves DF clear.

option PROLOGUE:NONE
option EPILOGUE:NONE

    push    esi
    push    edi

    cld                                 ; copy upwards
    mov     esi, [esp + 3 * sizeof DWORD]   ; pSour
    mov     edi, [esp + 4 * sizeof DWORD]   ; pDest
    mov     ecx, [esp + 5 * sizeof DWORD]   ; cbLen
    mov     edx, ecx                    ; edx = full byte count

    shr     ecx, 2                      ; ecx = whole DWORDs
    jz      @F                          ; fewer than 4 bytes: edx is already the tail
    rep     movsd
    and     edx, 3                      ; edx = leftover bytes
@@:
    mov     ecx, edx
    rep     movsb

    pop     edi
    pop     esi

    ret     3 * sizeof DWORD            ; stdcall: drop the three arguments

option PROLOGUE:PROLOGUEDEF
option EPILOGUE:EPILOGUEDEF

fCopyMemory endp




fZeroMemory proc pDest:LPVOID, cbLen:UINT

; (c) Four-F, 2002
; [email]four-f@mail.ru[/email]
;
; Fills cbLen bytes at pDest with zero, DWORDs first and then the
; 0..3 leftover bytes. pDest is assumed valid.
;
; No frame is generated, so the arguments are read relative to ESP.
; After the push below the stack holds:
;   [esp+0] saved edi   [esp+4] return address
;   [esp+8] pDest       [esp+12] cbLen
; Returns eax = 0. Clobbers ecx, edx, flags; preserves edi; DF clear.

option PROLOGUE:NONE
option EPILOGUE:NONE

    push    edi

    cld                                 ; fill upwards
    xor     eax, eax                    ; fill value
    mov     edi, [esp + 2 * sizeof DWORD]   ; pDest
    mov     ecx, [esp + 3 * sizeof DWORD]   ; cbLen
    mov     edx, ecx                    ; edx = full byte count

    shr     ecx, 2                      ; ecx = whole DWORDs
    jz      @F                          ; fewer than 4 bytes: edx is already the tail
    rep     stosd
    and     edx, 3                      ; edx = leftover bytes
@@:
    mov     ecx, edx
    rep     stosb

    pop     edi

    ret     2 * sizeof DWORD            ; stdcall: drop the two arguments

option PROLOGUE:PROLOGUEDEF
option EPILOGUE:EPILOGUEDEF

fZeroMemory endp


Hope it will be useful for someone.
Posted on 2002-07-07 09:01:13 by Four-F
This is the version in the MASM32 library, short of MMX or SIMD with prefetch, I doubt it can be done faster with integer instructions.

Regards,

hutch@movsd.com


; #########################################################################

MemCopy proc public uses esi edi Source:PTR BYTE,Dest:PTR BYTE,ln:DWORD

  ; -------------------------------------------------------------
  ; MemCopy - copy ln bytes from the Source buffer to the Dest
  ;           buffer, moving upwards through memory.
  ;
  ; Example:
  ;     invoke MemCopy,ADDR Source,ADDR Dest,4096
  ;
  ; The Dest buffer has to be at least ln bytes long; writing
  ; past its end can raise a page fault.
  ;
  ; esi and edi are saved by USES; ecx and the flags are changed
  ; and the direction flag is left clear.
  ; -------------------------------------------------------------

    mov     ecx, [ln]
    mov     edi, [Dest]
    mov     esi, [Source]
    cld                         ; ascending addresses

    shr     ecx, 2              ; whole DWORDs first
    rep     movsd

    mov     ecx, [ln]
    and     ecx, 3              ; then the 0..3 bytes left over
    rep     movsb

    ret

MemCopy endp
; #########################################################################
Posted on 2002-07-07 10:40:09 by hutch--
A bit faster and two bytes smaller ;-)
If integer instructions are all you have.



fCopyMemory proc pSour:LPVOID, pDest:LPVOID, cbLen:UINT

; (c) Four-F, 2002
; [email]four-f@mail.ru[/email]
; sizeof 32 bytes
;
; Copies cbLen bytes from pSour to pDest, DWORDs first and then the
; 0..3 leftover bytes. No frame is generated: ESP is snapshotted into
; EAX after the two register saves, so the later PUSH of the byte
; count does not shift the argument offsets.
;   [eax+0] saved edi   [eax+4]  saved esi   [eax+8]  return address
;   [eax+12] pSour      [eax+16] pDest       [eax+20] cbLen
; Clobbers eax, ecx, flags; preserves esi, edi; leaves DF clear.

option PROLOGUE:NONE
option EPILOGUE:NONE

    push    esi
    push    edi

    mov     eax, esp                        ; stable base for the arguments
    mov     esi, [eax + 3 * (sizeof DWORD)] ; pSour
    mov     edi, [eax + 4 * (sizeof DWORD)] ; pDest
    mov     ecx, [eax + 5 * (sizeof DWORD)] ; cbLen
    cld                                     ; copy upwards

    push    ecx                             ; remember the full byte count
    shr     ecx, 2                          ; whole DWORDs
    rep     movsd
    pop     ecx
    and     ecx, 3                          ; leftover bytes
    rep     movsb

    pop     edi
    pop     esi

    ret     3 * (sizeof DWORD)              ; stdcall: drop the three arguments

option PROLOGUE:PROLOGUEDEF
option EPILOGUE:EPILOGUEDEF

fCopyMemory endp
Posted on 2002-07-08 04:58:04 by Four-F
How about a forum devoted to copy memory routines? I think we had like a 100 posts about this topic already!
Posted on 2002-07-08 12:13:28 by comrade
Four-F,

Why do you mark all of your posted source sampes (c) Every thing i seen by you got that hint. NaDa.... Save the hints...

What's the point if you sharing it with fellow programmers at this board. Nine out of ten me, myself or eye wrote something like it or better many moons agoooooooo.

Just wondering

comrade, Posted on 2002-07-08 17:40:12 by cmax
How about a forum devoted to copy memory routines? I think we had like a 100 posts about this topic already!


Similar idea here. What about making memcpy/memset (or Win32 API equivalent) thread sticky? I was this close to posting the same thing (only difference was I made mine in macro). But one quick search about any previous posting revealed that the same code (although not in macro) has been posted over and over again. I gave up at that point. I was rebuilding (not reinventing) the wheel. I felt so stupid. :stupid:

Considering that newbies tend to post whatever they have in mind before doing any search on the topic, why don't we make the thread sticky, so that the topic stands out without search?

After all, memcpy/BitBlt is a focal point of speed improvement. So there is nothing surprising why people are interested in it. :)
Posted on 2002-07-08 18:41:09 by Starless
A FAQ would be much better, oops I think there is one already at the Test section. :grin:
Posted on 2002-07-08 18:47:28 by stryker
I'd say... don't check for NULL pointers on these routines;
better to crash, as that makes tracking down bugs rather
fast (much easier than stuff not being done :)). Assume
direction flag is clear, as that's what windows wants.
Routines that set the direction flag should clear it before
returning. After all, you want a clear direction most of
the time when dealing with string instructions. Assumption
on minimum length and transfer block size... well, a generic
routine ought not have these, and ought not overwrite even
a single byte of memory; routines for large copies can
have other assumptions.
Posted on 2002-07-08 19:12:40 by f0dder
; (c) bitRAKE

; [email]Rickey.Bowers@attbi.com[/email]
; 27 bytes
; Frameless copy body. On entry the stack holds, from the top down:
; return address, byte count, dest pointer, src pointer.
; The routine removes its own arguments and returns with JMP instead of RET.
pop edx ; return address, kept in edx for the final jmp
pop eax ; count

mov ecx, eax
and eax, 011y ; eax = count mod 4 (011y is binary 3): the byte tail

shr ecx, 2 ; ecx = number of whole DWORDs
; cld ; left out on purpose: the direction flag is assumed clear on entry

; Each xchg loads a pointer from its argument slot and parks the caller's
; register value in that same slot, so the slots double as save area.
xchg edi, [esp][4*0] ; edi = dest, slot now holds caller's edi
xchg esi, [esp][4*1] ; esi = src, slot now holds caller's esi

rep movsd ; DWORD part
mov ecx, eax
rep movsb ; 0..3 remaining bytes
pop edi ; restore caller's edi from the former dest slot
pop esi ; restore caller's esi from the former src slot
jmp edx ; return; all three arguments are already off the stack
:grin: :grin: Faster & Smaller
Posted on 2002-07-08 19:14:07 by bitRAKE
cute :grin:
Posted on 2002-07-08 19:15:46 by f0dder
Since It was hot today, temperatures rising in the 90 F, I can't continue on my projects(Can't concentrate!!!), So I decided to try to code some ZeroMem...It's pretty lame BTW and "slow"??? :grin:
; ---------------------------------------------------------------
; ZeroMem - zero a block of memory using 8-byte MMX stores.
;
; Call sequence (the routine removes its own arguments):
;     push NumberOfBytesToZero
;     push OFFSET buffer
;     call ZeroMem
;
; The block is cleared from its end towards its start in 8-byte
; steps; the final 0..7 bytes at the start are cleared one by one.
; Clobbers eax, ecx, edx, mm0, flags.
; ---------------------------------------------------------------
ZeroMem:

    pop     edx                     ; return address
    pop     eax                     ; address in memory
    pop     ecx                     ; count

    pxor    MM0, MM0                ; 8 zero bytes

@@:
    sub     ecx, 8
    jb      @F                      ; fewer than 8 bytes remain
    movq    [eax+ecx], MM0
    jmp     @B

@@:
    add     ecx, 8                  ; undo the last subtract: ecx = 0..7
    jz      __exit

@@:
    dec     ecx
    mov     BYTE PTR [eax+ecx], 0   ; mov leaves the flags from dec intact
    jnz     @B

__exit:
    emms                            ; release the MMX state so the caller's
                                    ; x87 floating-point code keeps working
    jmp     edx                     ; return (arguments already popped)
Best suited for large memory, since it zeros a memory, 8 bytes per loop. :grin:


I forgot to add this(syntax):

push NumberOfBytesToZero
push OFFSET buffer
call ZeroMem
Posted on 2002-07-08 20:38:51 by stryker
OPTION PROLOGUE:NONE

OPTION EPILOGUE:NONE

memCOPY PROC Bytes:DWORD, Destination:DWORD, Source:DWORD
; (c) bitRAKE
; [email]Rickey.Bowers@attbi.com[/email]
; 27 bytes
;
; Copies Bytes bytes from Source to Destination, DWORDs first and then
; the 0..3 leftover bytes. The arguments are taken straight off the
; stack by popping; the parameter names in the PROC line are never
; referenced by the body. The routine removes its own arguments and
; returns with JMP. The direction flag is not touched here.
pop edx ; return address!
pop eax ; count
mov ecx, eax
xchg edi, [esp][4*0] ; dest (the slot now holds the caller's edi)
shr ecx, 2 ; ecx = whole DWORDs
xchg esi, [esp][4*1] ; src (the slot now holds the caller's esi)
and eax, 011y ; eax = count mod 4 (011y is binary 3)
rep movsd
mov ecx, eax
rep movsb
pop edi ; restore the caller's edi from the former dest slot
pop esi ; restore the caller's esi from the former src slot
jmp edx ; return; all three arguments are already off the stack
memCOPY ENDP

memZERO PROC Bytes:DWORD, Destination:DWORD
; (c) bitRAKE
; [email]Rickey.Bowers@attbi.com[/email]
; 22 bytes
;
; Zeroes Bytes bytes at Destination, DWORDs first and then the 0..3
; leftover bytes. The arguments are taken straight off the stack by
; popping; the parameter names in the PROC line are never referenced
; by the body. The routine removes its own arguments and returns with
; JMP. The direction flag is not touched here.
pop edx ; return address!
pop ecx ; count
xor eax, eax ; fill value
xchg edi, [esp][4*0] ; dest (the slot now holds the caller's edi)
push ecx ; keep the full count for the byte tail
shr ecx, 2 ; ecx = whole DWORDs
rep stosd
pop ecx
and ecx, 011y ; ecx = count mod 4 (011y is binary 3)
rep stosb
pop edi ; restore the caller's edi from the former dest slot
jmp edx ; return; both arguments are already off the stack
memZERO ENDP

OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF
:grin:
You can make the memory copy smaller with:
; (c) bitRAKE

; [email]Rickey.Bowers@attbi.com[/email]
; 24 bytes!
; Frameless copy body, 24-byte variant. The pop order below consumes the
; stack from the top down as: return address, count, dest, src.
; The routine removes its own arguments and returns with JMP.
pop edx ; return address
pop ecx ; count
pop eax ; dest
xchg esi, [esp] ; esi = src, slot now holds the caller's esi
xchg eax, edi ; edi = dest, eax = caller's edi
push eax ; save the caller's edi
push ecx ; keep the full count for the byte tail
shr ecx, 2 ; ecx = whole DWORDs
rep movsd
pop ecx
and ecx, 011y ; ecx = count mod 4 (011y is binary 3)
rep movsb
pop edi ; restore the caller's edi
pop esi ; restore the caller's esi from the former src slot
jmp edx ; return; all arguments are already off the stack
But I don't like it. In fact, I don't have a use for any of these algos.
Posted on 2002-07-08 21:44:21 by bitRAKE
The only reductions I have seen here are in stack overhead, REP MOVSD/B are doing the work and short of writing late model processor specific code with prefetch, none of these will time any faster. Yawn etc ... :tongue:

Regards,

hutch@movsd.com
Posted on 2002-07-09 07:47:42 by hutch--
But they're still cute :grin:
Posted on 2002-07-09 07:55:54 by f0dder
Well, maybe they aren't completely useless - think I'll put them in my collection of algos to build on the stack and execute - just to confuse some 'idiot fringe'.
Posted on 2002-07-09 09:08:10 by bitRAKE
If I can find the time, I'll code a 64 byte zeromem on each loop and see if it can go any faster, ha! ha! ha! :grin: or better yet use all MMX and SSE registers on one loop :)
Posted on 2002-07-09 09:10:48 by stryker

Four-F,
Why do you mark all of your posted source sampes (c) Every thing i seen by you got that hint. NaDa.... Save the hints...


Every thing? Really?... No. This one without (c). :tongue:
line number

What every first line should every good coder put in his listing?
...You know the answer. ;)
To be serious, don't bother about my (c).
Simply remove these two annoying lines and do with my code everything you like.
Sorry for causing trouble. I wrote it only for fun. Never mind.

to bitRAKE
Nice solution. Did not think about args swapping.
Please don't sneer at me. No more (c) from me from now.
Posted on 2002-07-09 09:33:48 by Four-F
Four-F, don't take it so hard - we are just having fun here. I just thought it funny that you '(c)' your algo. Guess that was a joke as well, so it was good of me to laugh at it, or no? :grin:
Posted on 2002-07-09 09:58:45 by bitRAKE
; (c) stryker, 2002     ;Ala Four-F

; email: [email]6567693132407961686F6F2E636F6D[/email]

; ---------------------------------------------------------------
; ZeroMem - zero a block of memory, 64 bytes per loop pass, using
;           non-temporal MMX stores (MOVNTQ).
;
; Call sequence (the routine removes its own arguments):
;     push NumberOfBytesToZero
;     push OFFSET buffer
;     call ZeroMem
;
; The block is cleared from its end towards its start in 64-byte
; steps; the final 0..63 bytes at the start are cleared one by one.
; Clobbers eax, ecx, edx, mm0, flags.
; ---------------------------------------------------------------
ZeroMem:
    pop     edx                     ; return address
    pop     eax                     ; address in memory
    pop     ecx                     ; count
    pxor    MM0, MM0                ; 8 zero bytes
@@:
    sub     ecx, 64
    jb      @F                      ; fewer than 64 bytes remain
    movntq  [eax+ecx], MM0
    movntq  [eax+ecx+8], MM0
    movntq  [eax+ecx+16], MM0
    movntq  [eax+ecx+24], MM0
    movntq  [eax+ecx+32], MM0
    movntq  [eax+ecx+40], MM0
    movntq  [eax+ecx+48], MM0
    movntq  [eax+ecx+56], MM0
    jmp     @B
@@:
    add     ecx, 64                 ; undo the last subtract: ecx = 0..63
    jz      __exit
@@:
    dec     ecx
    mov     BYTE PTR [eax+ecx], 0   ; mov leaves the flags from dec intact
    jnz     @B
__exit:
    sfence                          ; make the non-temporal stores globally
                                    ; visible before returning
    emms                            ; release the MMX state for x87 code
                                    ; (FEMMS is the AMD 3DNow! alternative)
    jmp     edx                     ; return (arguments already popped)
Does this go faster??? :grin:
Posted on 2002-07-09 10:10:44 by stryker
stryker, don't you have to use SFENCE/EMMS?
I'm not saying it wouldn't work. :grin:
; ---------------------------------------------------------------
; ZeroMem - zero a block of memory: 64 bytes per loop pass with
;           non-temporal MMX stores, then an unrolled byte tail
;           entered through a computed jump.
;
; Call sequence (the routine removes its own arguments):
;     push NumberOfBytesToZero
;     push OFFSET buffer
;     call ZeroMem
;
; The main loop clears whole 64-byte chunks from the end of the
; block downwards, which leaves r = count mod 64 bytes at the very
; start of the block for the tail.
; Clobbers eax, ecx, edx, mm0, flags.
; ---------------------------------------------------------------
ZeroMem:

    pop     edx                     ; return address
    pop     eax                     ; address in memory
    pop     ecx                     ; count
    pxor    MM0, MM0                ; 8 zero bytes
    jmp     _1

    ALIGN 8

_0:
    movntq  [eax+ecx], MM0
    movntq  [eax+ecx+8], MM0
    movntq  [eax+ecx+16], MM0
    movntq  [eax+ecx+24], MM0
    movntq  [eax+ecx+32], MM0
    movntq  [eax+ecx+40], MM0
    movntq  [eax+ecx+48], MM0
    movntq  [eax+ecx+56], MM0
_1:
    sub     ecx, 64
    jae     _0                      ; at least 64 bytes were left

    ; Here ecx = r - 64 with r = 0..63 bytes still to clear at [eax .. eax+r-1].
    ; The table below holds 63 byte stores for displacements 63 down to 1, each
    ; encoded in exactly 4 bytes (opcode, ModRM, disp8, imm8). Entering it
    ; 4*r bytes before _END executes exactly the stores for displacements
    ; r..1. EAX is biased by -1 so that those displacements address the
    ; original bytes r-1..0; the bias also keeps every displacement non-zero,
    ; which is what keeps all table entries the same length.
    add     ecx, 64                 ; ecx = r
    dec     eax                     ; bias the base by one byte
    neg     ecx                     ; ecx = -r
    lea     ecx, [_END][ecx*4]      ; entry point = _END - 4*r
    jmp     ecx

    ALIGN 8

    i=64
    REPEAT 63
    i=i-1
    mov     BYTE PTR [eax+(i)],0    ; must stay a 4-byte instruction
    ENDM

_END:
    sfence                          ; make the non-temporal stores visible
    emms                            ; release the MMX state for x87 code
    jmp     edx                     ; return (arguments already popped)
Maybe this would work?
Posted on 2002-07-09 10:17:28 by bitRAKE