; ---------------------------------------------------------------------------
; szInsertString2 - insert lpString2 into lpString1 at byte offset index.
;
; Syntax / ABI: MASM, 32-bit; the proc ends with a bare RET and relies on the
;               prologue/epilogue MASM generates for PROC with USES/parameters.
;
; In:   lpString1 - buffer holding len1 characters followed by a terminator
;       lpString2 - the len2 bytes to insert (its terminator is not copied)
;       len1      - length of String1, terminator not counted
;       len2      - number of bytes to insert
;       index     - insertion offset; assumed 0 <= index <= len1
; Out:  nothing meaningful in eax
; Clobbers: eax, ecx, edx, flags (ebx, esi, edi are saved by USES)
;
; Method: the tail String1[index..len1] (terminator included) is moved up by
; len2 bytes, highest address first so the overlapping move is safe, then
; String2 is copied into the gap.
;
; The tail move is split in two so that nothing below the insertion point is
; ever written:
;   head  - the 0..3 bytes from the insertion point up to the next 4-byte
;           boundary, moved one byte at a time
;   block - everything from that boundary on, moved as 8-byte chunks read
;           from 4-byte aligned addresses
;
; NOTE: whole chunks are moved, so up to 7 bytes beyond the terminator are
; read from and written to the buffer. The buffer must be at least
; len1 + len2 + 8 bytes long.
; ---------------------------------------------------------------------------
szInsertString2 proc uses ebx esi edi lpString1,lpString2,len1,len2,index

    mov esi,lpString1
    mov ecx,len1
    mov eax,index

    add esi,eax             ; esi = insertion point
    sub ecx,eax             ; ecx = tail length (terminator not counted)

    mov ebx,esi
    neg ebx
    and ebx,3               ; ebx = head length = bytes up to next 4-byte boundary

    sub ecx,ebx             ; ecx = tail length beyond the head (may be negative)
    add esi,ebx             ; esi = aligned source of the block move

    mov edi,len2
    add edi,esi             ; edi = destination of the block move

    sar ecx,3               ; ecx = index of the highest 8-byte chunk
    js @head                ; negative: the tail and its terminator fit in the head

    ; Block move, highest chunk first. Both halves of a chunk are loaded
    ; before either is stored, so a chunk may overlap its own destination.
@@: mov eax,[esi][ecx*8][4]
    mov edx,[esi][ecx*8]
    mov [edi][ecx*8][4],eax
    mov [edi][ecx*8],edx
    dec ecx
    jns @B

@head:
    ; Head move, still descending: esi/edi walk down from the aligned
    ; boundary to the insertion point / insertion point + len2.
    test ebx,ebx
    je @insert
@@: dec esi
    dec edi
    mov al,[esi]
    mov [edi],al
    dec ebx
    jne @B

@insert:
    ; Here esi = String1 + index on every path.
    mov edi,esi             ; edi = String1 + index
    mov ecx,len2
    mov esi,lpString2
    mov edx,ecx

    and ecx,not 3           ; ecx = bytes that can be copied as whole dwords
    je @bytes
    add edi,ecx
    add esi,ecx
    neg ecx                 ; count up from -n to 0 so the loop needs no cmp
@@: mov eax,[esi][ecx]
    mov [edi][ecx],eax
    add ecx,4
    jne @B

@bytes:
    and edx,3               ; edx = 0..3 bytes left over
    je @exit
    add esi,edx
    add edi,edx
    neg edx
@@: mov al,[esi][edx]
    mov [edi][edx],al
    inc edx
    jne @B
@exit:
    ret

szInsertString2 endp
Posted on 2002-02-03 23:20:21 by The Svin
; ---------------------------------------------------------------------------
; szInsertString3 - frameless byte-copy loop taking the same five stack
; arguments as szInsertString2 (lpString1, lpString2, len1, len2, index).
;
; No prologue is generated (PROC without parameters), so on entry [esp] is
; the return address and the arguments sit ABOVE it at [esp + 4*n]:
;   [esp + 4*1] lpString1   [esp + 4*2] lpString2   [esp + 4*3] len1
;   [esp + 4*4] len2        [esp + 4*5] index
;
; What it does: copies len2 + 1 bytes from String1 + index to String2
; (the dec/jns pair runs the loop until the counter goes negative), then
; returns and removes the five arguments from the stack.
;
; Clobbers: al, ecx, edx, flags, and the len2 argument slot on the stack.
;
; NOTE(review): despite the name, this does not shift the tail of String1 and
; the data flows from String1 to String2. It looks like a sketch posted to
; discuss call overhead rather than a finished insert - confirm the intent
; before using it.
; ---------------------------------------------------------------------------
szInsertString3 PROC ; lpString1,lpString2,len1,len2,index

    mov ecx,[esp + 4*1]          ; ecx = lpString1
    mov edx,[esp + 4*2]          ; edx = lpString2
    add ecx,[esp + 4*5]          ; ecx = lpString1 + index
@@: mov al,[ecx]
    inc ecx
    mov [edx],al
    inc edx
    dec dword ptr [esp + 4*4]    ; len2 counter lives in its argument slot
    jns @B
    ret 4*5                      ; callee pops the five dword arguments
szInsertString3 ENDP
How do you think this will perform in an actual program? Are you going to have many large strings in the cache? If the strings are small, doesn't the overhead of the PROC eliminate the advantages? Are you waiting on data, or taking longer to do a short operation?
Posted on 2002-02-04 00:10:24 by bitRAKE
Place the dec at the beginning of the loop
and change the condition to jne.
The answers to most of your questions are:
1. Tests.
2. Writing programs which will have parts that REALLY need
speed. For example, replacing strings in editor buffers, files, databases, etc.

Some thoughts make me uncomfortable, and I want to share them with you. Assembler programming, in the hands of a talented and experienced programmer, can give the best speed performance.
Then why don't I see (except from game programmers) demos with code that performs really big tasks where speed is really needed?
Ernie once mentioned such a task (as I recall, it was about searching files for certain strings), but I never saw the code.

You say you are doing research on algorithms.
I can give you some real (and very useful) common tasks.
For example: find duplicate files on all hard drives,
and arrange the information about them in a one-to-many relation:
a table of distinct names of duplicated files related to a table of their instances, with each instance's location, size, dates,
etc.
It is just an example of a REAL task.
But only the implementation of this kind of task can give you a real feeling for
the importance of speed and the importance of what you are keen on - I mean algorithms, math, and knowing the machine.

To business.
A dec on memory needs three stages of loading and decoding and 2 clocks to execute before the flag is set. The BTB prediction for the jcc is verified before the last
stage (the write). The jcc therefore has to wait each time until the execution stage of
the dec mem is finished before it can be verified. If you place the dec at the beginning of the loop
and change the inc instructions to lea reg (lea does not modify the flags), the jcc will execute without any
delay.
Posted on 2002-02-04 01:15:47 by The Svin
Some improvements:


; ---------------------------------------------------------------------------
; szInsertString2 (MMX variant) - insert lpString2 into lpString1 at byte
; offset index. Requires a CPU with MMX.
;
; Syntax / ABI: MASM, 32-bit; the proc ends with a bare RET and relies on the
;               prologue/epilogue MASM generates for PROC with USES/parameters.
;
; In:   lpString1 - buffer holding len1 characters followed by a terminator
;       lpString2 - the len2 bytes to insert (its terminator is not copied)
;       len1      - length of String1, terminator not counted
;       len2      - number of bytes to insert
;       index     - insertion offset; assumed 0 <= index <= len1
; Out:  nothing meaningful in eax
; Clobbers: eax, ecx, edx, mm0, flags (ebx, esi, edi are saved by USES)
;
; Method: the tail String1[index..len1] (terminator included) is moved up by
; len2 bytes, highest address first so the overlapping move is safe, then
; String2 is copied into the gap.
;
; The tail move is split in two so that nothing below the insertion point is
; ever written and the whole tail is always covered:
;   head  - the 0..7 bytes from the insertion point up to the next 8-byte
;           boundary, moved one byte at a time
;   block - everything from that boundary on, moved as 8-byte MOVQ chunks
;           read from 8-byte aligned addresses
;
; NOTE: whole chunks are moved, so up to 7 bytes beyond the terminator are
; read from and written to the buffer. The buffer must be at least
; len1 + len2 + 8 bytes long.
; ---------------------------------------------------------------------------
szInsertString2 proc uses ebx esi edi lpString1,lpString2,len1,len2,index

    mov esi,lpString1
    mov ecx,len1
    mov eax,index

    add esi,eax             ; esi = insertion point
    sub ecx,eax             ; ecx = tail length (terminator not counted)

    mov ebx,esi
    neg ebx
    and ebx,7               ; ebx = head length = bytes up to next 8-byte boundary

    sub ecx,ebx             ; ecx = tail length beyond the head (may be negative)
    add esi,ebx             ; esi = aligned source of the block move

    mov edi,len2
    add edi,esi             ; edi = destination of the block move

    sar ecx,3               ; ecx = index of the highest 8-byte chunk
    js @head                ; negative: the tail and its terminator fit in the head

    ; Block move, highest chunk first.
@@: movq mm0,[esi][ecx*8]
    movq [edi][ecx*8],mm0
    dec ecx
    jns @B
    emms                    ; leave MMX state so later x87 code works

@head:
    ; Head move, still descending: esi/edi walk down from the aligned
    ; boundary to the insertion point / insertion point + len2.
    test ebx,ebx
    je @insert
@@: dec esi
    dec edi
    mov al,[esi]
    mov [edi],al
    dec ebx
    jne @B

@insert:
    ; Here esi = String1 + index on every path.
    mov edi,esi             ; edi = String1 + index
    mov ecx,len2
    mov esi,lpString2
    mov edx,ecx

    and ecx,not 3           ; ecx = bytes that can be copied as whole dwords
    je @bytes
    add edi,ecx
    add esi,ecx
    neg ecx                 ; count up from -n to 0 so the loop needs no cmp
@@: mov eax,[esi][ecx]
    mov [edi][ecx],eax
    add ecx,4
    jne @B

@bytes:
    and edx,3               ; edx = 0..3 bytes left over
    je @exit
    add esi,edx
    add edi,edx
    neg edx
@@: mov al,[esi][edx]
    mov [edi][edx],al
    inc edx
    jne @B
@exit:
    ret

szInsertString2 endp
Posted on 2002-02-08 09:09:09 by The Svin