I just continue the old Hutch's thread:
http://www.asmcommunity.net/board/showthread.php?threadid=6368&highlight=instring
    

; InString searches (case sensitive) for a substring
; in a larger string and if it is found, it returns its
; address in eax.
; If the substring is not found, it returns zero in eax.
; Usage: push offset Pattern
; push offset Source
; call InStringL
; [B]Caution:[/B]This code is hard to read
;
OPTION PROLOGUE:NONE ; turn it off
OPTION EPILOGUE:NONE ;
Align 16 ; Align 16 before the proc
InStringL proc lpSource:DWORD, lpPattern:DWORD
db 3Eh ; ds: prefix
mov eax, [esp+8] ; ecx->lpPattern
sub esp, 2*4 ; room to save registers and lenght of substring
db 3Eh ;
mov edx, [esp+2*4] ; edx-> return address
db 3Eh ;
mov ecx, [eax] ; get dword from substring
db 3Eh ;
mov [esp+2*4], ebx ; save register ebx
movzx ebx, cl ; ebx->the 1st byte of substring
db 3Eh ;
mov [esp+4*4], edx ; save return address
imul ebx, 1010101h ; ebx=77 77 77 77h ; 77h -> ASCII code of "w"
db 3Eh ; (if the 1st byte of substring is "w")
mov [esp], esi ;
lea esi, [eax-1] ; esi-> lpPattern-1
mov [esp+1*4], edi ; save register edi
mov edx, 80808080h ;
mov edi, [esp+3*4] ; edi-> lpSource
mov [esp+3*4], ebp ; save register ebp
mov ebp, 0FEFEFEFFh ; ebp=0FEFEFEFFh
LoopS1: ; my strlen with substring
add eax, 4 ; ecx->lpPattern
add ecx, ebp ; ebp=0FEFEFEFFh
test edx, ecx ;
mov ecx, [eax] ; get dword from substring
je LoopS1 ; 2 clocks per 4 bytes
cmp byte ptr [eax-4], 0 ;
je S1_minus4 ;
cmp byte ptr [eax-4+1], 0 ;
je S1_minus3 ;
cmp byte ptr [eax-4+2], 0 ;
je S1_minus2 ;
cmp byte ptr [eax-4+3], 0 ;
jne LoopS1 ; if not zeroes loop again
sub eax, 2 ;
S1: ;
mov ebp, [edi] ;
sub eax, esi ; end my strlen with substring
mov ecx, -4 ;
push eax ; save len of substring in variable
BytesScan: ;
db 3Eh ; ds: prefix
lea edx, [ebp-1010101h] ; searching the 1st byte of substring or/and zero
xor ebp, ebx ; ebx=77 77 77 77h ; 77h -> ASCII code of "w"
db 3Eh ; (if the 1st byte of substring is "w")
add edi, 4 ;
sub ebp, 1010101h ;
or edx, ebp ; testing the 1st byte of substring and 0
db 3Eh ; simultaneously in the larger string
mov ebp, [edi] ;
and edx, 80808080h ;
je BytesScan ; 4 clocks per 4 bytes
SrchNextByte: ;
cmp [edi+ecx], bl ; bl = 77h -> ASCII code of "w
je StartCmp ; (if the 1st byte of substring is "w")
cmp byte ptr [edi+ecx], 0 ; is it the end of the larger string?
je ExitP ; exit
ToNext: ;
inc ecx ; ecx-> -4 to 0
jne SrchNextByte ;
mov ebp, [edi] ; restoring ebp for ByteScan
sub ecx, 4 ; ecx = -4
jc BytesScan ; loop again
align 16 ;
S1_minus4: ;
db 3Eh ; ds: prefix
sub eax, 5 ;
jno S1 ;
S1_minus3: ;
sub eax, 4 ;
jno S1 ;
S1_minus2: ;
sub eax, 3 ;
jno S1 ;
StartCmp: ; comparing next bytes from substring
mov edx, [esp] ; edx-> len of substring
lea ebp, [edi+ecx] ; ebp=edi+ecx -> as a base register
movzx eax, byte ptr [esi+edx] ; comparing rest bytes
dec edx ;
je ExitP ; exit
CmpNext: ;
cmp [ebp+edx], al ; ebp->lp to current 1st byte in the larger string
jne ToNext ;
movzx eax, byte ptr [esi+edx] ; get next byte
dec edx ;
jne CmpNext ; 2 clocks per byte
ExitP: ;
mov esi, [esp+1*4] ; restoring register esi
cmp edx, 1 ; if edx= 0 mask = 0FFFFFFFFh else mask=0
sbb eax, eax ; eax->mask 0 or 0FFFFFFFFh
mov edi, [esp+2*4] ; restoring register edi
and eax, ebp ; eax->lp 1st occurrence in large string or zero
mov ebx, [esp+3*4] ; restoring register ebx
mov ebp, [esp+4*4] ; restoring register ebp
add esp, 5*4 ; restoring register esp
ret ; faster return then ret 2*4
InStringL endp ;
OPTION PROLOGUE:PROLOGUEDEF ; turn back on the defaults
OPTION EPILOGUE:EPILOGUEDEF ;
[B]Note:[/B]
Seven instructions after imul (4 clocks) are executed "for free" because they are
in the 4 clocks delay of imul ebx, 1010101h and don't depend on imul instruction.


Regards,
Lingo
Posted on 2003-06-01 17:37:37 by lingo12

InStringL proc lpSource:DWORD, lpPattern:DWORD
db 3Eh ; ds: prefix
mov eax, ; ecx->lpPattern
sub esp, 2*4 ; room to save registers and lenght of substring
db 3Eh ;
mov edx, ; edx-> return address
db 3Eh ;
mov ecx, ; get dword from substring
db 3Eh ;
mov , ebx ; save register ebx
movzx ebx, cl ; ebx->the 1st byte of substring
db 3Eh ;
mov , edx ; save return address
imul ebx, 1010101h ; ebx=77 77 77 77h ; 77h -> ASCII code of "w"
db 3Eh ; (if the 1st byte of substring is "w")
mov , esi ;

My God.. the code looks totally bloated and overkill just to find the position of a substring into e.g. a 10 characters long string. And why do you slow down everything with all those totally unnecessary 3E prefixes? Somebody oughta tell you that Win32 uses a flat memory system.. where DS already = ES that already = SS. No need for those silly prefixes.
Posted on 2003-06-02 02:42:26 by Kyle Katarn
He is aligning to code so that the loop is in the code cache if I am not wrong.

Coded for speed, not size I suppose.
Posted on 2003-06-02 03:04:39 by roticv
Kyle,
".. just to find the position of a substring into e.g. a 10 characters long string."
Emotional and wrong! The length of long string isn't limited

"And why do you slow down everything.."
It is the fastest InString I ever seen
If you know faster pls don't hesitate to open my eyes

"No need for those silly prefixes"
http://www.asmcommunity.net/board/showthread.php?threadid=13541

Regards,
Lingo
Posted on 2003-06-02 05:26:33 by lingo12
Here is the shortest InString:


OPTION PROLOGUE:NONE ; turn it off
OPTION EPILOGUE:NONE ;
InStringS proc lpSource:DWORD, lpPattern:DWORD
pop eax ; ret address
pop ecx ; ecx-> lpSource
pop edx ; edx-> lpPattern
push eax ; ret address
push edi ; save edi
push esi ; save esi
push edx ; edx-> lpPattern
mov edi, ecx ; edi-> lpSource
mov esi, edx ; esi-> lpPattern
call StrLen
push eax ; save eax-> len of lpPattern
push eax ; save eax-> len of lpPattern
push edi ; edi-> lpSource
call StrLen
pop edx ; edx-> len of lpPattern
inc eax ; eax-> len of lpSource+1
mov ch, [esi] ; ch = 1st byte of substring
dec esi ; esi-> lpPattern -1
sub eax, edx ;
add edi, eax ; edi-> end of lpSource +1
neg eax ;
SrchNextByteS:
cmp [edi+eax], ch ; ch = 1st byte of substring
jne NextS
pop edx ; edx-> len of substring
add eax, edi ; eax-> base register
push edx ; edx-> len of substring
CmpNextS:
mov cl, [esi+edx] ; comparing rest bytes
dec edx ; edx-> len of substring
je ExitS ; exit
cmp [eax+edx], cl ; eax-> current offset in lpSource
je CmpNextS ; at 1st byte of substring
sub eax, edi ; eax-> negative index register
NextS:
inc eax
jl SrchNextByteS
ExitS:
pop ecx ; ecx-> len of substring
pop esi
pop edi
ret ; 64 bytes
InStringS ENDP
OPTION PROLOGUE:PROLOGUEDEF ; turn back on the defaults
OPTION EPILOGUE:EPILOGUEDEF ;


Regards,
Lingo
Posted on 2003-06-10 06:29:58 by lingo12
I'm just curious.
Why do you keep using DS prefix for ESP? ESP is relative to SS. If, by any chance, DS != SS, (for whatever the reason is) then your function will work on something different from what are passed.
And, it would be nice to see some speed test result other than just claiming "It is the fastest InString I ever seen".
Posted on 2003-06-10 16:09:50 by Starless
Starless,
"Why do you keep using DS prefix for ESP?"

Because the labels LoopS1, BytesScan, SrchNextByte and StartCmp
should be aligned to addresses divisible by 16

"Instructions with a memory operand can be made one byte longer with a SIB byte, but the easiest way of making an instruction one byte longer is to add a DS: segment prefix (DB 3Eh). The microprocessors generally accept redundant and meaningless prefixes (except LOCK) as long as the instruction length does not exceed 15 bytes. Even instructions without a memory operand can have a segment prefix. So if you want the DEC ECX instruction to be 2 bytes long, write:
DB 3Eh
DEC ECX" by A.Fog



"ESP is relative to SS. If, by any chance, DS != SS, (for whatever the reason is) then your function will work on something different from what are passed."

"Somebody oughta tell you that Win32 uses a flat memory system.. where DS already = ES that already = SS." by Kyle Katarn

"...but I would consider it safe to use a segment prefix with any instruction. " by A.Fog



"And, it would be nice to see some speed test result..."
Just do it...

Regards,
Lingo
Posted on 2003-06-11 04:57:23 by lingo12
cs = ds = ss, its a safe assumption for most versions of windows, but there are versions of windows which may not (datacenter 2K for example). In order to get past the 4GB limit on database machines Intel provides PAE to get around the 32bit address limit (effectivly providing a 36 bit address, as some of the bits of the segment selectors are used for other things).

It'll work as is for us ordinary folk with ordinary versions of windows though...

Mirno
Posted on 2003-06-11 13:00:13 by Mirno
BTW alignment here can be done without prefixes or
additinal instruction.
Just using different coding in modrm and sib and zero displacements
Posted on 2003-10-09 07:44:44 by The Svin
cs = ds = ss

I guess you meant base of the segments, not values (selectors). Values (selectors) of cs and ds/ss is different in
CPL 3, and all of them are different in CPL 0.
Posted on 2003-10-09 07:47:15 by The Svin
I'll try to explain my point in
opcode demonstration.
For example in lingo proc there is mnemonic:
mov ecx,
Actually the mnemonic can be encoded
different ways and wich is more important
in this topic context - with different resulting
size, without using prefixes of segement
redefinishing (wich can force to
reload shadow registers and thus loosing clocks)

Compilers usually use optimum (in size hence)
of optional encoding.
in case mov ecx, it's 2 bytes:
8B 08
08 - is modr/m byte 00 001 000
00 - mod (memr = address pointer without displacement)
001 - ecx
000 -

We can encode it one byte longer (3 bytes total) two ways:
1. Using SIB without index:
8Bh:00 001 100(sing of SIB):00 100 (no index) 000()
In hex it:
8B 0C 20
And it's also mov ecx,
2. Other way without SIB but with 1 byte displacement:
8Bh:01 (displacement 1 byte) 001 000y:00h
in HEX:
8B 48 00


We can encode it 2 bytes longer (4 bytes total) using SIB + displacement 1 byte:

8B:01 001 100:00 100 000:00h
in hex
8B 4C 20 00

We can encode it 4 bytes longer without SIB, and with displacement 4 bytes:

8B:10 (displacement full size) 001 000:00000000h
in hex
8B 88 00 00 00 00

And finally we can encode it 5 bytes longer with SIB and displacement 4 bytes:

8B:10 001 100: 00 100 000: 00000000h

in hex
8B 8C 20 00 00 00 00

The same approach can be taken (different size of displacemt part and
redandent usage of SIB) with many other opcodes here.
For example lea esi, can be coded 3 bytes longer if mod set to 10 and
additinal 3 FFh placed in displacement part, and so on.
Posted on 2003-10-09 13:29:48 by The Svin

Just using different coding in modrm and sib and zero displacements
True, but I think the choice of segment overrides is to stress a different part of the CPU than is done by other methods. Additionally, it is what the optimization manual suggests. Mainly for P3+/K7+.
Posted on 2003-10-09 19:56:08 by bitRAKE
Lingo12,


db 3Eh ; ds: prefix
mov eax, ; ecx->lpPattern


When you need the DS: only for stretch the instruction, use rather


mov eax, ds: ; ecx->lpPattern


, or better


mov eax, ss: ; ecx->lpPattern


. It's much better readable :)
Posted on 2003-10-09 21:38:37 by MazeGen

True, but I think the choice of segment overrides is to stress a different part of the CPU than is done by other methods. Additionally, it is what the optimization manual suggests. Mainly for P3+/K7+.


To make clear my point I would devide
Intel docs into two logical parts:
1. Description of architecture, inner parts and
algos placed in internal units.
2. Solutions and suggestions.

The First part in subject of most close studing.
The Second is nothing but ones of possible and not always
optimal solutions. People of Intel has the only advantage
in solution searching against your own brain - they
know data of inners better, since you carefully read their
description of the data - you can find better solution
then Intel people can offer.

I can remind you as an example the other thread when
I myself initially knew data a litlle bit better than
others yet my solutions weren't the best ones.

In other thread we discussed Reg code to string optimal
size solution. Practically all encoding part descriptions
in details came from me, yet solutions of other including
you were better.
Especially in the case our freind Poimander was interesting to observe
in his development, - 'cause for the first several posts
he couldn't get quite well what was the problem about :)
And despite of the fact of being not "au fait" initially -
he finally offered lots of interesting ideas and one of
two versions of shortest code.
And me with all my deeper understanding - just appeared to
find myself as deep outsider :) Some of my ideas were accepted,
others - replaced with better ones.

The same about Intel docs - description part is most valubale,
as to solution - .. well, it's just one of possible.
And not nesserilly the best.
At least about the Intel Software IMHO - it's one of the slowest :)
So IMHO, it's worthy to take Intel notes about cach lines and alignments,
but treat their solution as one of many possible.

I guess they offered just easier to encode for HLL programmer way,
since for most people manual binary encoding is just too much.
And no tool available to offer different size encoding.
I guess, I could write one if needed.
Kindda - input mnemonic - output several optional ways to encode it
with different size. Might be more - encode with automatic
choosing sizes to align to given value.
Posted on 2003-10-09 22:20:07 by The Svin

. It's much better readable

Might be it's better readable, but lingo gave sources
to be compiled, not just psuedo-asm-algo, AFAIK he USES
MASM, and MASM would leave out (ommit) those prefixes
optimizing for size. As the result label wouldn't be aligned
thus lingo placed prefixes explicitly as data byte to force
MASM include them.
Posted on 2003-10-09 22:37:27 by The Svin
Originally posted by The Svin
The Second is nothing but ones of possible and not always
optimal solutions. People of Intel has the only advantage
in solution searching against your own brain - they
know data of inners better, since you carefully read their
description of the data - you can find better solution
then Intel people can offer.
IMHO, it is in Intel's best interest to produce clear documents about the function of their processors, but the problem lies in the level of detail in those documents. To protect their intests we only get a rehashed interpretation of the fine detail within the chip.

Especially in the case our freind Poimander was interesting to observe
in his development, - 'cause for the first several posts
he couldn't get quite well what was the problem about :)
This was quite interesting to see and my respect for him developed as well.

At least about the Intel Software IMHO - it's one of the slowest :)
So IMHO, it's worthy to take Intel notes about cach lines and alignments,
but treat their solution as one of many possible.
Our understanding of the world is a constantly changing process and nothing be so rigid to not change in time.

I guess they offered just easier to encode for HLL programmer way,
since for most people manual binary encoding is just too much.
Dispite the desires of many the brain does not think in numbers.
Posted on 2003-10-10 00:06:01 by bitRAKE

cs = ds = ss, its a safe assumption for most versions of windows, but there are versions of windows which may not (datacenter 2K for example) [...]


It sounds really interesting, Mirno. Could you post more info or some reference about it? A man hears only about flat mode again and again...



Might be it's better readable, but lingo gave sources
to be compiled, not just psuedo-asm-algo, AFAIK he USES
MASM, and MASM would leave out (ommit) those prefixes
optimizing for size. As the result label wouldn't be aligned
thus lingo placed prefixes explicitly as data byte to force
MASM include them.


Thanks, I didn't know it before...
Posted on 2003-10-10 13:47:02 by MazeGen
Hello,


I just continue the old Hutch's thread:
http://www.asmcommunity.net/board/showthread.php?threadid=6368&highlight=instring
    

; InString searches (case sensitive) for a substring
; in a larger string and if it is found, it returns its
; address in eax.
; If the substring is not found, it returns zero in eax.
; Usage: push offset Pattern
; push offset Source
; call InStringL
; [B]Caution:[/B]This code is hard to read



Very nice work. It is very fast and i want ask you, if i can use it in my string lib.
There is a little difference between my ASMSTRING and a C-String, because my ASMSTRING contains the size of the string at OFFSET PTR -4 (grows dynamically).

Until now is used the following source, because it seems faster then InStringx:



PosLeft proc String: ASMSTRING, Pattern: ASMSTRING
MOV EAX, Pattern
MOV EDX, String

TEST EAX, EAX
JE @@noWork ;{ SubStr = nil }

TEST EDX, EDX
JE @@StrEmpty ;{ S = nil }

PUSH EBX
PUSH EDI
PUSH ESI
MOV ESI, EAX ;{ ESI = Pattern }
MOV EDI, EDX ;{ EDI = String }
MOV EAX, 1 ;{ StartPosition }

@@SkipAdjust:
DEC EAX ;{ first char should be compared }
MOV ECX, [EDI-4] ;{ ECX = Length(S) }
MOV EDX, [ESI-4] ; { Length(SubStr }
SUB ECX, EDX
JS @@fail ; { Length(SubStr) > (Length(S) - StartPos) }
INC ECX
PUSH EDI ; { remember s position to calculate index }
LEA EDI, [EDI+EAX] ; { I := StartPos }
MOV AL, [ESI] ; { SubStr[1] }
JMP @@RepneScasb

@@PrepareRepeCmpsb:
MOV EBX, ECX
MOV ECX, EDX ; { Length(SubStr) }

@@RepeCmpsb:
DEC ECX
JZ @@found ; { Length(SubStr) = 0 }

MOV AH, [ESI+ECX]
CMP AH, [EDI+ECX]
JZ @@RepeCmpsb ; { S[I] <> SubStr[A] -> next char and Inc(I)
; until S[I] = SubStr[1] }
MOV ECX, EBX
INC EDI
DEC ECX
JZ @@failWithPop ; { string cant be found -> end of string reached
; { and outer @@RepneScasb }
@@RepneScasb:
CMP AL, [EDI]
JZ @@PrepareRepeCmpsb ; { SubStr[A] = S[I] }
INC EDI ; { Inc(I) }
DEC ECX
JNZ @@RepneScasb ; { I <> Length(S) }
JMP @@failWithPop ; { I = Length(S) and S[I - 1] <> SubStr[1] }

@@StrEmpty:
XOR EAX, EAX
JMP @@noWork

@@found:
POP EDX
MOV EAX, EDI
SUB EAX, EDX
INC EAX
JMP @@exit

@@failWithPop:
POP EDX

@@fail:
XOR EAX, EAX

@@exit:
POP ESI
POP EDI
POP EBX

@@noWork:

ret
PosLeft endp


But your code was ca. 15%-20% faster then mine (on my PIV) :-).
Do you think, it could be faster through alignement ( i try to learn ASM )?

Do your code have some limitations or there are situations, where it cannot be used?


MfG Manuel.
Posted on 2004-02-22 10:22:16 by other
other,

"It is very fast and i want ask you, if i can use it in my string lib."
"Do you think, it could be faster through alignement ( i try to learn ASM )? "

Yes

"Do your code have some limitations or there are situations, where it cannot be used?"

No

Regards,
Lingo
Posted on 2004-02-22 12:26:47 by lingo12
Hello,



other,

"It is very fast and i want ask you, if i can use it in my string lib."
"Do you think, it could be faster through alignement ( i try to learn ASM )? "

Yes

"Do your code have some limitations or there are situations, where it cannot be used?"

No

Regards,
Lingo


Thats great. Thank you very much :-).


MfG Manuel.
Posted on 2004-02-22 13:23:39 by other