Hi all

Any faster?


AsciiToDw proc uses edx esi,lpAscii:DWORD

mov esi,lpAscii
xor eax,eax
xor edx,edx
@@:
mov dl,[esi]
sub dl,'0'
jb @f
cmp dl,10
jnb @f
lea eax,[eax*4+eax] ;Multiply by 5
lea eax,[eax*2+edx] ;Multiply by 2 and add digit in edx
inc esi
jmp @b
@@:
ret

AsciiToDw endp

KetilO
Posted on 2002-08-10 17:56:27 by KetilO
Did you look at xlat/xlatb?

Did you look at cwd/cwde?
Posted on 2002-08-10 18:59:08 by Roy Cline
Please tell more.

KetilO
Posted on 2002-08-10 19:13:06 by KetilO
ignore it :)
xlat does no help for decimals.
I am not giving you another algo, I'll help you improve your own for better understanding.
First rule for fast algo - remove anithing that can be removed from main loop
1. Do sub "0" at the and of loop and change unconditinal jump to conditional
sub dl,"0"
jns @B

Do adds (leas) at the begining.
That will help you remove 1 instruction.
advantage = x -1 , where x is number of iterations.
Why -1? Cause you need to do jump to the
mov dl,
inc esi
sub dl,"0"
jns @B
at the begining
2. I don't understand the
cmp dl,10
If it is ASCIIZ then sub dl,"0" would be enough to determin if it is the end of the string (0)
So you can remove those 2 instructions too.
Or you expect some letter to be at the end of symbolic number?
If not your algo can look like:






AsciiToDw proc uses edx esi,lpAscii:DWORD

mov esi,lpAscii
xor eax,eax
xor edx,edx
jmp @1
@@:
lea eax,[eax*4+eax];Multiply by 5
lea eax,[eax*2+edx];Multiply by 2 and add digit in edx
@1:
mov dl,[esi]
inc esi
sub dl,"0"
jns @B ;or jnc
ret

AsciiToDw endp

Posted on 2002-08-10 20:05:12 by The Svin
hi ! :)

The Svin,

mov dl,
sub dl,'0'
jb @f
cmp dl,10
jnb @f

is equal to : if(dl>=0 && dl<10) ....

Ketilo test if it's a vailable number and stop convertion if it's not a number.
and for better optimization use 'add' and not 'inc', better pairing is more powerfull :grin:

Bye ! :)
Posted on 2002-08-10 20:56:57 by kylekiller
Looks like error testing done outside PROC?
Then: is it safe to assume one digit?
AsciiToDw proc uses edx ecx, lpAscii:DWORD

mov ecx, lpAscii
xor eax, eax
; use this line if not assume one digit and move INC ECX
; mov edx, '0'
movzx edx, BYTE PTR [ecx]
@@:
lea eax, [eax*4+eax]
inc ecx
lea eax, [eax*2+edx-'0']
movzx edx, BYTE PTR [ecx]
cmp edx, '9'+1
jnb @F
cmp edx, '0'
jnb @B
@@:
ret
AsciiToDw endp
I have 7 fast versions of atodw posted to this board. :)
Posted on 2002-08-10 21:12:49 by bitRAKE
With a little twist from The Svin's code, this will now support signed values.
[size=9]s_atodw:


mov edx, [esp+4]
cmp BYTE PTR [edx], "-"
je __negative

__convert:

xor eax,eax
xor ecx,ecx
jmp @1
@@:
lea eax, [eax*4+eax] ;Multiply by 5
lea eax, [eax*2+ecx] ;Multiply by 2 and add digit in edx
@1:
mov cl, [edx]
inc edx
sub cl, "0"
jns @B ;or jnc
retn 4

__negative:

inc edx
push edx
call __convert
neg eax
retn 4[/size]
or from Ketilo's
[size=9]s_atodw:


mov edx, [esp+4]
cmp BYTE PTR [edx], "-"
je __negative

__convert:

xor eax,eax
xor ecx,ecx

@@:
mov cl, [edx]
sub cl, '0'
jb @f
cmp cl, 10
jnb @f
lea eax, [eax*4+eax] ;Multiply by 5
lea eax, [eax*2+ecx] ;Multiply by 2 and add digit in edx
inc edx
jmp @b
@@:

retn 4

__negative:

inc edx
push edx
call __convert
neg eax
retn 4[/size]
or from bitRAKE's
[size=9]s_atodw:


mov edx, [esp+4]
cmp BYTE PTR [edx], "-"
je __negative

__convert:

xor eax,eax

movzx ecx, BYTE PTR [edx]
@@:
lea eax, [eax*4+eax]
inc edx
lea eax, [eax*2+ecx-'0']
movzx ecx, BYTE PTR [edx]
cmp ecx, '9'+1
jnb @F
cmp ecx, '0'
jnb @B
@@:

retn 4

__negative:

inc edx
push edx
call __convert
neg eax
retn 4[/size]
Sorry for the code bloat, but more functionality is good. :)
Posted on 2002-08-10 22:58:59 by stryker
This one lets you specify base. (base 2 up to base 36)
It is case insensitive. You can use "c1a5eb" or "C1A5EB". It won't matter.

atodw proc uses ecx edx esi, lpszAscii:DWORD, dwBase:DWORD


mov esi, lpszAscii
xor eax, eax
jmp _GetChar
_Alpha: add cl, 0Ah
_Numeric: cmp ecx, dwBase
jnc _Exit
mul dwBase
add eax, ecx
inc esi
_GetChar: movzx ecx, byte ptr [esi]
sub cl, 30h
cmp cl, 0Ah
jc _Numeric
sub cl, 11h
and cl, 0DFh
cmp cl, 1Ah
jc _Alpha
_Exit: ret

atodw endp
Posted on 2002-08-11 01:25:28 by iblis
Signed values conversion. Speed optimized.


; esi-string
cmp [byte esi+0],'-'+1
sbb ebx,ebx
adc esi,1
movzx eax,[byte esi-1]
sub eax,'0'
@@: movzx ecx,[byte esi+0]
movzx edx,[byte esi+1]
sub ecx,'0'
sub edx,'0'
cmp ecx,9
ja @W
cmp edx,9
ja @F
lea ecx,[ecx+4*ecx]
imul eax,100
lea edx,[edx+2*ecx]
add eax,edx
add esi,2
jmp @B
@@: lea eax,[eax+4*eax]
lea eax,[ecx+2*eax]
@W: add eax,ebx
xor eax,ebx
; eax-number


*Edit*: I find Athlon optimized version
Posted on 2002-08-11 02:15:29 by Nexo
Hi all

Assuming PROFILE can be trusted, the results are a bit odd.

KetilO
Posted on 2002-08-11 03:49:45 by KetilO
hi ! :)

i have optimize just a little bit the proc of bitRake :


mov ecx, lpAscii
xor eax, eax
xor edx,edx
; use this line if not assume one digit and move INC ECX
;mov edx, '0'
;this instruction decrease speed only here
;movzx edx, BYTE PTR
mov dl,BYTE PTR
@@:
lea eax,
inc ecx
lea eax,
movzx edx, BYTE PTR
xor edx, '0'
cmp edx, 10
jb @B
@@:



xor edx, '0'
cmp edx, 10
jb @B

is equivalent to

cmp edx, '9'+1
jnb @F
cmp edx, '0'
jnb @B

xor '0' with a number will always give a number <10 ;) :grin:

Bye alls ! :alright:
Posted on 2002-08-11 06:33:22 by kylekiller

Hi all

Assuming PROFILE can be trusted, the results are a bit odd.

KetilO


Hi KetilO,

PROFILE never gave me any problem. If one pretends to use it with MASM, however, this has some (MASM) limitations. Quoting some lines of the MASM translation of PROFILE.INC (it wasn't translated by me, by the way, and I never tested it):



ALIGN 4 ; align to a cache entry on all CPU's
PROFILEROUTINE DWORD ?
PROFILECYCLES DWORD ?
[...]
ALIGN 16 ; align to a cache entry on all CPU's
[...]
ALIGN 16 ; align to a cache entry on all CPU's
@empty: RET


Clearly the explicitly required "align to a cache entry on all CPU's" and that 4 or 16 after ALIGN are not compatible.

My advice: use a real assembler. My definition of "real assembler" is a 1:1 assembler that lets you do anything required by low level programming. Unfortunately, the Microsoft Macro Assembler doesn't always fit the definition.

Also (it's not your case, though), as I stated many times, care must be spent on understanding that PROFILE calls the routine to be tested 5 times, and although it preserves the EAX..ESP and EFL registers across these 5 internal calls, it cannot save/restore variables held in memory, for example, thus (as you did, anyway, but unlike others) one has to PROFILE a routine which also initializes all memory variables and memory buffers that it will modify (i.e. write and read).

Last but not least (again, as stated in the documentation not without a reason) << [..] For such comparative tests, remember to align the same way all the routines to be tested (use ALIGN 64 to align to a cache line on all modern x86 CPUs). >>

PS: official FAsm version (previously I released only a NASM version):



; ---------------------------------------------------------------------------
; ***************************************************************************
; ---------------------------------------------------------------------------
;
;
; PROFILING MACROs.
;
;
; Original NASM and FASM code (c) 2002 Fabio Bizzetti (a.k.a. Maverick).
; MASM translation by grv575 and improved by others (please claim credits).
;
; You can freely use and spread this source code as long as you do not modify
; any part of it, including the copyright notice.
;
; ---------------------------------------------------------------------------
;
; These MACROs and code are useful to count the exact number of cycles that
; it takes to a piece of code to execute. Although not particularly tuned to
; the hardware of a certain CPU, PROFILE is designed to work well on all the
; past, present and future x86 CPU's, so far with no exceptions known to me.
; Please report any exceptions to [email]maverock@libero.it[/email]
;
; ---------------------------------------------------------------------------
;
; Use it as follows:
;
; ... set up your CPU registers as you wish ...
; PROFILE yoursubroutine
; ... check the results of yoursubroutine, if you wish ...
;
; i.e. just use "PROFILE" exactly like you would use the CALL instruction,
; since registers/CPU flags aren't affected by the profiler.
; On return you will get in the 64 bit unsigned integer [PROFILE.CYCLES] the
; exact number of cycles it took to yoursubroutine to be executed. Of course,
; if you're certain that it took less than 2^32 cycles to execute, you can
; access [PROFILE.CYCLES] as a 32bit variable.
; Note that the routine self-adapts itself to any past, present and future
; CPU's (so you don't have to subtract x cycles depending on your CPU), it
; automatically removes the cost of the last RET in the subroutine it tests
; (simulating the test of inlined code, which is usually more useful), and
; it calls it several times to ensure that the caches are setup (you can
; also test it uncached, though, but only in ring0.. e.g. in protected mode
; Dos, where I used this PROFILE routine most), and to do it correctly it
; saves and restores the CPU registers at each call.. so the only limitation
; here is that (being the routine to be tested called more than once)
; external pointers or counters must be initialized in your own routine..
; about the EAX..ESP and EFL registers instead it's transparent, and behaves
; like if yoursubroutine to be tested was called only once.
;
; The advantages of this routine? It's 100% precise, consistent and stable,
; this is my first goal and I've never seen one that behaved better (that's
; why I wrote this). It self adapts to any CPU.. saving you from this major
; hassle. It can be used to profile whole subroutines by just using PROFILE
; instead of CALL, and giving a 64bit result, it can be used to profile
; long-executing routines as well. I still use it much for comparing
; different versions of a subroutine on the same dataset, to choose the
; fastest one. Also, it's precise down to ~0 cycles (something extremely
; troublesome on modern CPU's).. so that's the most interesting thing IMO..
; since it's reliable and can be used to test any routine in a extremely
; precise way.
;
; For such comparative tests, remember to align the same way all the routines
; to be tested (use ALIGN 64 to align to a cache line on all modern x86 CPUs).
;
; NOTE about all the ALIGN directives in the source:
; *YOU* *have* *to* provide and ensure alignment. This *IS* a requirement.
;
; ---------------------------------------------------------------------------
; ***************************************************************************
; ---------------------------------------------------------------------------

MACRO PROFILE Address {
MOV DWORD [_PROFILE.ROUTINE],Address
CALL _PROFILE
}

; -------------------------------------

ALIGN 64 ; align to a cache entry on all CPU's

PROFILE.CYCLES: DD 0
DD 0
_PROFILE.EMPTY: DD 0 ; how many cycles it takes for a simple RET to be executed on the host CPU
DD 0
_PROFILE.ROUTINE: DD 0
_PROFILE.RETURN: DD 0
_PROFILE.IN.EAX: DD 0
_PROFILE.IN.EBX: DD 0
_PROFILE.IN.ECX: DD 0
_PROFILE.IN.EDX: DD 0
_PROFILE.IN.ESI: DD 0
_PROFILE.IN.EDI: DD 0
_PROFILE.IN.EBP: DD 0
_PROFILE.IN.EFL: DD 0
_PROFILE.OUT.EAX: DD 0
_PROFILE.OUT.EBX: DD 0
_PROFILE.OUT.ECX: DD 0
_PROFILE.OUT.EDX: DD 0
_PROFILE.OUT.EFL: DD 0
_PROFILE.RETADDR: DD 0

; the following is to make sure that data and code are on a different page.
ALIGN 4096 ; note: *YOU* have to provide alignment

; -------------------------------------

ALIGN 64 ; align to a cache entry on all CPU's
_PROFILE:
MOV DWORD [_PROFILE.IN.EAX],EAX ; saves INPUT EAX (will be trashed by CPUID)
MOV DWORD [_PROFILE.IN.EBX],EBX ; saves INPUT EBX (will be trashed by CPUID)
MOV DWORD [_PROFILE.IN.ECX],ECX ; saves INPUT ECX (will be trashed by CPUID)
MOV DWORD [_PROFILE.IN.EDX],EDX ; saves INPUT EDX (will be trashed by CPUID)
MOV DWORD [_PROFILE.IN.ESI],ESI ; saves INPUT EAX (will be trashed by the routine to be tested, which will be called multiple times)
MOV DWORD [_PROFILE.IN.EDI],EDI ; saves INPUT EBX (will be trashed by the routine to be tested, which will be called multiple times)
MOV DWORD [_PROFILE.IN.EBP],EBP ; saves INPUT ECX (will be trashed by the routine to be tested, which will be called multiple times)
PUSHFD
POP DWORD [_PROFILE.IN.EFL] ; saves INPUT CPU EFLAGS
POP DWORD [_PROFILE.RETURN] ; saves return address
PUSH DWORD [_PROFILE.ROUTINE] ; saves requested _PROFILE.ROUTINE
MOV DWORD [_PROFILE.ROUTINE],.empty ; first we'll profile a simple RET
MOV DWORD [_PROFILE.RETADDR],.ret1
JMP DWORD .profile ; make sure it gets cached
.ret1: MOV DWORD [_PROFILE.RETADDR],.ret2
JMP DWORD .profile ; profile for real (well, let it set up)
.ret2: MOV DWORD [_PROFILE.RETADDR],.ret3
JMP DWORD .profile ; profile for real (well, let it set up again)
.ret3: MOV DWORD [_PROFILE.RETADDR],.ret4
JMP DWORD .profile ; profile for real (well, let it set up one final time)
.ret4: MOV DWORD [_PROFILE.RETADDR],.ret5
JMP DWORD .profile ; profile for real
.ret5: MOV EAX,DWORD [PROFILE.CYCLES+0]
MOV EDX,DWORD [PROFILE.CYCLES+4]
MOV DWORD [_PROFILE.EMPTY+0],EAX ; saves RET cycles count
MOV DWORD [_PROFILE.EMPTY+4],EDX
;
POP DWORD [_PROFILE.ROUTINE] ; restores requested _PROFILE.ROUTINE
MOV DWORD [_PROFILE.RETADDR],.ret6
JMP BYTE .profile ; make sure it gets cached
.ret6: MOV DWORD [_PROFILE.RETADDR],.ret7
JMP BYTE .profile ; profile for real
.ret7: MOV DWORD [_PROFILE.RETADDR],.ret8
JMP BYTE .profile ; make sure it gets cached
.ret8: MOV DWORD [_PROFILE.RETADDR],.ret9
JMP BYTE .profile ; profile for real
.ret9: MOV DWORD [_PROFILE.RETADDR],.ret10
JMP BYTE .profile ; make sure it gets cached
.ret10: MOV EAX,DWORD [_PROFILE.EMPTY+0] ; subtracts simple RET overhead
MOV EDX,DWORD [_PROFILE.EMPTY+4]
SUB DWORD [PROFILE.CYCLES+0],EAX ; saves cycles, low 32bit
SBB DWORD [PROFILE.CYCLES+4],EDX ; saves cycles, high 32bit
MOV EAX,DWORD [_PROFILE.OUT.EAX] ; gives OUTPUT EAX
MOV EBX,DWORD [_PROFILE.OUT.EBX] ; gives OUTPUT EBX
MOV ECX,DWORD [_PROFILE.OUT.ECX] ; gives OUTPUT ECX
MOV EDX,DWORD [_PROFILE.OUT.EDX] ; gives OUTPUT EDX
PUSH DWORD [_PROFILE.OUT.EFL]
POPFD ; gives CPU EFLAGS
JMP DWORD [_PROFILE.RETURN] ; returns to caller
.profile:
;WBINVD (if you want to test uncached data/code) ; beware, WBINVD is only for ring0
MOV EAX,DWORD [PROFILE.CYCLES+0] ; touches caches
MOV EDX,DWORD [PROFILE.CYCLES+4]
MOV EAX,DWORD [_PROFILE.IN.EAX]
MOV EBX,DWORD [_PROFILE.IN.EBX]
MOV ECX,DWORD [_PROFILE.IN.ECX]
MOV EDX,DWORD [_PROFILE.IN.EDX]
MOV ESI,DWORD [_PROFILE.IN.ESI]
MOV EDI,DWORD [_PROFILE.IN.EDI]
MOV EBP,DWORD [_PROFILE.IN.EBP]
MOV EAX,DWORD [_PROFILE.IN.EFL]
MOV EAX,DWORD [_PROFILE.OUT.EAX]
MOV EBX,DWORD [_PROFILE.OUT.EBX]
MOV ECX,DWORD [_PROFILE.OUT.ECX]
MOV EDX,DWORD [_PROFILE.OUT.EDX]
MOV EAX,DWORD [_PROFILE.OUT.EFL]
MOV EAX,DWORD [_PROFILE.ROUTINE]
MOV ECX,32
.stack: PUSH EAX ; touches stack
LOOP .stack
ADD esp,128
XOR EAX,EAX
CPUID ; flush pipelines
RDTSC
MOV DWORD [PROFILE.CYCLES+0],EAX ; saves TSC, low 32bit
MOV DWORD [PROFILE.CYCLES+4],EDX ; saves TSC, high 32bit
XOR EAX,EAX
CPUID ; flush pipelines
MOV EAX,DWORD [_PROFILE.IN.EAX] ; restores INPUT EAX
MOV EBX,DWORD [_PROFILE.IN.EBX] ; restores INPUT EBX
MOV ECX,DWORD [_PROFILE.IN.ECX] ; restores INPUT ECX
MOV EDX,DWORD [_PROFILE.IN.EDX] ; restores INPUT EDX
MOV ESI,DWORD [_PROFILE.IN.ESI]
MOV EDI,DWORD [_PROFILE.IN.EDI]
MOV EBP,DWORD [_PROFILE.IN.EBP]
PUSH DWORD [_PROFILE.IN.EFL]
POPFD ; restores CPU EFLAGS
CALL DWORD [_PROFILE.ROUTINE] ; calls the routine to be tested
MOV DWORD [_PROFILE.OUT.EAX],EAX ; saves OUTPUT EAX
MOV DWORD [_PROFILE.OUT.EBX],EBX ; saves OUTPUT EBX
MOV DWORD [_PROFILE.OUT.ECX],ECX ; saves OUTPUT ECX
MOV DWORD [_PROFILE.OUT.EDX],EDX ; saves OUTPUT EDX
PUSHFD
POP DWORD [_PROFILE.OUT.EFL] ; saves OUTPUT CPU EFLAGS
XOR EAX,EAX
CPUID ; flush pipelines
RDTSC
XCHG DWORD [PROFILE.CYCLES+0],EAX
XCHG DWORD [PROFILE.CYCLES+4],EDX
SUB DWORD [PROFILE.CYCLES+0],EAX ; saves TSC, low 32bit
SBB DWORD [PROFILE.CYCLES+4],EDX ; saves TSC, high 32bit
JMP DWORD [_PROFILE.RETADDR]

ALIGN 64 ; align to a cache entry on all CPU's
.empty: RET

; ---------------------------------------------------------------------------
; ***************************************************************************
; ---------------------------------------------------------------------------
Posted on 2002-08-11 08:29:23 by Maverick
Hi Maverick

Thanks for a very useful tool. The results simply shows that optimalisation is too complex to be done on paper alone. It must be tested.

KetilO
Posted on 2002-08-11 08:49:06 by KetilO
Hei KetilO,

Yes, I strongly believe that too. Expecially when one deals with cache-critical routines (i.e. look up tables).

Hadet,
Maverick
Posted on 2002-08-11 10:26:00 by Maverick
On my AMD I ran your a2dw test program. These are my results in order from fastest to slowest.

When strlen( lpAscii ) == 0:
KetilO
The Svin
BitRake
Nexo

When strlen( lpAscii ) == 1 :
BitRake
The Svin
KetilO
Nexo

When strlen( lpAscii ) == 2 :
The Svin
BitRake
KetilO
Nexo

When strlen( lpAscii ) == 3 and >= 7:
BitRake
The Svin
KetilO
Nexo

When strlen( lpAscii ) >= 4 and <= 6:
The Svin
BitRake
KetilO
Nexo



Very confusing results. Nexo's algo performs the worst, but as strlen() gets higher he begins to catch up. Unfortunately I can only input 8 chars.

BitRake and The Svin's algos, the two best performers, seem to be neck and neck most of the time, but The Svin's numbers appear to fluctuate where BitRake's numbers are more steady.
Posted on 2002-08-11 10:44:45 by iblis
On my PIII

Fastest first.

0 K,S,B,N
1 K=S,B,N
2 K,B,S,N
3 K,B,S,N
4 S,K,B,N
5 B,S,K,N
6 B,S,N,K
7 B,S,N,K
8 B,N,S,K
9 B,S,N,K
10 B,S=N,K

Very confusing.
It should be said that Nexo's has sign and register preservation. So the comparition is not fair.

KetilO
Posted on 2002-08-11 11:10:36 by KetilO
I modificate test.
Now all proc executed in same address space.
It is very significan moment.
Also remove sign checking from my proc.
Very intresting as was changed results.
Posted on 2002-08-11 12:46:05 by Nexo
here is my try




AsciiToDw proc uses edx esi,lpAscii:DWORD

mov esi,lpAscii
xor edx,edx
xor eax,eax
mov bl,[esi]
jmp @checkdigit
@@:

lea eax,[eax*4+eax] ;Multiply by 5
inc esi

lea eax,[eax*2+edx] ;Multiply by 2 and add digit in edx

@checkdigit:
mov dl,[esi]
xor dl,48
js @F
cmp dl,10
jb @B

@@:

ret
AsciiToDw endp


EDIT:
update the algo
Posted on 2002-08-11 13:11:11 by eko
Hi !

This is my result from my PII 350 :grin:

0 Car
Ketilo, The Svin
bitRAKE
Nexo

1 Car
The Svin, bitRAKE
Ketilo
Nexo

2 Car
bitRAKE
The Svin
Ketilo
Nexo

3,4 Car
bitRAKE
Ketilo
The Svin
Nexo

5 Car
bitRAKE
The Svin
Nexo
Ketilo

6 Car
bitRAKE
Ketilo, Nexo
The Svin

7 Car
bitRAKE
Nexo
The Svin
Ketilo

8 Car
bitRAKE
The Svin
Nexo
Ketilo



So, on My PII bitRake is the Best.
Result are very different from AMD and intel :rolleyes: :grin:

Bye ! ;)
Posted on 2002-08-11 13:51:43 by kylekiller
Note that only lower end case of PROC exit is tested and IMHO, Nexo has "cheated" by eliminating upper end test from his PROC while leaving it in all the others.

Oh, after review of the thread, it appears Nexo never tested upper exit case. :grin:

Picture (>9k) is worth some words...
Posted on 2002-08-11 16:18:22 by bitRAKE