Hi all

Any faster?
``````
AsciiToDw proc uses edx esi,lpAscii:DWORD

mov		esi,lpAscii
xor		eax,eax
xor		edx,edx
@@:
mov		dl,[esi]
sub		dl,'0'
jb		@f
cmp		dl,10
jnb		@f
lea		eax,[eax*4+eax]		;Multiply by 5
lea		eax,[eax*2+edx]		;Multiply by 2 and add digit in edx
inc		esi
jmp		@b
@@:
ret

AsciiToDw endp
``````

KetilO
Posted on 2002-08-10 17:56:27 by KetilO
Did you look at xlat/xlatb?

Did you look at cwd/cwde?
Posted on 2002-08-10 18:59:08 by Roy Cline

KetilO
Posted on 2002-08-10 19:13:06 by KetilO
ignore it :)
xlat does no help for decimals.
First rule for fast algo - remove anithing that can be removed from main loop
1. Do sub "0" at the and of loop and change unconditinal jump to conditional
sub dl,"0"
jns @B

Do adds (leas) at the begining.
advantage = x -1 , where x is number of iterations.
mov dl,
inc esi
sub dl,"0"
jns @B
at the begining
2. I don't understand the
cmp dl,10
If it is ASCIIZ then sub dl,"0" would be enough to determin if it is the end of the string (0)
So you can remove those 2 instructions too.
Or you expect some letter to be at the end of symbolic number?
If not your algo can look like:
``````

AsciiToDw proc uses edx esi,lpAscii:DWORD

mov esi,lpAscii
xor eax,eax
xor edx,edx
jmp @1
@@:
lea eax,[eax*4+eax];Multiply by 5
lea eax,[eax*2+edx];Multiply by 2 and add digit in edx
@1:
mov dl,[esi]
inc esi
sub dl,"0"
jns @B ;or jnc
ret

AsciiToDw endp

``````
Posted on 2002-08-10 20:05:12 by The Svin
hi ! :)

The Svin,

mov dl,
sub dl,'0'
jb @f
cmp dl,10
jnb @f

is equal to : if(dl>=0 && dl<10) ....

Ketilo test if it's a vailable number and stop convertion if it's not a number.
and for better optimization use 'add' and not 'inc', better pairing is more powerfull :grin:

Bye ! :)
Posted on 2002-08-10 20:56:57 by kylekiller
Looks like error testing done outside PROC?
Then: is it safe to assume one digit?
``````AsciiToDw proc uses edx ecx, lpAscii:DWORD
mov	ecx, lpAscii
xor	eax, eax
; use this line if not assume one digit and move INC ECX
;	mov	edx, '0'
movzx	edx, BYTE PTR [ecx]
@@:
lea	eax, [eax*4+eax]
inc	ecx
lea	eax, [eax*2+edx-'0']
movzx	edx, BYTE PTR [ecx]
cmp	edx, '9'+1
jnb	@F
cmp	edx, '0'
jnb	@B
@@:
ret
AsciiToDw endp``````
I have 7 fast versions of atodw posted to this board. :)
Posted on 2002-08-10 21:12:49 by bitRAKE
With a little twist from The Svin's code, this will now support signed values.
``````[size=9]s_atodw:

mov     edx, [esp+4]
cmp     BYTE PTR [edx], "-"
je      __negative

__convert:

xor     eax,eax
xor     ecx,ecx
jmp     @1
@@:
lea     eax, [eax*4+eax]    ;Multiply by 5
lea     eax, [eax*2+ecx]    ;Multiply by 2 and add digit in edx
@1:
mov     cl, [edx]
inc     edx
sub     cl, "0"
jns     @B                  ;or jnc
retn    4

__negative:

inc     edx
push    edx
call    __convert
neg     eax
retn    4[/size]``````
or from Ketilo's
``````[size=9]s_atodw:

mov     edx, [esp+4]
cmp     BYTE PTR [edx], "-"
je      __negative

__convert:

xor     eax,eax
xor     ecx,ecx

@@:
mov     cl, [edx]
sub     cl, '0'
jb      @f
cmp     cl, 10
jnb     @f
lea     eax, [eax*4+eax]    ;Multiply by 5
lea     eax, [eax*2+ecx]    ;Multiply by 2 and add digit in edx
inc     edx
jmp     @b
@@:

retn    4

__negative:

inc     edx
push    edx
call    __convert
neg     eax
retn    4[/size]``````
or from bitRAKE's
``````[size=9]s_atodw:

mov     edx, [esp+4]
cmp     BYTE PTR [edx], "-"
je      __negative

__convert:

xor     eax,eax

movzx   ecx, BYTE PTR [edx]
@@:
lea     eax, [eax*4+eax]
inc     edx
lea     eax, [eax*2+ecx-'0']
movzx   ecx, BYTE PTR [edx]
cmp     ecx, '9'+1
jnb     @F
cmp     ecx, '0'
jnb     @B
@@:

retn    4

__negative:

inc     edx
push    edx
call    __convert
neg     eax
retn    4[/size]``````
Sorry for the code bloat, but more functionality is good. :)
Posted on 2002-08-10 22:58:59 by stryker
This one lets you specify base. (base 2 up to base 36)
It is case insensitive. You can use "c1a5eb" or "C1A5EB". It won't matter.

``````atodw proc uses ecx edx esi, lpszAscii:DWORD, dwBase:DWORD

mov esi,	lpszAscii
xor eax,	eax
jmp 		_GetChar
_Numeric:	cmp ecx,	dwBase
jnc 		_Exit
mul		dwBase
inc 		esi
_GetChar:	movzx ecx,	byte ptr [esi]
sub cl,		30h
cmp cl,		0Ah
jc 		_Numeric
sub cl,		11h
and cl,		0DFh
cmp cl,		1Ah
jc 		_Alpha
_Exit:		ret

atodw endp``````
Posted on 2002-08-11 01:25:28 by iblis
Signed values conversion. Speed optimized.
``````
; esi-string
cmp [byte esi+0],'-'+1
sbb ebx,ebx
movzx eax,[byte esi-1]
sub eax,'0'
@@:	movzx ecx,[byte esi+0]
movzx edx,[byte esi+1]
sub ecx,'0'
sub edx,'0'
cmp ecx,9
ja @W
cmp edx,9
ja @F
lea ecx,[ecx+4*ecx]
imul eax,100
lea edx,[edx+2*ecx]
jmp @B
@@:	lea eax,[eax+4*eax]
lea eax,[ecx+2*eax]
xor eax,ebx
; eax-number
``````

*Edit*: I find Athlon optimized version
Posted on 2002-08-11 02:15:29 by Nexo
Hi all

Assuming PROFILE can be trusted, the results are a bit odd.

KetilO
Posted on 2002-08-11 03:49:45 by KetilO
hi ! :)

i have optimize just a little bit the proc of bitRake :

mov ecx, lpAscii
xor eax, eax
xor edx,edx
; use this line if not assume one digit and move INC ECX
;mov edx, '0'
;this instruction decrease speed only here
;movzx edx, BYTE PTR
mov dl,BYTE PTR
@@:
lea eax,
inc ecx
lea eax,
movzx edx, BYTE PTR
xor edx, '0'
cmp edx, 10
jb @B
@@:

xor edx, '0'
cmp edx, 10
jb @B

is equivalent to

cmp edx, '9'+1
jnb @F
cmp edx, '0'
jnb @B

xor '0' with a number will always give a number <10 ;) :grin:

Bye alls ! :alright:
Posted on 2002-08-11 06:33:22 by kylekiller

Hi all

Assuming PROFILE can be trusted, the results are a bit odd.

KetilO

Hi KetilO,

PROFILE never gave me any problem. If one pretends to use it with MASM, however, this has some (MASM) limitations. Quoting some lines of the MASM translation of PROFILE.INC (it wasn't translated by me, by the way, and I never tested it):

``````
ALIGN            4                              ; align to a cache entry on all CPU's
PROFILEROUTINE                  DWORD            ?
PROFILECYCLES                   DWORD            ?
[...]
ALIGN           16                              ; align to a cache entry on all CPU's
[...]
ALIGN           16                              ; align to a cache entry on all CPU's
@empty:                         RET
``````

Clearly the explicitly required "align to a cache entry on all CPU's" and that 4 or 16 after ALIGN are not compatible.

My advice: use a real assembler. My definition of "real assembler" is a 1:1 assembler that lets you do anything required by low level programming. Unfortunately, the Microsoft Macro Assembler doesn't always fit the definition.

Also (it's not your case, though), as I stated many times, care must be spent on understanding that PROFILE calls the routine to be tested 5 times, and although it preserves the EAX..ESP and EFL registers across these 5 internal calls, it cannot save/restore variables held in memory, for example, thus (as you did, anyway, but unlike others) one has to PROFILE a routine which also initializes all memory variables and memory buffers that it will modify (i.e. write and read).

Last but not least (again, as stated in the documentation not without a reason) << [..] For such comparative tests, remember to align the same way all the routines to be tested (use ALIGN 64 to align to a cache line on all modern x86 CPUs). >>

PS: official FAsm version (previously I released only a NASM version):

``````
; ---------------------------------------------------------------------------
; ***************************************************************************
; ---------------------------------------------------------------------------
;
;
; PROFILING MACROs.
;
;
; Original NASM and FASM code (c) 2002 Fabio Bizzetti (a.k.a. Maverick).
; MASM translation by grv575 and improved by others (please claim credits).
;
; You can freely use and spread this source code as long as you do not modify
; any part of it, including the copyright notice.
;
; ---------------------------------------------------------------------------
;
; These MACROs and code are useful to count the exact number of cycles that
; it takes to a piece of code to execute. Although not particularly tuned to
; the hardware of a certain CPU, PROFILE is designed to work well on all the
; past, present and future x86 CPU's, so far with no exceptions known to me.
; Please report any exceptions to [email]maverock@libero.it[/email]
;
; ---------------------------------------------------------------------------
;
; Use it as follows:
;
; ... set up your CPU registers as you wish ...
; PROFILE yoursubroutine
; ... check the results of yoursubroutine, if you wish ...
;
; i.e. just use "PROFILE" exactly like you would use the CALL instruction,
; since registers/CPU flags aren't affected by the profiler.
; On return you will get in the 64 bit unsigned integer [PROFILE.CYCLES] the
; exact number of cycles it took to yoursubroutine to be executed. Of course,
; if you're certain that it took less than 2^32 cycles to execute, you can
; access [PROFILE.CYCLES] as a 32bit variable.
; Note that the routine self-adapts itself to any past, present and future
; CPU's (so you don't have to subtract x cycles depending on your CPU), it
; automatically removes the cost of the last RET in the subroutine it tests
; (simulating the test of inlined code, which is usually more useful), and
; it calls it several times to ensure that the caches are setup (you can
; also test it uncached, though, but only in ring0.. e.g. in protected mode
; Dos, where I used this PROFILE routine most), and to do it correctly it
; saves and restores the CPU registers at each call.. so the only limitation
; here is that (being the routine to be tested called more than once)
; external pointers or counters must be initialized in your own routine..
; about the EAX..ESP and EFL registers instead it's transparent, and behaves
; like if yoursubroutine to be tested was called only once.
;
; The advantages of this routine? It's 100% precise, consistent and stable,
; this is my first goal and I've never seen one that behaved better (that's
; why I wrote this). It self adapts to any CPU.. saving you from this major
; hassle. It can be used to profile whole subroutines by just using PROFILE
; instead of CALL, and giving a 64bit result, it can be used to profile
; long-executing routines as well. I still use it much for comparing
; different versions of a subroutine on the same dataset, to choose the
; fastest one. Also, it's precise down to ~0 cycles (something extremely
; troublesome on modern CPU's).. so that's the most interesting thing IMO..
; since it's reliable and can be used to test any routine in a extremely
; precise way.
;
; For such comparative tests, remember to align the same way all the routines
; to be tested (use ALIGN 64 to align to a cache line on all modern x86 CPUs).
;
; NOTE about all the ALIGN directives in the source:
; *YOU* *have* *to* provide and ensure alignment. This *IS* a requirement.
;
; ---------------------------------------------------------------------------
; ***************************************************************************
; ---------------------------------------------------------------------------

CALL            _PROFILE
}

; -------------------------------------

ALIGN           64                              ; align to a cache entry on all CPU's

PROFILE.CYCLES:                 DD              0
DD              0
_PROFILE.EMPTY:                 DD              0                               ; how many cycles it takes for a simple RET to be executed on the host CPU
DD              0
_PROFILE.ROUTINE:               DD              0
_PROFILE.RETURN:                DD              0
_PROFILE.IN.EAX:                DD              0
_PROFILE.IN.EBX:                DD              0
_PROFILE.IN.ECX:                DD              0
_PROFILE.IN.EDX:                DD              0
_PROFILE.IN.ESI:                DD              0
_PROFILE.IN.EDI:                DD              0
_PROFILE.IN.EBP:                DD              0
_PROFILE.IN.EFL:                DD              0
_PROFILE.OUT.EAX:               DD              0
_PROFILE.OUT.EBX:               DD              0
_PROFILE.OUT.ECX:               DD              0
_PROFILE.OUT.EDX:               DD              0
_PROFILE.OUT.EFL:               DD              0

; the following is to make sure that data and code are on a different page.
ALIGN           4096                            ; note: *YOU* have to provide alignment

; -------------------------------------

ALIGN           64                              ; align to a cache entry on all CPU's
_PROFILE:
MOV             DWORD [_PROFILE.IN.EAX],EAX     ; saves INPUT EAX (will be trashed by CPUID)
MOV             DWORD [_PROFILE.IN.EBX],EBX     ; saves INPUT EBX (will be trashed by CPUID)
MOV             DWORD [_PROFILE.IN.ECX],ECX     ; saves INPUT ECX (will be trashed by CPUID)
MOV             DWORD [_PROFILE.IN.EDX],EDX     ; saves INPUT EDX (will be trashed by CPUID)
MOV             DWORD [_PROFILE.IN.ESI],ESI     ; saves INPUT EAX (will be trashed by the routine to be tested, which will be called multiple times)
MOV             DWORD [_PROFILE.IN.EDI],EDI     ; saves INPUT EBX (will be trashed by the routine to be tested, which will be called multiple times)
MOV             DWORD [_PROFILE.IN.EBP],EBP     ; saves INPUT ECX (will be trashed by the routine to be tested, which will be called multiple times)
PUSHFD
POP             DWORD [_PROFILE.IN.EFL]         ; saves INPUT CPU EFLAGS
POP             DWORD [_PROFILE.RETURN]         ; saves return address
PUSH            DWORD [_PROFILE.ROUTINE]        ; saves requested _PROFILE.ROUTINE
MOV             DWORD [_PROFILE.ROUTINE],.empty ; first we'll profile a simple RET
JMP DWORD       .profile                        ; make sure it gets cached
JMP DWORD       .profile                        ; profile for real (well, let it set up)
JMP DWORD       .profile                        ; profile for real (well, let it set up again)
JMP DWORD       .profile                        ; profile for real (well, let it set up one final time)
JMP DWORD       .profile                        ; profile for real
.ret5:                          MOV             EAX,DWORD [PROFILE.CYCLES+0]
MOV             EDX,DWORD [PROFILE.CYCLES+4]
MOV             DWORD [_PROFILE.EMPTY+0],EAX    ; saves RET cycles count
MOV             DWORD [_PROFILE.EMPTY+4],EDX
;
POP             DWORD [_PROFILE.ROUTINE]        ; restores requested _PROFILE.ROUTINE
JMP BYTE        .profile                        ; make sure it gets cached
JMP BYTE        .profile                        ; profile for real
JMP BYTE        .profile                        ; make sure it gets cached
JMP BYTE        .profile                        ; profile for real
JMP BYTE        .profile                        ; make sure it gets cached
.ret10:                         MOV             EAX,DWORD [_PROFILE.EMPTY+0]    ; subtracts simple RET overhead
MOV             EDX,DWORD [_PROFILE.EMPTY+4]
SUB             DWORD [PROFILE.CYCLES+0],EAX    ; saves cycles, low 32bit
SBB             DWORD [PROFILE.CYCLES+4],EDX    ; saves cycles, high 32bit
MOV             EAX,DWORD [_PROFILE.OUT.EAX]    ; gives OUTPUT EAX
MOV             EBX,DWORD [_PROFILE.OUT.EBX]    ; gives OUTPUT EBX
MOV             ECX,DWORD [_PROFILE.OUT.ECX]    ; gives OUTPUT ECX
MOV             EDX,DWORD [_PROFILE.OUT.EDX]    ; gives OUTPUT EDX
PUSH            DWORD [_PROFILE.OUT.EFL]
POPFD                                           ; gives CPU EFLAGS
JMP             DWORD [_PROFILE.RETURN]         ; returns to caller
.profile:
;WBINVD (if you want to test uncached data/code) ; beware, WBINVD is only for ring0
MOV             EAX,DWORD [PROFILE.CYCLES+0]    ; touches caches
MOV             EDX,DWORD [PROFILE.CYCLES+4]
MOV             EAX,DWORD [_PROFILE.IN.EAX]
MOV             EBX,DWORD [_PROFILE.IN.EBX]
MOV             ECX,DWORD [_PROFILE.IN.ECX]
MOV             EDX,DWORD [_PROFILE.IN.EDX]
MOV             ESI,DWORD [_PROFILE.IN.ESI]
MOV             EDI,DWORD [_PROFILE.IN.EDI]
MOV             EBP,DWORD [_PROFILE.IN.EBP]
MOV             EAX,DWORD [_PROFILE.IN.EFL]
MOV             EAX,DWORD [_PROFILE.OUT.EAX]
MOV             EBX,DWORD [_PROFILE.OUT.EBX]
MOV             ECX,DWORD [_PROFILE.OUT.ECX]
MOV             EDX,DWORD [_PROFILE.OUT.EDX]
MOV             EAX,DWORD [_PROFILE.OUT.EFL]
MOV             EAX,DWORD [_PROFILE.ROUTINE]
MOV             ECX,32
.stack:                         PUSH            EAX                             ; touches stack
LOOP            .stack
XOR             EAX,EAX
CPUID                                           ; flush pipelines
RDTSC
MOV             DWORD [PROFILE.CYCLES+0],EAX    ; saves TSC, low 32bit
MOV             DWORD [PROFILE.CYCLES+4],EDX    ; saves TSC, high 32bit
XOR             EAX,EAX
CPUID                                           ; flush pipelines
MOV             EAX,DWORD [_PROFILE.IN.EAX]     ; restores INPUT EAX
MOV             EBX,DWORD [_PROFILE.IN.EBX]     ; restores INPUT EBX
MOV             ECX,DWORD [_PROFILE.IN.ECX]     ; restores INPUT ECX
MOV             EDX,DWORD [_PROFILE.IN.EDX]     ; restores INPUT EDX
MOV             ESI,DWORD [_PROFILE.IN.ESI]
MOV             EDI,DWORD [_PROFILE.IN.EDI]
MOV             EBP,DWORD [_PROFILE.IN.EBP]
PUSH            DWORD [_PROFILE.IN.EFL]
POPFD                                           ; restores CPU EFLAGS
CALL            DWORD [_PROFILE.ROUTINE]        ; calls the routine to be tested
MOV             DWORD [_PROFILE.OUT.EAX],EAX    ; saves OUTPUT EAX
MOV             DWORD [_PROFILE.OUT.EBX],EBX    ; saves OUTPUT EBX
MOV             DWORD [_PROFILE.OUT.ECX],ECX    ; saves OUTPUT ECX
MOV             DWORD [_PROFILE.OUT.EDX],EDX    ; saves OUTPUT EDX
PUSHFD
POP             DWORD [_PROFILE.OUT.EFL]        ; saves OUTPUT CPU EFLAGS
XOR             EAX,EAX
CPUID                                           ; flush pipelines
RDTSC
XCHG            DWORD [PROFILE.CYCLES+0],EAX
XCHG            DWORD [PROFILE.CYCLES+4],EDX
SUB             DWORD [PROFILE.CYCLES+0],EAX    ; saves TSC, low 32bit
SBB             DWORD [PROFILE.CYCLES+4],EDX    ; saves TSC, high 32bit

ALIGN           64                              ; align to a cache entry on all CPU's
.empty:                         RET

; ---------------------------------------------------------------------------
; ***************************************************************************
; ---------------------------------------------------------------------------
``````
Posted on 2002-08-11 08:29:23 by Maverick
Hi Maverick

Thanks for a very useful tool. The results simply shows that optimalisation is too complex to be done on paper alone. It must be tested.

KetilO
Posted on 2002-08-11 08:49:06 by KetilO
Hei KetilO,

Yes, I strongly believe that too. Expecially when one deals with cache-critical routines (i.e. look up tables).

Maverick
Posted on 2002-08-11 10:26:00 by Maverick
On my AMD I ran your a2dw test program. These are my results in order from fastest to slowest.

When strlen( lpAscii ) == 0:
KetilO
The Svin
BitRake
Nexo

When strlen( lpAscii ) == 1 :
BitRake
The Svin
KetilO
Nexo

When strlen( lpAscii ) == 2 :
The Svin
BitRake
KetilO
Nexo

When strlen( lpAscii ) == 3 and >= 7:
BitRake
The Svin
KetilO
Nexo

When strlen( lpAscii ) >= 4 and <= 6:
The Svin
BitRake
KetilO
Nexo

Very confusing results. Nexo's algo performs the worst, but as strlen() gets higher he begins to catch up. Unfortunately I can only input 8 chars.

BitRake and The Svin's algos, the two best performers, seem to be neck and neck most of the time, but The Svin's numbers appear to fluctuate where BitRake's numbers are more steady.
Posted on 2002-08-11 10:44:45 by iblis
On my PIII

Fastest first.

0 K,S,B,N
1 K=S,B,N
2 K,B,S,N
3 K,B,S,N
4 S,K,B,N
5 B,S,K,N
6 B,S,N,K
7 B,S,N,K
8 B,N,S,K
9 B,S,N,K
10 B,S=N,K

Very confusing.
It should be said that Nexo's has sign and register preservation. So the comparition is not fair.

KetilO
Posted on 2002-08-11 11:10:36 by KetilO
I modificate test.
Now all proc executed in same address space.
It is very significan moment.
Also remove sign checking from my proc.
Very intresting as was changed results.
Posted on 2002-08-11 12:46:05 by Nexo
here is my try

``````

AsciiToDw proc uses edx esi,lpAscii:DWORD

mov		esi,lpAscii
xor 		edx,edx
xor		eax,eax
mov 		bl,[esi]
jmp @checkdigit
@@:

lea		eax,[eax*4+eax]		;Multiply by 5
inc		esi

lea		eax,[eax*2+edx]		;Multiply by 2 and add digit in edx

@checkdigit:
mov		dl,[esi]
xor		dl,48
js @F
cmp 		dl,10
jb @B

@@:

ret
AsciiToDw endp
``````

EDIT:
update the algo
Posted on 2002-08-11 13:11:11 by eko
Hi !

This is my result from my PII 350 :grin:

0 Car
Ketilo, The Svin
bitRAKE
Nexo

1 Car
The Svin, bitRAKE
Ketilo
Nexo

2 Car
bitRAKE
The Svin
Ketilo
Nexo

3,4 Car
bitRAKE
Ketilo
The Svin
Nexo

5 Car
bitRAKE
The Svin
Nexo
Ketilo

6 Car
bitRAKE
Ketilo, Nexo
The Svin

7 Car
bitRAKE
Nexo
The Svin
Ketilo

8 Car
bitRAKE
The Svin
Nexo
Ketilo

So, on My PII bitRake is the Best.
Result are very different from AMD and intel :rolleyes: :grin:

Bye ! ;)
Posted on 2002-08-11 13:51:43 by kylekiller
Note that only lower end case of PROC exit is tested and IMHO, Nexo has "cheated" by eliminating upper end test from his PROC while leaving it in all the others.

Oh, after review of the thread, it appears Nexo never tested upper exit case. :grin:

Picture (>9k) is worth some words...
Posted on 2002-08-11 16:18:22 by bitRAKE