Here is Svin's routine sligthly modified to use the '-' sign.
Also, I think I optimized the loop routine.

I'd be glad to hear about other optimizations.

JCM

.code
uqw2a proc uses ebx ecx esi edi lBuffer,lQword
public _uqw2a
_uqw2a:
LOCAL num18:QWORD
mov esi,offset lQword
mov edi,offset lBuffer
mov edx,
mov eax,

or eax, eax
jns positive
mov byte ptr , '-'
inc edi
neg eax
neg edx
sbb eax, 0
positive:
mov esi, edi

N19H equ 0DE0B6B3h
N19L equ 0A7640000h
N20H equ 8AC72304h
N20L equ 89E80000h
cmp eax,N19H
jne C20
cmp edx,N19L
C20:
jc C18
cmp eax,N20H
jne DO20
cmp edx,N20L
DO20:
jc DO19
mov byte ptr ,'1'
sub edx,N20L
lea edi,
sbb eax,N20H
DO19:
mov ebx, '0'-1
@@: inc ebx
sub edx,N19L
sbb eax,N19H
jnc @B
add edx,N19L
mov byte ptr ,bl
adc eax,N19H
inc edi
C18:
mov dword ptr num18,edx
mov dword ptr num18+4,eax

sub esp,10
fild num18
fbstp
mov ecx,2
@@:
pop eax
mov ebx, eax
and eax, 00F0F0F0Fh
and ebx, 0F0F0F0F0h
shr ebx, 4
add eax, 30303030h
add ebx, 30303030h

mov byte ptr , bh
mov byte ptr , ah
mov byte ptr , bl
mov byte ptr , al
shr ebx, 16
shr eax, 16
mov byte ptr , bh
mov byte ptr , ah
mov byte ptr , bl
mov byte ptr , al
dec ecx
lea edi,
jne @B

mov ah,byte ptr
add esp,2 ; pop ax better ?
mov al,ah
shr al,4
and eax,0f0fh
or eax,3030h
mov word ptr ,ax
mov byte ptr ,0

lea ebx,
@@: inc ebx
cmp byte ptr ,30h
je @B
jae finalfill
dec ebx
finalfill:
mov al, byte ptr
inc ebx
mov byte ptr , al
inc esi
or al,al
jne finalfill
ret
uqw2a endp

JCM
Posted on 2002-05-15 07:41:06 by MCoder
Thanks MCoder

I didn't see the small bug that happens if low part was 0 before hi part was 0. I added in your fixes, and also added a check to avoid dividing the hi part if it already reached zero.
Now uses the stack to store partial digits during calculation.

final code




.486
.model flat,stdcall
option casemap:none

include \masm32\include\Windows.inc
include \masm32\include\masm32.inc
include \masm32\include\kernel32.inc
include \masm32\include\user32.inc

includelib \masm32\lib\user32.lib
includelib \masm32\lib\masm32.lib
includelib \masm32\lib\kernel32.lib

i64toa proto :DWORD,:DWORD

.data
testint1 dq 7FFFFFFFFFFFFFFFh
testint2 dq -1234567890123456789
caption1 db "POSITIVE 64 bit integer",0
caption2 db "NEGATIVE 64 bit integer",0
testbuff db 64 dup(32) ,0

.code

start:
invoke i64toa,ADDR testint1,ADDR testbuff
invoke MessageBox,0,ADDR testbuff,ADDR caption1,0
invoke i64toa,ADDR testint2,ADDR testbuff
invoke MessageBox,0,ADDR testbuff,ADDR caption2,0
invoke ExitProcess,0

i64toa proc uses ebx ecx edx edi esi lpLarge:DWORD,lpBuffer:DWORD

mov edi,lpLarge
mov ebx,[edi] ; low 32 bits
mov ecx,[edi+4] ; high 32 bits

mov edi,lpBuffer
or ecx, ecx
jns positive
mov byte ptr [edi],'-' ; integer is negative
inc edi
not ebx
not ecx
add ebx,1
adc ecx,0

positive:
;--------------------------------
xor eax, eax ; push the 0 to end string
push eax
mov esi,10 ; decoding the 64 bit binary to ASCII

@@: sub edx,edx
test ecx,ecx ; avoid dividing numhi
jz not64 ; if it's already 0

mov eax,ecx
div esi ; divide numhi by 10...
mov ecx,eax

not64: mov eax,ebx
div esi ; .... then numlo by 10, using
mov ebx,eax ; first remainder in edx
add dl,30h ; new remainder plus 30h is the digit
push edx ; store it in the stack for printing
or eax,ecx
jnz @B ; keep looping until no digits left
;---------------------------------------------------------------
; conversion finished
; copy from stack to testbuff
; +123456.................. pop to testbuff after sign
;---------------------------------------------------------------

move:
pop eax
mov [edi],al ; move......
inc edi
or eax,eax
jnz move
ret
i64toa endp

end start

Posted on 2002-05-15 16:50:15 by towers
Good trick to avoid half of the divisions !

BTW, you can optimize this:
not ebx
not ecx
add ebx,1
adc ecx,0

as:

neg ecx
neg ebx
sbb ecx, 0

JCM
Posted on 2002-05-15 17:43:00 by MCoder
More optimized code for good processors ;)


const
dig1 ='0'
dig2 ='0'
dig3 ='0'
len =1
label tblX2DecAsc dword
REPT 1000
dd len shl 24 + dig1 shl 16 + dig2 shl 8 + dig3
dig1=dig1+1
if dig1 gt '9'
if len eq 1
len=2
endif
dig1='0'
dig2=dig2+1
if dig2 gt '9'
if len eq 2
len=3
endif
dig2='0'
dig3=dig3+1
endif
endif
ENDM

macro Div1000
mov edx,10624DD3h
mov ecx,eax
mul edx
shr edx,6
imul eax,edx,1000
sub ecx,eax
endm

macro FullTriplet
pop eax
Div1000
mov eax,edx
mov esi,[tblX2DecAsc+4*ecx]
Div1000
mov eax,[tblX2DecAsc+4*edx]
mov [d,edi+0],eax
mov eax,[tblX2DecAsc+4*ecx]
mov [d,edi+3],eax
mov [d,edi+6],esi
add edi,9
endm

codeseg
proc Qword2DecAsc
test edx,edx
je Dword2DecAsc
mov ebx,eax
mov eax,edx
mov esi,1000000000
xor edx,edx
div esi
mov ecx,eax
mov eax,ebx
div esi
mov ebx,ecx
or ecx,eax
je @@t3
push edx
mov edx,ebx
div esi
test eax,eax
je @@t2
push edx
cmp eax,10
jl @@ten
mov ebx,0CCCCCCCDh
mov ecx,eax
mul ebx
shr edx,3
imul esi,edx,10
add edx,'0'
mov [edi],dl
inc edi
sub ecx,esi
mov eax,ecx
@@ten: add al,'0'
mov [edi],al
inc edi
FullTriplet
@@t1: FullTriplet
mov [b,edi],0
ret
@@t2: mov eax,edx
call Dword2DecAsc
jmp @@t1
@@t3: mov eax,edx

proc Dword2DecAsc
mov ebx,esp
@@lpG: Div1000
mov eax,edx
push [tblX2DecAsc+4*ecx]
test eax,eax
jne @@lpG
sub ebx,esp
pop eax
mov edx,eax
shr edx,24
mov ecx,edx
xor ecx,3
shl ecx,3
shr eax,cl
mov [edi],eax
add edi,edx
jmp [tbl-4+ebx]
const
tbl dd q@@0,q@@1,q@@2,q@@3
codeseg
q@@3: pop [d,edi]
add edi,3
q@@2: pop [d,edi]
add edi,3
q@@1: pop [d,edi]
add edi,3
q@@0: mov [b,edi],0
ret
endp
endp

Compare Stepan's MMX and div+table versions



MMX div+tbl number
249 32 0
276 32 1
276 32 10
276 32 100
276 43 1000
282 43 10000
282 43 100000
282 55 1000000
280 57 10000000
284 57 100000000
284 69 1000000000
284 185 10000000000
283 185 100000000000
295 196 1000000000000
294 195 10000000000000
294 195 100000000000000
293 208 1000000000000000
289 209 10000000000000000
289 209 100000000000000000
294 185 1000000000000000000
292 191 10000000000000000000
316 191 18446744073709551615
313 191 17777777777777777777
298 191 12345678901234567890
clocks
Posted on 2002-06-03 09:39:19 by Nexo
Nexo, how many conversions do you think would have to be made before this algo really becomes faster than a non-table version on these 'good processors'?
Posted on 2002-06-03 10:37:07 by bitRAKE
Probably you have in view of cache a problem. The table - 4000 bytes. For memory DDR333 cache will be loaded for ~3000ns. If frequency of the processor of 1400 MHz, it 4285clk. An average of clock ticks MMX 286 clk, DIV-TBL 126 clk. As the result, for arbitrary number it will be 27 conversions. It so is a lot of for you? I can somewhere was mistaken? But in any case I see acceleration of machining in real applications.
Posted on 2002-06-03 14:31:56 by Nexo
I am just curious about the environment for which this algo was designed and to stress that there is a need for more than one algo for a task. I would not use this algo in a general way - it is for applications that do several conversions almost consecutively, or rather don't thrash the data cache between conversions. Thanks for sharing it with us. :)
Posted on 2002-06-03 15:12:49 by bitRAKE
What is "general way"? :) I saw variation of productivity at generation of reports, mathematical tables and statistics, orders, listings of compilation. Even at insignificant quantity of numbers on the form (2-4). Three numbers, on the average ~10 digits - 10 triplets. The table - 63 lines cache (AMD). At worst 10 triplets - 1/6 necessary lines. But it is really even less. Plus a good FSB 266Mhz (533 Mhz). How many there should be a period between conversions for trash 384 Kb (or 512 Kb) cache memories? For one thread quantum will be generated more forms.
Posted on 2002-06-05 12:23:50 by Nexo