I just finished a number to string conversion proc as a learning exercise.  My idea was to divide the number by 1 in the largest possible place value and then continue to divide reiteratively, dividing the divisor by 10 each time.  After each division I take the quotient and add 48 (the ascii table offset) to it to get that number's corresponding ascii character.  Then at the end taking the last number as is.

So that's my solution and it works like a charm.  My question now is:  is that the way it's usually done, or is there a less costly algorithm?  There used to be an algorithms section on this board but I couldn't find it.  Cheers!

Here's the code:


;+----------------------------------------------------------------------------+
;|  Title: NumberToString
;|  In: dwNumberIn (Contains the dword value to be converted.)
;|      dwBufSizeIn (Contains the size, in bytes, of the buffer pointed to.)
;|  Out: pStringOut (Contains a pointer to the buffer receiving the string.)
;|  Returns: -2 on buffer to small, -1 on error, 0 if successful
;|  Notes: Maximum DWORD value is 4,294,967,295 (0xFFFFFFFF)
;|  Author: R.Daneel     
;+----------------------------------------------------------------------------+
NumberToString   proc    dwNumberIn:DWORD, pStringOut:DWORD, dwBufSizeIn:DWORD
    LOCAL   dwRemainder:DWORD
    LOCAL   dwQuotient:DWORD
    LOCAL   dwNumber:DWORD
    LOCAL   pCharPosition:DWORD
    LOCAL   dwDivCount:DWORD
    LOCAL   dwDivisor:DWORD
    LOCAL   boolZeroFlag:DWORD
    LOCAL   dwDigitCount:DWORD


    ;## Set the zero keeper flag to FALSE
    mov     boolZeroFlag, FALSE
   
    ;## Set the initial digit count to zero
    mov     dwDigitCount, 0

    ;## Save the original number
    push    dwNumberIn
    pop     dwNumber
   
    ;## Set the string buffer tracking variable
    push    pStringOut
    pop     pCharPosition
   
    ;## Set the initial divisor to 1 billion and the div count to 9
    mov     dwDivisor, 1000000000     
    mov     dwDivCount, 9
   
    ;## Clear the output buffer with zeros
    pushf
    cld
    mov     al, 0
    mov     ecx, dwBufSizeIn
    mov     edi, pStringOut
    rep     stosb
    popf
     
    ;## Enter the loop to begin dividing down the number by 10's.  The idea is to
    ;## divide the number again and again by descending 10's position and use the
    ;## quotient as the basis for what number goes at that position.  Then we add
    ;## 48 to that number to get to the corresponding ascii character.
    .WHILE (dwDivCount > 0)
        ;## See if this number is divisible by the divisor
        nop
        xor     eax, eax
        xor     edx, edx
        mov     eax, dwNumber
        mov     ebx, dwDivisor
        div     ebx
        mov     dwRemainder, edx
        mov     dwQuotient, eax
       
        .IF ((dwQuotient > 0) && (boolZeroFlag != TRUE))
            ;## Since we've hit a number greater than zero, now we need to
            ;## display the zeros since they'll have significance.
            mov     boolZeroFlag, TRUE           
        .ENDIF
       
        .IF ((dwQuotient > 0) || (boolZeroFlag == TRUE))
            ;## See if we've reached the string size limit
            inc     dwDigitCount
            mov     eax, dwBufSizeIn
            dec     eax
            .IF (dwDigitCount > eax)
                ;## Since there was an error, let's clear the output buffer
                ;## with zeros so we don't give false results.
                pushf
                cld
                mov     al, 0
                mov     ecx, dwBufSizeIn
                mov     edi, pStringOut
                rep     stosb
                popf           
                mov     eax, -2
                ret
            .ENDIF
                   
            ;## This number was larger than the divisor so the quotient
            ;## should tell us what the first character should be.  We need
            ;## to add 48 to that number to get the equivelant ascii char.
            ;## We then store the character in the first byte of the string.
            xor     eax, eax
            mov     eax, dwQuotient
            add     eax, 48
            mov     edi, pCharPosition
            stosb
           
            ;## Increment to the next byte in the string buffer.
            inc     pCharPosition           
           
            ;## Now subtract (quotient * divisor) from the original value
            ;## to begin testing the next position.
            .WHILE (dwQuotient > 0)
                mov     eax, dwNumber
                sub     eax, dwDivisor
                mov     dwNumber, eax
                dec     dwQuotient
            .ENDW
        .ENDIF
       
        ;## Decrement the div count and divide the divisor by 10 to get the
        ;## next 10's position to test
        dec     dwDivCount
        xor     eax, eax
        xor     edx, edx
        mov     eax, dwDivisor
        mov     ebx, 10
        div     ebx
        mov     dwDivisor, eax     
    .ENDW
   
    ;## Put the final remainder in the last string position
    xor     eax, eax
    mov     eax, dwNumber
    add     eax, 48
    mov     edi, pCharPosition
    stosb
   
   
    xor     eax, eax
    ret
NumberToString   endp
Posted on 2009-03-25 16:55:59 by rdaneel
Hi,
No offence, really, but this is acutally the longest and probably the slowest "dw2ascii" function I've ever seen ;) So yeah - there are better ways to do it :P First of all: avoid any divisions. Div is generally bad. Then delete this 'Clear the output buffer with zeros' because it's kinda pointless.

Look at MASM32's implementation of "dwtoa". It should give you some nice ideas :)
Posted on 2009-03-26 09:24:41 by ti_mo_n
A way to remove one division is to 'reverse' your algorithm.
You divide out digits from one end, starting with a big divisor, and then cutting down the divisor one digit at a time...
The number can be seen as this:
number = digit(n) * 10^n + digit(n-1) * 10^(n-1) + ... + digit(1) * 10^1 + digit(0) * 10^0

Looking at it another way:
digit(0) = number % 10;
digit(1) = (number/10) % 10;
digit(2) = (number/100) % 10;
...
digit(n) = number/(10^n) % 10;

Basically you can rewrite that as a sequential algorithm:
number(n) = number(n-1) / 10;
digit(n) = number(n) % 10;

This allows you to take advantage of the fact that % 10 and / 10 are a single div operation. Then you just have a single loop... The only difference is that you start filling your buffer from the other side.
Posted on 2009-03-26 10:23:44 by Scali
ti_mo_n:  none taken.  i am a terrible asm programmer.  this was mostly a mental exercise to see if I could reproduce this function without cheating and looking at the m32lib source.  now that I know a way to do it I feel free to cheat. :)  i planned on trying to convert those divs to some form of shr+mul after I confirmed that the layout worked.  thanks for the info.

scali:  this seems to be what the m32lib is doing if I read it correctly.  now I understand why he reversed the digits in that code.  great info.  thankyou.

- in general, I hate using pre-made code without knowing at least how it accomplishes what it's doing.  once i understand the concept then it doesn't feel like cheating to use those common functions.  maybe i'm just odd.  thanks again guys.
Posted on 2009-03-26 11:06:48 by rdaneel
On a side note (and interesting tidbit) I thought once like that too but unlike you I decided to peek in the m32lib and clearly remember one of my bestest WTF moments ever...  :lol:

; #########################################################################

      .386
      .model flat, stdcall  ; 32 bit memory model
      option casemap :none  ; case sensitive

        wsprintfA PROTO C :DWORD,:VARARG
        wsprintf equ <wsprintfA>

    .data
      fMtStrinG db "%lu",0

    .code

; #########################################################################

dw2a proc dwValue:DWORD, lpBuffer:DWORD

    ; -------------------------------------------------------------
    ; convert DWORD to ascii string
    ; dwValue is passed as a value, direct, indirect or in register
    ; lpBuffer is the ADDRESS of the receiving buffer
    ; EXAMPLE:
    ; invoke dw2a,edx,ADDR buffer
    ; -------------------------------------------------------------
   
    invoke wsprintf,lpBuffer,ADDR fMtStrinG,dwValue

    ret

dw2a endp

; #########################################################################

end
Posted on 2009-03-26 11:29:29 by JimmyClif
LOL!!!  i did the same freakin thing.  I saw that call to wsprintf and thought cheater!  then i realized i was looking at the wrong file.  :oops:  good stuff. lol.
Posted on 2009-03-26 12:43:50 by rdaneel
You could probably use muls yea.
I don't think I ever tried that... but you could use fixedpoint arithmetic...
Namely:
1: x / 10 = x * (1/10)
Which can be rewritten to (x * (2^32/10)) / (2^32) == (x * (2^32/10)) >> 32
And
2: x % 10 = x - ((int)(x/10)*10)

Substitute x/10 with the first equation, and you have completely removed the divisions from the algorithm.
The x*10 operation can also be replaced with some shifts or adds...
Eg:
y = x + x // x * 2
z = x << 3 // x * 8
x = x + y // x * (2 + 8 )

Depending on the CPU used, these elaborate substitutions might actually be faster than using a div or mul directly.
Posted on 2009-03-26 12:56:41 by Scali
I tried my hand at doing it with muls only, no divs...
It works, gets a tad inaccurate for numbers close to 2^32, but I've put in a correction factor, so it always gives the right result. If you were to optimize it, you could split the algo in two... If you find out where the instability starts, you can just use the quick-and-dirty version for all small numbers, and use the correction only for large numbers.
Posted on 2009-03-26 15:42:05 by Scali

I tried my hand at doing it with muls only, no divs...

What about only one multiplication and no loops :)
http://www.masm32.com/board/index.php?topic=9951.msg73240#msg73240
Posted on 2009-03-26 16:45:01 by drizz


I tried my hand at doing it with muls only, no divs...

What about only one multiplication and no loops :)
http://www.masm32.com/board/index.php?topic=9951.msg73240#msg73240


I'd rather hear the explanation than see the code.
Posted on 2009-03-26 16:58:44 by Scali
Sure,

maximum unsiged 32bit number is 4294967295, thats 10 digits in base 10
im using fixed point arithmetic to get decimal digit in the last 4 bits of a 64bit number

only small difference is handling the first digit, it can be max 4(10)
i'm going to multiply the number by 2(64-A)/B
where A is the number of bits of number 4
4(10) = 100(2), that is 3 bits, A=3
B = 10Int(Log10(4294967295))

2^(64-A)/B = 0x089705F41

after handling the first digit, we get subsequent digits by multiplying with 10
the digit is in the last nibble of 64bit number (last 4 bits)


dw2ascii(DWORD val)
{
//val=4294967295;
QWORD x,y;
DWORD i,j;

x=(QWORD)val*0x089705F41; // mul by 2^(64-3)/1000000000
x+=0x70000000; // this is only needed for when val is very big ~~ 4294967295
i=x>>(64-3); // last 3 bits == either 0,1,2,3 or 4
printf("%u",i);
// move the number in position and clear out the last nibble (4 bits)
x>>=1;

x=x&0xFFFFFFFFFFFFFFF;// mask out everything but the last nibble
x*=10;// multiply by 10 to get next digit in the last nibble
i=x>>(64-4);
printf("%u",i);

x=x&0xFFFFFFFFFFFFFFF;
x*=10;
i=x>>(64-4);
printf("%u",i);

x=x&0xFFFFFFFFFFFFFFF;
x*=10;
i=x>>(64-4);
printf("%u",i);

x=x&0xFFFFFFFFFFFFFFF;
x*=10;
i=x>>(64-4);
printf("%u",i);

x=x&0xFFFFFFFFFFFFFFF;
x*=10;
i=x>>(64-4);
printf("%u",i);

x=x&0xFFFFFFFFFFFFFFF;
x*=10;
i=x>>(64-4);
printf("%u",i);

x=x&0xFFFFFFFFFFFFFFF;
x*=10;
i=x>>(64-4);
printf("%u",i);

x=x&0xFFFFFFFFFFFFFFF;
x*=10;
i=x>>(64-4);
printf("%u",i);

x=x&0xFFFFFFFFFFFFFFF;
x*=10;
i=x>>(64-4);
printf("%u",i);

}



Oh, and full 64bits are only needed for first 2 digits, afterwards lower dword can be ignored
Posted on 2009-03-26 18:26:47 by drizz
Yea, thanks.
I half figured out your code... I saw that you were multiplying by 10 all the time, extracting one digit from the most significant part of the register.
That made me think that you used a constant to basically scale 1000000000 up to 0x10000000 (so most significant decimal digit becomes most significant hexadecimal digit.. in a way... and that means the top 4 bits).

I didn't really have the time yet to fully analyse your constant, but I did see there was some kind of special case. And right, good observation that the top digit can only be 0-4, so you need less bits than for the rest.

One thing though... Why exactly do you do this:
x+=0x70000000; // this is only needed for when val is very big ~~ 4294967295

I suppose 0x700000000 is the equivalent of 0.5 in the upscaled number? So basically you use it to round to nearest (as a proper div would do), rather than rounding down?

At any rate, it's a very elegant routine... If it's completely robust for all numbers between 0 and 2^32, that'd be great.
I bet if I write out all my multiplications into a formula, this relation will somehow follow from it. I used the 'reverse' method like with divisions, starting out with the lowest digit first. But for multiply, using the highest digit first makes a lot more sense.
Posted on 2009-03-27 02:50:46 by Scali
DIV isn't too bad on newer cpu's but using SSEx (if you can) is the way to go.
Posted on 2009-03-27 02:59:59 by sinsi

Yea, thanks.
I half figured out your code... I saw that you were multiplying by 10 all the time, extracting one digit from the most significant part of the register.
That made me think that you used a constant to basically scale 1000000000 up to 0x10000000 (so most significant decimal digit becomes most significant hexadecimal digit.. in a way... and that means the top 4 bits).

I didn't really have the time yet to fully analyse your constant, but I did see there was some kind of special case. And right, good observation that the top digit can only be 0-4, so you need less bits than for the rest.

One thing though... Why exactly do you do this:
x+=0x70000000; // this is only needed for when val is very big ~~ 4294967295

I suppose 0x700000000 is the equivalent of 0.5 in the upscaled number? So basically you use it to round to nearest (as a proper div would do), rather than rounding down?

At any rate, it's a very elegant routine... If it's completely robust for all numbers between 0 and 2^32, that'd be great.
I bet if I write out all my multiplications into a formula, this relation will somehow follow from it. I used the 'reverse' method like with divisions, starting out with the lowest digit first. But for multiply, using the highest digit first makes a lot more sense.


You are correct in your observations, and the function works for all numbers in [0 , 232-1] interval.
0x70000000 value is empiricaly chosen and yes its for rounding, sorry for the misleading comment

dw2ascii(DWORD val,BYTE *pbuff)
{
//DWORD val=4294967295;
QWORD x;
DWORD i,j;

x=(QWORD)val*0x089705F41; // mul by 2^(64-3)/1000000000
x+=0x70000000;
i=x>>(64-3); // last 3 bits == either 0,1,2,3 or 4

pbuff[0]=i+0x30;

x>>=1; // clear out the last nibble (4 bits)
x=x&0xFFFFFFFFFFFFFFF;// mask out everything but the last nibble
x*=10;// multiply by 10 to get next digit in the last nibble
i=x>>(64-4);

pbuff[1]=i+0x30;

j=(x>>32)&0xFFFFFFF;
j*=10;
i=j>>(32-4);
pbuff[2]=i+0x30;

j&=0xFFFFFFF;
j*=10;
i=j>>(32-4);
pbuff[3]=i+0x30;

j&=0xFFFFFFF;
j*=10;
i=j>>(32-4);
pbuff[4]=i+0x30;

j&=0xFFFFFFF;
j*=10;
i=j>>(32-4);
pbuff[5]=i+0x30;

j&=0xFFFFFFF;
j*=10;
i=j>>(32-4);
pbuff[6]=i+0x30;

j&=0xFFFFFFF;
j*=10;
i=j>>(32-4);
pbuff[7]=i+0x30;

j&=0xFFFFFFF;
j*=10;
i=j>>(32-4);
pbuff[8]=i+0x30;

j&=0xFFFFFFF;
j*=10;
i=j>>(32-4);
pbuff[9]=i+0x30;

pbuff[10]=0;
}

Posted on 2009-03-27 07:37:57 by drizz
Thanks for the explanation.
It was fun trying to dissect your code, and making my own version. I'll try to complete my version tonight, now that the last few details have been worked out (hopefully)... It sorta worked, but wasn't accurate enough for all digits.

I think it's fun and very useful to solve problems without using the 'obvious' solution, based on simple maths.
In the past, I used a Commodore 64, which didn't even have instructions for mul and div at all. You could only add, sub and shift (one bit at a time).
Since I had to write my own mul and div routines, it gave me a nice insight in how various operations relate, and how analysing bit-patterns can be useful in skipping certain operations, or rewriting things for more efficient instructions.
Posted on 2009-03-27 09:29:49 by Scali
You don't have to tell me that assembly is fun  :P

For the brave ones that want to apply this tehnique for 64bit number-to-string
2^127/10^19 = 17014118346046923173 (0EC1E4A7DB69561A5H) is a nice constant

here is a quick test that shows it works for 64bits too (watch the leftmost nibble)
FFFFFFFFFFFFFFFF(16) = 18446744073709551615(10)
EC1E4A7DB69561A5 * FFFFFFFFFFFFFFFF
= EC1E4A7DB69561A413E1B582496A9E5B
------------------------------------
= EC1E4A7DB69561A413E1B582496A9E5B .
= 1D83C94FB6D2AC34827C36B0492D53CB .
= 8725DD1D243ABA0D18DA22E2DBC545EE
= 477AA3236A4B448C
= 4ACA5F6226F0AD78
= 6BE7B9D58566C6B0
= 770D42573603C2E0
= 468497681C259CC0
= 412DEA1119781F80
= 0BCB24AAFEB13B00
= 75EF6EADF2EC4E00
= 3B5A52CB7D3B0C00
= 71873BF2E44E7800
= 0F48577CEB10B000
= 98D36AE12EA6E000
= 58422CCBD284C000
= 5295BFF6392F8000
= 19D97F9E3BDB0000
= 627EFC2E568E0000
= 18F5D9CF618C0000
= 599A8219CF780000

again for first 2 digits we need 128bits and afterwards only 64.
Posted on 2009-03-27 10:36:41 by drizz
ok, here is a test version of 64bitNumberToString, needs further testing and optimizing.

__declspec(naked) __stdcall _U64ToStr (QWORD val, BYTE *pbuff)
{
__asm
{
#define __locals 4*4+4*4//4*4=regs + 4*4=128bit temp
#define __saveregs dword ptr
#define __128bittmp dword ptr
#define __val_hi dword ptr
#define __val_lo dword ptr
#define __pbuff dword ptr
#define __cnst_hi 0xEC1E4A7D
#define __cnst_lo 0xB69561A5
#define __correction_lo 0xE30437FB
#define __correction_hi 0xAD7B4F1B
sub esp,__locals
mov __saveregs[0*4],ebp
mov __saveregs[1*4],esi
mov __saveregs[2*4],edi
mov __saveregs[3*4],ebx
//
// 64bit x 64bit = 128bit result
// -------------------------------
mov esi,__val_lo//__x_lo
mov edi,__val_hi//__x_hi
mov eax,__cnst_lo//__y_lo; = b0
mul esi;// get a0*b0 = d1:d0
//mov ebp,__r_lo
mov __128bittmp[0*4],eax;//d0
mov ecx,edx;//d1
mov eax,__cnst_lo//__y_lo; = b0
xor ebx,ebx
mul edi;// get a1*b0 = e1:e0
add ecx,eax;//e0
adc ebx,edx;//e1
mov eax,__cnst_hi//__y_hi; =b1
mul esi;// get a0*b1 = f1:f0
add ecx,eax;//f0
adc ebx,edx;//f1
mov __128bittmp[1*4],ecx
mov ecx,0
mov eax,__cnst_hi//__y_hi; =b1
adc ecx,ecx
//mov ebp,__r_hi
mul edi;// get a1*b1 = g1:g0
add eax,ebx;//g0
adc edx,ecx;//g1
mov __128bittmp[2*4],eax
mov __128bittmp[3*4],edx
// -------------------------------
xor eax,eax
add __128bittmp[0*4],__correction_lo
adc __128bittmp[1*4],__correction_hi
adc __128bittmp[2*4],eax
adc __128bittmp[3*4],eax
// -------------------------------
mov eax,'0'
mov edx,__128bittmp[3*4]
add edx,edx//high dword of high qword
adc eax,0// first digit 0 or 1
mov ebp,__pbuff
mov ,al
shr edx,1
//mov __128bittmp[3*4],edx
//
mov esi,__128bittmp[0*4]
mov edi,__128bittmp[1*4]
mov eax,__128bittmp[2*4]
//mov edx,__128bittmp[3*4]

// move into position
shrd esi,edi,3
shrd edi,eax,3
shrd eax,edx,3
shr edx,3

// mul by 10

mov __128bittmp[0*4],esi
mov __128bittmp[1*4],edi
mov __128bittmp[2*4],eax
mov __128bittmp[3*4],edx
shld edx,eax,2
shld eax,edi,2
shld edi,esi,2
shl esi,2
add esi,__128bittmp[0*4]
adc edi,__128bittmp[1*4]
adc eax,__128bittmp[2*4]
adc edx,__128bittmp[3*4]
add esi,esi
adc edi,edi
adc eax,eax
adc edx,edx

mov ebx,edx
and edx,0x0FFFFFFF
shr ebx,32-4
add ebx,'0'
mov byte ptr ,bl

// mul by 10

mov __128bittmp[0*4],esi
mov __128bittmp[1*4],edi
mov __128bittmp[2*4],eax
mov __128bittmp[3*4],edx
shld edx,eax,2
shld eax,edi,2
shld edi,esi,2
shl esi,2
add esi,__128bittmp[0*4]
adc edi,__128bittmp[1*4]
adc eax,__128bittmp[2*4]
adc edx,__128bittmp[3*4]
add esi,esi
adc edi,edi
adc eax,eax
adc edx,edx

mov ebx,edx
and edx,0x0FFFFFFF
shr ebx,32-4
add ebx,'0'
mov byte ptr ,bl

#define shouldbeunrolled 17
mov ecx,-shouldbeunrolled
_L0:
mov esi,eax;
mov edi,edx;
shld edx,eax,2;
shl eax,2;
add eax,esi;
adc edx,edi;
add eax,eax;
adc edx,edx;
mov ebx,edx;
and edx,0x0FFFFFFF;
shr ebx,32-4;
add ebx,'0';
mov byte ptr ,bl;
inc ecx
jnz _L0


mov byte ptr ,0
// -------------------------------
mov ebp,__saveregs[0*4]
mov esi,__saveregs[1*4]
mov esi,__saveregs[2*4]
mov ebx,__saveregs[3*4]
add esp,__locals
ret 8+4
}
}
Posted on 2009-03-29 11:54:30 by drizz