Test this one:


; Convert the unsigned byte in al to decimal ASCII without leading zeros,
; followed by a comma, using only one loop branch (the hundreds count).
;edi - pointer to string
; al  - value to convert (read by the code below; not listed in the original header)
; Registers written: eax, ecx, edx, and edi (edi is decremented by one when the
; value has a single digit).
;===============================
mov cl,2Fh                      ; cl = '0' - 1, pre-biased hundreds digit
@@: inc cl                      ; cl = '0' + number of times 100 has been subtracted without borrow
sub al,100
jnc @B                          ; repeat until the subtraction borrows
add al,100                      ; undo the last subtraction: al = value mod 100
cmp cl,31h                      ; CF = 1 when the hundreds digit is '0'
mov [edi],cl                    ; store hundreds digit (mov leaves CF intact)
sbb edx,edx                     ; edx = -1 if there are no hundreds, else 0
aam                             ; ah = tens, al = units
cmp ah,1                        ; CF = 1 when tens == 0
lea eax,[eax][3030h]            ; add '0' to both ah and al; lea does not modify flags
sbb ecx,ecx                     ; ecx = -1 if tens == 0, else 0
mov [edi][edx][1],ah            ; tens digit; lands on [edi] (over the '0') when there are no hundreds
and ecx,edx                     ; ecx = -1 only when hundreds and tens are both zero
add edi,ecx                     ; single-digit case: step edi back one byte
mov [edi][edx][2],al            ; units digit directly after the last digit written (or first, if alone)
mov byte ptr [edi][edx][3],","  ; comma directly after the units digit
;===============================
; lea edi,[edi][edx][4] - will load addr of next to "," byte
Posted on 2002-02-24 18:54:37 by The Svin
Nexo, maybe I should have used the term 'pre-decode' to stick with the terminology of the AMD documentation - as there are many stages to the pipeline. Sorry, if I was confusing. My analysis of the AMD pipeline is fuzzy in the middle - I understand the pre-decode and the OP execution, but my tests don't seem to fit with what the documentation states.
Posted on 2002-02-25 00:51:25 by bitRAKE
bitRAKE, I have made measurements for the code running in a loop. I measured 10 clock ticks of execution for my own code, and 11 for yours. The small difference between my measurements and yours comes from the differences in our measurement methods. The execution time of the looped code indicates the absence of the additional pipeline stages that were present in straight-line execution.
It is always possible to find explanations. ;)
Posted on 2002-02-25 02:56:32 by Nexo
Hi, hutch--.
As a second approximation, my code for your task looks like this:
; Convert an unsigned byte to a fixed 4-character field and store it with a
; single DWORD write.
;
; Input:  al  = number (0..255), edi = chars buffer (4 bytes are written)
; Output: [edi..edi+3] filled, edi advanced by 4
; Writes: eax, ebx, ecx, edx (ebx is NOT preserved)
;
; Field layout in memory (MASM character constants put the first character in
; the most significant byte):
;   3 digits: ',' h t u
;   2 digits: ' ' ',' t u
;   1 digit : '-' '-' ',' u
;
; The memory/address operands of the two lea instructions and the two stores
; were missing from the listing; they are reconstructed here from the
; surrounding arithmetic.
    movzx eax,al
    cmp eax,10
    jb @@1                      ; single digit: no division needed
    mov ecx,' ,'shl 16          ; two-digit prefix in the upper 16 bits
    cmp eax,100
    jb @@10
    xor ecx,ecx
    add eax,-200                ; CF = 1 iff number >= 200
    adc ecx,1                   ; ecx = hundreds digit (1 or 2)
    imul ebx,ecx,-100           ; ebx = -100 * hundreds
    add ecx,',0'                ; ecx = ',' : ('0' + hundreds)
    lea eax,[eax+ebx+200]       ; eax = number - 100 * hundreds (0..99)
    shl ecx,16                  ; move prefix into the upper 16 bits
@@10:
    mov edx,0CCCCCCCDh          ; reciprocal constant: (eax * 0CCCCCCCDh) >> 35 = eax / 10
    mov ebx,eax                 ; keep the 0..99 value
    mul edx
    shr edx,3                   ; edx = tens
    imul eax,edx,-10
    shl edx,8                   ; tens into bits 8..15
    add eax,ebx                 ; eax = units
    lea ecx,[ecx+edx+3030h]     ; merge prefix and tens, add '0' to tens and units bytes
    add eax,ecx                 ; eax = prefix : tens : units (most significant first)
    bswap eax                   ; reverse so the little-endian store emits prefix first
    mov [edi],eax
    add edi,4
    ret
@@1:
    shl eax,24                  ; digit into the most significant byte (stored last)
    ; NOTE(review): the two-digit path pads with a space but this literal pads
    ; with '-'; confirm which padding character is intended.
    add eax,'0,--'
    mov [edi],eax
    add edi,4
    ret

--,9 1 digit - 3 clocks
-,99 2 digits - 11 clocks
,199 3 digits - 18 clocks (optimized from 23)
Posted on 2002-02-25 02:57:27 by Nexo
Nexo,

I liked your suggestion on using a table as it removes the need to do the conversion on the fly. In the application I am not worried about 1k of table data in the code section so I coded up the first try of taking BYTE data as input and outputting formatted DB sequence data.

This version has not been close-range optimised, but it is reasonably tidy as is. It maintains DWORD-size writes, and I think I have kept the alignment correct, so the write speed should be OK.

Important with this algo is to supply a buffer to write the data to that is 5 times the size of the source BYTE data. There is no size checking on the buffer and it will produce a page write fault if the buffer is not big enough.

I have not done any benchmarking on it yet, as it is still in the test piece I wrote it in.

Regards,

hutch@movsd.com


; ########################################################################

; ------------------------------------------------------------------------
; AsciiDump - format a block of bytes as assembler "db" lines.
;
;   lpsrc : address of the source bytes
;   lpbuf : address of the output buffer
;   lnsrc : number of source bytes (0 is allowed)
;
; Output: "   ",CR,LF,"db " followed by one 4-byte table entry per source
; byte (right-justified decimal + ","). After every 17th entry on a line
; (the counter test below fires once ebx has reached 16) the trailing comma
; is replaced by CR and LF,"db " starts a new line. The final comma is
; replaced by CR,LF and two zero bytes.
;
; No bounds checking is done on lpbuf. The code writes 8 header bytes,
; 4 bytes per source byte, 4 extra bytes per line break and a final 4-byte
; store, so the caller must size the buffer accordingly.
;
; ebx, esi and edi are preserved; eax, ecx and edx are overwritten.
; ------------------------------------------------------------------------
AsciiDump proc lpsrc:DWORD,lpbuf:DWORD,lnsrc:DWORD

LOCAL count :DWORD

jmp @F
align 4
; 256 entries of exactly 4 bytes each so that [edx+eax*4] indexes the table.
; Entries 0..9 need TWO leading spaces to keep the 4-byte stride.
StringTable:
db "  0",44,"  1",44,"  2",44,"  3",44,"  4",44,"  5",44,"  6",44,"  7",44
db "  8",44,"  9",44," 10",44," 11",44," 12",44," 13",44," 14",44," 15",44
db " 16",44," 17",44," 18",44," 19",44," 20",44," 21",44," 22",44," 23",44
db " 24",44," 25",44," 26",44," 27",44," 28",44," 29",44," 30",44," 31",44
db " 32",44," 33",44," 34",44," 35",44," 36",44," 37",44," 38",44," 39",44
db " 40",44," 41",44," 42",44," 43",44," 44",44," 45",44," 46",44," 47",44
db " 48",44," 49",44," 50",44," 51",44," 52",44," 53",44," 54",44," 55",44
db " 56",44," 57",44," 58",44," 59",44," 60",44," 61",44," 62",44," 63",44
db " 64",44," 65",44," 66",44," 67",44," 68",44," 69",44," 70",44," 71",44
db " 72",44," 73",44," 74",44," 75",44," 76",44," 77",44," 78",44," 79",44
db " 80",44," 81",44," 82",44," 83",44," 84",44," 85",44," 86",44," 87",44
db " 88",44," 89",44," 90",44," 91",44," 92",44," 93",44," 94",44," 95",44
db " 96",44," 97",44," 98",44," 99",44,"100",44,"101",44,"102",44,"103",44
db "104",44,"105",44,"106",44,"107",44,"108",44,"109",44,"110",44,"111",44
db "112",44,"113",44,"114",44,"115",44,"116",44,"117",44,"118",44,"119",44
db "120",44,"121",44,"122",44,"123",44,"124",44,"125",44,"126",44,"127",44
db "128",44,"129",44,"130",44,"131",44,"132",44,"133",44,"134",44,"135",44
db "136",44,"137",44,"138",44,"139",44,"140",44,"141",44,"142",44,"143",44
db "144",44,"145",44,"146",44,"147",44,"148",44,"149",44,"150",44,"151",44
db "152",44,"153",44,"154",44,"155",44,"156",44,"157",44,"158",44,"159",44
db "160",44,"161",44,"162",44,"163",44,"164",44,"165",44,"166",44,"167",44
db "168",44,"169",44,"170",44,"171",44,"172",44,"173",44,"174",44,"175",44
db "176",44,"177",44,"178",44,"179",44,"180",44,"181",44,"182",44,"183",44
db "184",44,"185",44,"186",44,"187",44,"188",44,"189",44,"190",44,"191",44
db "192",44,"193",44,"194",44,"195",44,"196",44,"197",44,"198",44,"199",44
db "200",44,"201",44,"202",44,"203",44,"204",44,"205",44,"206",44,"207",44
db "208",44,"209",44,"210",44,"211",44,"212",44,"213",44,"214",44,"215",44
db "216",44,"217",44,"218",44,"219",44,"220",44,"221",44,"222",44,"223",44
db "224",44,"225",44,"226",44,"227",44,"228",44,"229",44,"230",44,"231",44
db "232",44,"233",44,"234",44,"235",44,"236",44,"237",44,"238",44,"239",44
db "240",44,"241",44,"242",44,"243",44,"244",44,"245",44,"246",44,"247",44
db "248",44,"249",44,"250",44,"251",44,"252",44,"253",44,"254",44,"255",44
@@:

push ebx
push esi
push edi
; ============================

lea edx, StringTable
xor ebx, ebx ; entries already written on the current line

mov eax, lpsrc
add eax, lnsrc
mov count, eax ; end-of-source address, used as the exit condition

mov esi, lpsrc
mov edi, lpbuf

mov DWORD PTR [edi], 0D202020h ; 3 space padding for alignment + CR
add edi, 4
mov DWORD PTR [edi], 2062640Ah ; LF + "db "
add edi, 4

cmp esi, count ; empty source: emit only the header and terminator
je The_Exit

xor eax, eax ; clear upper bytes so al can be used as a table index

; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@@:
mov al, [esi]
inc esi
mov ecx, [edx+eax*4] ; all table writes are DWORD size
mov [edi], ecx
add edi, 4

; Test the exit condition BEFORE the line-break test so that a source
; ending exactly on a line boundary terminates instead of running past
; the end of the source.
cmp esi, count
je The_Exit

cmp ebx, 16 ; line full?
je nxt1 ; jump on less common choice
inc ebx
jmp @B

nxt1:
mov BYTE PTR [edi-1], 13 ; overwrite comma with CR
mov DWORD PTR [edi], 2062640Ah ; LF + "db ", 4 bytes to maintain alignment
add edi, 4
xor ebx, ebx ; zero character count
jmp @B

; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The_Exit:

; only overwrite last character IF it is a comma ","

cmp BYTE PTR [edi-1], ","
jne @F
dec edi
@@:
mov DWORD PTR [edi], 00000A0Dh ; append CR, LF and two zero bytes

; ============================
pop edi
pop esi
pop ebx

ret

AsciiDump endp

; #########################################################################
Posted on 2002-02-25 05:31:06 by hutch--
Why you do not make the elementary sequential optimization of commands? The command " and edx, 0FFFF0000h " is absolutely not necessary.


Here is one good idea in your code, (it is not , of course, first two
lines) but in general it is not faster but slower in processors
it was written for.
It was written to use in Pplain and Pmmx without MMX commands
and performs 10-15% faster on them than the code you wrote.
As to MMX version, it's well written, I saw 3 or 4 versions using the algo with MMX after I released it.
I just wished to show you the algo, so you can use the idea, whether
with the usual integer commands, or MMX or whatever :)
Posted on 2002-02-25 06:56:29 by The Svin
Hutch--, it's hard to beat the table - even with MMX. :)
I only make a couple small changes:
; ------------------------------------------------------------------------
; AsciiDump - format a block of bytes as assembler "db" lines, 16 values
; per line.
;
;   lpsrc : address of the source bytes
;   lpbuf : address of the output buffer
;   lnsrc : number of source bytes (0 is allowed)
;
; Output: "   ",CR,LF,"db " followed by one 4-byte table entry per source
; byte. After every 16th entry the trailing comma is replaced by CR and
; LF,"db " starts a new line. The final comma is replaced by CR,LF and two
; zero bytes.
;
; No bounds checking is done on lpbuf.
; ebx, esi and edi are preserved; eax, ecx and edx are overwritten.
; ------------------------------------------------------------------------
AsciiDump proc lpsrc:DWORD,lpbuf:DWORD,lnsrc:DWORD


jmp @F
align 4
StringTable:
; {clipped} - must hold 256 entries of 4 bytes each (right-justified decimal
; value followed by ","), because the lookup below uses [edx+eax*4]
@@:

push ebx
push esi
push edi
; ============================

lea edx, StringTable
xor ecx, ecx ; byte counter for source

mov esi, lpsrc
mov edi, lpbuf

mov DWORD PTR [edi], 0D202020h ; 3 space padding for alignment + CR
add edi, 4
mov DWORD PTR [edi], 2062640Ah ; LF + "db "
add edi, 4

cmp ecx, lnsrc ; empty source: emit only the header and terminator
je The_Exit

xor eax, eax ; clear upper bytes so al can be used as a table index

; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@@:
mov al, [esi+ecx]
inc ecx
mov ebx, [edx+eax*4] ; all table writes are DWORD size
mov [edi], ebx
add edi, 4

; Test the exit condition BEFORE the line-break test so that a length
; that is a multiple of 16 terminates instead of running past the end
; of the source.
cmp ecx, lnsrc
je The_Exit

test ecx, 0Fh ; 16 entries written on this line?
jne @B ; common case: keep filling the line

nxt1:
mov BYTE PTR [edi-1], 13 ; overwrite comma with CR
mov DWORD PTR [edi], 2062640Ah ; LF + "db ", 4 bytes to maintain alignment
add edi, 4
jmp @B

; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The_Exit:

; only overwrite last character IF it is a comma ","

cmp BYTE PTR [edi-1], ","
jne @F
dec edi
@@:
mov DWORD PTR [edi], 00000A0Dh ; append CR, LF and two zero bytes

; ============================
pop edi
pop esi
pop ebx

ret

AsciiDump endp
Your version was easier to change the bytes per line.
Posted on 2002-02-25 08:36:02 by bitRAKE