f0dder, :)
I tested new Dword2Hex algos on my P4 3.6 ghz
Pls, if you have time test for me 2 files dword2h.exe and
bench.exe on your new box and post results
Thanks in advance
Regards,
Lingo
I tested new Dword2Hex algos on my P4 3.6 ghz
1906 hutch
1469 lingo
2781 BiteRider
1906 hutch
1469 lingo
2766 BiteRider
1906 hutch
1469 lingo
2735 BiteRider
1922 hutch
1468 lingo
2750 BiteRider
1906 hutch
1485 lingo
2765 BiteRider
1906 hutch
1469 lingo
2750 BiteRider
1906 hutch
1485 lingo
2812 BiteRider
1906 hutch
1485 lingo
2781 BiteRider
-------
Results
-------
1908 hutch average
1474 lingo average
2767 BiteRider average
Dword2Hex Tests:
Dword2Hex1: 24 clocks; Result: 1C2B3A4B
Dword2Hex : 27 clocks; Result: 1C2B3A4B
DwH->Jeff : 168 clocks; Result: 1C2B3A4B
d2H-lingo : 16 clocks; Result: 1C2B3A4B
dw2hex_ex: 38 clocks; Result: 1C2B3A4B
Press ENTER to exit...
Pls, if you have time test for me 2 files dword2h.exe and
bench.exe on your new box and post results
Thanks in advance
Regards,
Lingo
I'll do later - have to celebrate my birthday and stuff first =)
Here is a variant of BiteRider's code; the 512-byte table is not necessary.
f0dder : happy birthday!
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
;-----------------------------------------------------------------------
; DwToHex - convert a DWORD to 8 uppercase hex digits plus a terminating 0
;           without any lookup table.
; ABI:   32-bit stdcall, no stack frame (arguments are read through ESP)
; In:    pBuffer = destination, at least 9 bytes
;        dwValue = value to convert
; Out:   buffer holds the digits, most significant first; EAX = pBuffer
; Clobb: ECX, EDX, flags (EBX is saved and restored)
;
; Method: the low and the high nibble of every byte are processed as two
; packed dwords. For a nibble n, bit 4 of (n + 6) is set exactly when
; n >= 10, which gives a 0/1 "is a letter" flag f per byte, and the
; character is n + '0' + 7*f, computed as n + 8*f + 30h - f.
;
; The PROC and ENDP names originally differed in case (DwtoHex/DwToHex);
; they are unified so the block also assembles with OPTION CASEMAP:NONE.
;-----------------------------------------------------------------------
DwToHex proc stdcall public pBuffer:DWORD,dwValue:DWORD
        mov     ecx,[esp+2*4]                   ; dwValue
        mov     edx,ecx
        shr     edx,4
        push    ebx                             ; from here on the arguments sit 4 bytes higher
        and     ecx,0F0F0F0Fh                   ; ecx = low nibble of each byte
        and     edx,0F0F0F0Fh                   ; edx = high nibble of each byte
        lea     ebx,[ecx+06060606h]             ; n + 6 sets bit 4 when n >= 10
        lea     eax,[edx+06060606h]
        and     ebx,10101010h
        and     eax,10101010h
        shr     ebx,4                           ; f = 1 per byte that needs a letter
        shr     eax,4
        lea     ecx,[ecx+ebx*8+30303030h]       ; n + 8*f + '0'
        lea     edx,[edx+eax*8+30303030h]
        sub     ecx,ebx                         ; ... - f  =>  n + 7*f + '0'
        sub     edx,eax
        mov     eax,[esp+4+1*4]                 ; pBuffer (+4 for the saved EBX)
        mov     byte ptr [eax+8],0              ; string terminator
        mov     [eax+7],cl                      ; byte 0, low digit
        mov     [eax+6],dl                      ; byte 0, high digit
        mov     [eax+5],ch                      ; byte 1, low digit
        mov     [eax+4],dh                      ; byte 1, high digit
        shr     ecx,16
        pop     ebx
        shr     edx,16
        mov     [eax+3],cl                      ; byte 2, low digit
        mov     [eax+2],dl                      ; byte 2, high digit
        mov     [eax+1],ch                      ; byte 3, low digit
        mov     [eax+0],dh                      ; byte 3, high digit
        ret     2*4
DwToHex endp
OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF
f0dder : happy birthday!
Interesting non LUT method
I rewrote the code using SSE.
It's not optimized but it should still run faster.
I rewrote the code using SSE.
It's not optimized but it should still run faster.
;-----------------------------------------------------------------------
; D2HSSE32 - DWORD -> 8 uppercase hex digits with SSE2 integer instructions
;            (flat assembler syntax).
; In:    [esp+4] = output buffer; exactly 8 bytes are written and no
;                  terminating zero is stored
;        [esp+8] = value to convert
; Out:   arguments are popped by the routine (ret 8)
; Clobb: eax, edx, xmm0, xmm1
; Note:  the three constants below are read as aligned 16-byte operands,
;        so the "align 16" in front of them is required.
;-----------------------------------------------------------------------
D2HSSE32:
        mov     eax, dword [esp+8]              ; value
        mov     edx, dword [esp+4]              ; output buffer
        movd    xmm1, eax
        pxor    xmm0, xmm0
        punpcklbw xmm1, xmm0                    ; words: 00 b4 | 00 b3 | 00 b2 | 00 b1
        movdqa  xmm0, xmm1
        psllw   xmm0, 12                        ; keep only the low nibble of each byte ...
        psrlw   xmm1, 4                         ; word = high nibble (goes to the low byte)
        psrlw   xmm0, 4                         ; ... and park it in the high byte of the word
        por     xmm0, xmm1                      ; per word in memory: high nibble, low nibble
        pshuflw xmm0, xmm0, 00011011b           ; reverse the 4 words: most significant byte first
        movdqa  xmm1, xmm0                      ; xmm1 = nibble values n
        paddb   xmm0, dqword [addb6]            ; n + 6 sets bit 4 when n >= 10
        pand    xmm0, dqword [and16]
        psrlw   xmm0, 1                         ; 10h -> 08h in every "letter" byte
        paddb   xmm1, xmm0                      ; n + 8*f
        psrlw   xmm0, 3                         ; 08h -> 01h
        paddb   xmm1, dqword [addb48]           ; + '0'
        psubb   xmm1, xmm0                      ; - f   =>  n + 7*f + '0'
        movq    qword [edx], xmm1               ; store the 8 characters
        ret     8

align 16
addb6   dq 0606060606060606h,0606060606060606h
and16   dq 1010101010101010h,1010101010101010h
addb48  dq 3030303030303030h,3030303030303030h
r22,
Isn't this shorter? Ratch
Isn't this shorter? Ratch
.686
;.MMX
;.K3D
;.XMM
.MODEL FLAT,STDCALL
; option casemap :none ; case sensitive
;.LISTMACROALL
.NOLIST
include \masm32\include\windows.inc
include \masm32\include\user32.inc
include \masm32\include\kernel32.inc
include \masm32\include\gdi32.inc
include \masm32\include\advapi32.inc
.LIST
includelib \masm32\lib\user32.lib
includelib \masm32\lib\kernel32.lib
includelib \masm32\lib\gdi32.lib
includelib \masm32\lib\advapi32.lib
; Shorthand text equates for operand-size overrides.
W EQU WORD PTR
B EQU BYTE PTR
.DATA?
; Output buffer for the hex dump; uninitialized, 120 bytes.
szBuffer BYTE 120 DUP (?)
.DATA
; Sample input: 36 bytes, no terminator (its length is taken with SIZEOF).
STRING1 BYTE 'ABCDEFGHIJKLMNOPQRSTUVWSXZ1234567890'
; Zero-terminated caption passed to MessageBox.
TEXT1 BYTE 'HEX DUMP',0
.CODE
;-----------------------------------------------------------------------
; BTH - "bytes to hex": writes every input byte as two uppercase hex
;       digits followed by a space, and zero-terminates the result.
; ABI:   32-bit, arguments on the stack, callee pops them (12 bytes)
; In:    pushed in this order: STRING (input bytes), BUFFER (output),
;        STRSIZ (number of input bytes)
; Out:   BUFFER holds 3*STRSIZ characters plus a terminating zero, so it
;        must provide at least 3*STRSIZ+1 bytes
; Clobb: EAX, ECX, flags (ESI and EDI are saved and restored)
;
; BTH$ describes the stack as seen after the two register pushes; the
; markers BTH$1/BTH$2 bracket the arguments so RET can pop exactly them.
;-----------------------------------------------------------------------
BTH:
BTH$ STRUC
EDISAV DWORD ?
ESISAV DWORD ?
RETURN DWORD ?
BTH$1 = $
STRSIZ DWORD ?
BUFFER DWORD ?
STRING DWORD ?
BTH$2 = $
BTH$ ENDS
.DATA
.CODE
        PUSH    ESI
        PUSH    EDI
        MOV     ESI,(BTH$ PTR [ESP]).STRING     ; input pointer
        MOV     EDI,(BTH$ PTR [ESP]).BUFFER     ; output pointer
        MOV     ECX,(BTH$ PTR [ESP]).STRSIZ     ; bytes left to convert
        MOV     BYTE PTR [EDI],0                ; a zero count yields an empty string ...
        TEST    ECX,ECX
        JZ      BTH_DONE                        ; ... instead of wrapping the counter
        .REPEAT
          MOVZX   EAX,BYTE PTR [ESI]
          SHL     EAX,4                         ; AH = high nibble, AL = low nibble << 4
          INC     ESI                           ; bump up the input pointer
          SHR     AL,4                          ; AL = low nibble
          CMP     AL,10                         ; Norbert Juffa's 3-instruction nibble -> ASCII
          SBB     AL,069H
          DAS
          ROR     AX,8                          ; swap: AL = high nibble, AH = low digit
          CMP     AL,10                         ; same conversion for the high nibble
          SBB     AL,069H
          DAS
          OR      EAX,' ' SHL 16                ; third byte = space, fourth byte stays zero
          MOV     [EDI],EAX                     ; the top byte of EAX doubles as the terminator
          ADD     EDI,WORD+BYTE                 ; advance by 3; the next store overwrites the zero
          DEC     ECX
        .UNTIL ZERO?
BTH_DONE:
        POP     EDI
        POP     ESI
        RET     BTH$2-BTH$1                     ; return to sender, popping the 3 arguments
; Program entry: hex-dump STRING1 into szBuffer, show it in a message box
; and terminate with exit code 0.
MAIN:
        ; BTH expects STRING, BUFFER, STRSIZ pushed in this order and
        ; removes them from the stack itself.
        PUSH    OFFSET STRING1
        PUSH    OFFSET szBuffer
        PUSH    SIZEOF STRING1
        CALL    BTH

        ; MessageBox(hWnd = 0, text = szBuffer, caption = TEXT1, type = 0)
        PUSH    0
        PUSH    OFFSET TEXT1
        PUSH    OFFSET szBuffer
        PUSH    0
        CALL    MessageBox

        ; ExitProcess(0)
        PUSH    0
        CALL    ExitProcess
END MAIN
Ratch, your code is shorter instruction-wise, but that no longer holds once you fully unroll the loop.
But it's slower executing because of the cmp and jmp.
You can always benchmark the two functions and see for sure.
It may run faster for a 1byte conversion but not a full dword.
But it's slower executing because of the cmp and jmp.
You can always benchmark the two functions and see for sure.
It may run faster for a 1byte conversion but not a full dword.
and here is my plain PMMX code ;)
Here is unrolled Ratch's code
Dword2Hex Tests:
sorry r22 i dont have SSE
;-----------------------------------------------------------------------
; _DwToHexMMX - DWORD -> 8 uppercase hex digits plus terminating 0 (MMX)
; ABI:   32-bit stdcall, no stack frame (arguments are read through ESP)
; In:    dwValue = value to convert
;        pBuffer = destination, at least 9 bytes
; Out:   EAX = pBuffer
; Clobb: EDX, MM0, MM1
; Note:  no EMMS is executed here; a caller that uses the x87 FPU
;        afterwards has to issue EMMS itself.
;-----------------------------------------------------------------------
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
.data
align 8
msk0F dq 00F0F0F0F0F0F0F0Fh                     ; isolates one nibble per byte
gtb09 dq 00909090909090909h                     ; digits above 9 become letters
and07 dq 00707070707070707h                     ; 'A' - ('9' + 1) = 7
add30 dq 03030303030303030h                     ; '0'
.code
align 16
_DwToHexMMX proc stdcall public dwValue:DWORD,pBuffer:DWORD
        mov     eax,[esp+2*4]                   ; pBuffer
        mov     edx,[esp+1*4]                   ; dwValue
        mov     byte ptr [eax+8],0              ; string terminator
        bswap   edx                             ; most significant byte first
        movd    mm0,edx
        pxor    mm1,mm1
        punpcklbw mm1,mm0                       ; each word = value byte << 8
        movq    mm0,mm1
        psrlq   mm0,12                          ; high nibble lands in the word's low byte
        pand    mm1,msk0F                       ; low nibble stays in the word's high byte
        pand    mm0,msk0F
        por     mm0,mm1                         ; per word in memory: high nibble, low nibble
        movq    mm1,mm0
        pcmpgtb mm0,gtb09                       ; FFh where the nibble is 10..15
        pand    mm0,and07                       ; 7 for letters, 0 for digits
        paddb   mm0,mm1
        paddb   mm0,add30                       ; + '0'
        movq    [eax],mm0                       ; store the 8 characters
        ret     2*4
_DwToHexMMX endp
OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF
Here is unrolled Ratch's code
;-----------------------------------------------------------------------
; Ratch - Ratch's DAS-based byte-to-hex conversion, unrolled for one DWORD
; ABI:   32-bit, no stack frame (arguments are read through ESP),
;        callee pops 8 bytes
; In:    dwValue  = value to convert
;        lpBuffer = destination; every step stores a full DWORD (2 digits
;                   and 2 zero bytes), so the last store reaches offset 9
;                   and the buffer must provide at least 10 bytes
; Out:   8 uppercase hex digits, most significant first, zero-terminated
; Clobb: EAX, EDX, flags
;-----------------------------------------------------------------------
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
align 16
Ratch proc dwValue:DWORD, lpBuffer:DWORD
        MOV     EDX,[ESP+2*4]                   ; lpBuffer
xi = 0
REPEAT 4
        MOVZX   EAX,BYTE PTR [ESP+1*4+3-xi]     ; bytes of dwValue, most significant first
        SHL     EAX,4                           ; AH = high nibble, AL = low nibble << 4
        SHR     AL,4                            ; AL = low nibble
        CMP     AL,10                           ; Norbert Juffa's 3-instruction nibble -> ASCII
        SBB     AL,069H
        DAS
        XCHG    AL,AH                           ; AL = high nibble, AH = low digit
        CMP     AL,10                           ; same conversion for the high nibble
        SBB     AL,069H
        DAS
        MOV     [EDX+xi*2],EAX                  ; 2 digits; the upper zero bytes act as terminator
xi = xi + 1
ENDM
        RET     2*4
Ratch endp
OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF
Dword2Hex Tests:
P2MMX533
Dword2Hex1: 32 clocks; Result: 1C2B3A4B
BiteRider : 32 clocks; Result: 1C2B3A4B
DwH->Jeff : 95 clocks; Result: 1C2B3A4B
d2H-lingo : 32 clocks; Result: 1C2B3A4B
dw2hex_ex : 26 clocks; Result: 1C2B3A4B
d2H-drizz : 21 clocks; Result: 1C2B3A4B
MMX-drizz : 17 clocks; Result: 1C2B3A4B
Ratch : 90 clocks; Result: 1C2B3A4B
sorry r22 i dont have SSE
Happy birthday f0dder!
drizz,
very nice
r22,
Because XMM registers are wider than MMX registers, some instructions take much longer to complete on XMM registers even though they use the same execution units. For example, the instruction paddq takes six clock cycles on XMM registers but only two clock cycles on MMX registers.
Here is the times on my P4 3.6 GHz with WinXP SP2:
Regards,
Lingo
drizz,
very nice
r22,
Because XMM registers are wider than MMX registers, some instructions take much longer to complete on XMM registers even though they use the same execution units. For example, the instruction paddq takes six clock cycles on XMM registers but only two clock cycles on MMX registers.
Here is the times on my P4 3.6 GHz with WinXP SP2:
Dword2Hex Tests:
Dword2Hex : 26 clocks; Result: 1C2B3A4B
dw2hex_ex : 38 clocks; Result: 1C2B3A4B
d2H-lingo : 15 clocks; Result: 1C2B3A4B
MMX_Lingo : 15 clocks; Result: 1C2B3A4B
dw2D - r22: 27 clocks; Result: 1C2B3A4B
dw2D-drizz: 19 clocks; Result: 1C2B3A4B
Press ENTER to exit...
Regards,
Lingo
Ok, here is my last try, a bit faster than my last mmx routine.
** added a bonus qword 2 hex routine :) **
** added a bonus qword 2 hex routine :) **
;-----------------------------------------------------------------------
; DwToHexMMX2 - DWORD -> 8 uppercase hex digits plus terminating 0.
;               The nibbles are split in integer registers first, so the
;               MMX part only interleaves them and converts to ASCII.
; ABI:   32-bit stdcall, no stack frame (arguments are read through ESP)
; In:    dwValue = value to convert
;        pBuffer = destination, at least 9 bytes
; Out:   EAX = pBuffer
; Clobb: ECX, EDX, MM0, MM1, flags
; Note:  no EMMS is executed here; a caller that uses the x87 FPU
;        afterwards has to issue EMMS itself.
;-----------------------------------------------------------------------
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
.data
align 8
msk0F dq 00F0F0F0F0F0F0F0Fh                     ; isolates one nibble per byte
gtb09 dq 00909090909090909h                     ; digits above 9 become letters
and07 dq 00707070707070707h                     ; 'A' - ('9' + 1) = 7
add30 dq 03030303030303030h                     ; '0'
.code
align 16
DwToHexMMX2 proc stdcall public dwValue:DWORD,pBuffer:DWORD
        mov     eax,[esp+2*4]                   ; pBuffer
        mov     edx,[esp+1*4]                   ; dwValue
        bswap   edx                             ; most significant byte first
        mov     ecx,edx
        and     edx,0f0f0f0f0h
        shr     edx,4                           ; edx = high nibble of each byte
        mov     byte ptr [eax+8],0              ; string terminator
        and     ecx,00f0f0f0fh                  ; ecx = low nibble of each byte
        movd    mm0,edx
        movd    mm1,ecx
        punpcklbw mm0,mm1                       ; interleave: high, low, high, low, ...
        movq    mm1,mm0
        pcmpgtb mm1,gtb09                       ; FFh where the nibble is 10..15
        pand    mm1,and07                       ; 7 for letters, 0 for digits
        paddb   mm0,add30                       ; + '0'
        paddb   mm0,mm1
        movq    [eax],mm0                       ; store the 8 characters
        ret     2*4
DwToHexMMX2 endp
;-----------------------------------------------------------------------
; QwToHexMMX - QWORD -> 16 uppercase hex digits plus terminating 0 (MMX)
; ABI:   32-bit stdcall, no stack frame (arguments are read through ESP;
;        the enclosing OPTION PROLOGUE:NONE must be in effect)
; In:    qwValue = value to convert (low dword at the lower address)
;        pBuffer = destination, at least 17 bytes
; Out:   EAX = pBuffer
; Clobb: ECX, EDX, MM0-MM3
; Uses:  the qword constants msk0F, gtb09, and07, add30
; Note:  no EMMS is executed here; a caller that uses the x87 FPU
;        afterwards has to issue EMMS itself.
;-----------------------------------------------------------------------
align 16
QwToHexMMX proc stdcall public qwValue:QWORD,pBuffer:DWORD
        mov     edx,[esp+1*4][0]                ; low dword of qwValue
        mov     ecx,[esp+1*4][4]                ; high dword of qwValue
        mov     eax,[esp+3*4]                   ; pBuffer
        bswap   ecx                             ; most significant byte first within each half
        bswap   edx
        movd    mm0,ecx
        movd    mm1,edx
        psllq   mm0,32
        por     mm0,mm1                         ; mm0 = swapped high half : swapped low half
        movq    mm1,mm0
        psrlq   mm0,4
        pand    mm0,msk0F                       ; mm0 = high nibble of each byte
        pand    mm1,msk0F                       ; mm1 = low nibble of each byte
        movq    mm2,mm0
        punpcklbw mm2,mm1                       ; mm2 = nibbles of the low dword, in print order
        punpckhbw mm0,mm1                       ; mm0 = nibbles of the high dword, in print order
        movq    mm3,mm2
        movq    mm1,mm0
        pcmpgtb mm2,gtb09                       ; FFh where the nibble is 10..15
        pcmpgtb mm0,gtb09
        pand    mm2,and07                       ; 7 for letters, 0 for digits
        pand    mm0,and07
        paddb   mm2,add30                       ; + '0'
        paddb   mm0,add30
        paddb   mm2,mm3
        paddb   mm0,mm1
        movq    [eax][0],mm0                    ; digits of the high dword come first
        movq    [eax][8],mm2                    ; then the digits of the low dword
        mov     byte ptr [eax+16],0             ; string terminator
        ret     3*4
QwToHexMMX endp
OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF
I was going to use SSE to make a qw2hex, but the "add 6, AND with 16, shift, add and subtract" method obviously wasn't the most efficient to begin with.
Here's the fastest method.
A roughly 260 KB LUT that takes words and turns them into dword strings.
I threw together some code to create a LUT that big in the data section, then I benchmarked the little lookup function.
Even with the giant LUT it only beat Lingo's MMX code by a clock or so (100 ms faster over 7FFFFFFh calls).
For all those who care nothing for saving memory.
Someone come up with a legit reason to use a 260 KB LUT for a formatting function, quick, because I can't think of one :D
Here's the fastest method.
260ish KB LUT that takes words and turns them into dword strings.
I threw together some code to create a LUT that big in the data section then I benchmakred the little look up function.
Even with the giant LUT it only beat LINGO's MMX code by a clock or so (100ms faster over 7FFFFFFh calls).
For all those who care nothing for saving memory.
; D2H_LUT - table-driven DWORD -> hex conversion (flat assembler syntax).
; Arguments per the author's note: [esp+4] = buffer, [esp+8] = value;
; the routine pops both itself (ret 8).
; NOTE(review): every memory operand in this routine is missing from the
; text as captured here (each "dword" is followed by nothing), so it does
; not assemble as shown. The comments below state what the surviving
; register usage suggests; the name of the lookup table cannot be
; recovered from this listing - confirm against the original post.
D2H_LUT: ;;;esp+4 buffer ;;;esp+8 value
mov eax, ; presumably the value from [esp+8] (it is split into words below) - confirm
mov edx, ; presumably the buffer pointer from [esp+4] - confirm
movzx ecx,ax ; ecx = low word of eax
shr eax,16 ; eax = high word of eax
mov ecx,dword ; presumably the 4-character table entry for the low word - confirm
mov eax,dword ; presumably the 4-character table entry for the high word - confirm
mov dword,ecx ; presumably stored to the second half of the buffer - confirm
mov dword,eax ; presumably stored to the first half of the buffer - confirm
ret 8
;-----------------------------------------------------------------------
; MakeD2HLUT - fill a word -> 4-character hex lookup table
;              (flat assembler syntax).
; In:    [esp+4] = address of the table; 65536 entries of 4 bytes each
;                  (262144 bytes) are written
; Out:   entry w (at table + w*4) holds the 4 uppercase hex digits of w,
;        most significant digit first; the argument is popped (ret 4)
; Clobb: eax, ecx, edx, flags
;
; The table is filled in four passes, one per digit position. In every
; pass CX counts the entries and the 16-bit carry ends the pass after
; entry 0FFFFh; AL holds the current digit character and is advanced
; each time the digit of that position changes.
;-----------------------------------------------------------------------
MakeD2HLUT:
        mov     edx, [esp+4]                    ; table base
        xor     ecx, ecx
        mov     al, 48                          ; '0'
.LP1:                                           ; pass 1: digit 0 (bits 15..12)
        mov     byte [edx+ecx*4], al
        add     cx, 1
        jc      .next1
        test    ecx, 0FFFh                      ; digit changes every 1000h entries
        jne     .LP1
        add     al, 1
        cmp     al, 58                          ; past '9' ...
        jne     .LP1
        add     al, 7                           ; ... continue with 'A'
        jmp     .LP1
.next1:
;---------
        xor     ecx, ecx
        mov     al, 48
.LP2:                                           ; pass 2: digit 1 (bits 11..8)
        mov     byte [edx+ecx*4+1], al
        add     cx, 1
        jc      .next2
        test    ecx, 0FFh                       ; digit changes every 100h entries
        jne     .LP2
        add     al, 1
        cmp     al, 71                          ; past 'F': wrap to '0'
        jne     .skp2
        mov     al, 48
.skp2:
        cmp     al, 58
        jne     .LP2
        add     al, 7
        jmp     .LP2
.next2:
;----------
        xor     ecx, ecx
        mov     al, 48
.LP3:                                           ; pass 3: digit 2 (bits 7..4)
        mov     byte [edx+ecx*4+2], al
        add     cx, 1
        jc      .next3
        test    ecx, 0Fh                        ; digit changes every 10h entries
        jne     .LP3
        add     al, 1
        cmp     al, 71
        jne     .skp3
        mov     al, 48
.skp3:
        cmp     al, 58
        jne     .LP3
        add     al, 7
        jmp     .LP3
.next3:
;----------
        xor     ecx, ecx
        mov     al, 48
.LP4:                                           ; pass 4: digit 3 (bits 3..0), changes every entry
        mov     byte [edx+ecx*4+3], al
        add     cx, 1
        jc      .next4
        add     al, 1
        cmp     al, 71
        jne     .skp4
        mov     al, 48
.skp4:
        cmp     al, 58
        jne     .LP4
        add     al, 7
        jmp     .LP4
.next4:
        ret     4
Someone come up with a legit reason to use a 260kb LUT for a formating function quick, because I can't think of one :D
drizz, :)
"Ok, here is my last try, a bit faster than my last mmx routine."
Nice, but I have two faster MMX variants too... :)
The times are:
Regards,
Lingo
"Ok, here is my last try, a bit faster than my last mmx routine."
Nice, but I have two faster MMX variants too... :)
The times are:
Dword2Hex Tests:
d2h32 -lingo : 15 clocks; Result: 1C2B3A4B
d2hMMX1-lingo : 12 clocks; Result: 1C2B3A4B
d2hMMX2-lingo : 12 clocks; Result: 1C2B3A4B
d2hMMX1-drizz : 19 clocks; Result: 1C2B3A4B
d2hMMX2-drizz : 17 clocks; Result: 1C2B3A4B
Press ENTER to exit...
Regards,
Lingo
lingo, :), well, what is 1 clock compared to infinity :)
Regards
d2h32 -lingo : 31 clocks; Result: 1C2B3A4B
d2hMMX1-lingo : 13 clocks; Result: 1C2B3A4B
d2hMMX2-lingo : 14 clocks; Result: 1C2B3A4B
d2hMMX1-drizz : 16 clocks; Result: 1C2B3A4B
d2hMMX2-drizz : 13 clocks; Result: 1C2B3A4B
Regards
I don't think this is the fastest, but it is the simplest.
; Inline DWORD -> hex conversion through a 512-byte table (a code
; fragment, not a procedure).
; In:    ecx = value to convert
;        ebx = 0 (only bl is rewritten, so ebx stays a clean byte index)
; Out:   edx = digits of the low word, eax = digits of the high word
;        Each table word holds the first digit in its low byte, so within
;        edx the low word is the digit pair of value byte 0 and the high
;        word is the digit pair of value byte 1 (eax likewise for bytes 2
;        and 3).
;        NOTE(review): stored to memory as written, each register gives
;        its two digit pairs in swapped order; swap the ch/cl loads if a
;        directly storable string is wanted - confirm the intended layout.
; Clobb: ecx, bl, flags
;
; Fixes: the shifts that make room for the second digit pair must be
; 32-bit (a 16-bit "shl dx,16" just clears dx), and "shr ecx" needs its
; count of 16.
        mov     bl, ch
        mov     dx, word ptr [tb+ebx*2]         ; digit pair of byte 1
        shl     edx, 16                         ; move it to the upper half
        mov     bl, cl
        mov     dx, word ptr [tb+ebx*2]         ; digit pair of byte 0
        shr     ecx, 16                         ; bring the high word down into cx
        mov     bl, ch
        mov     ax, word ptr [tb+ebx*2]         ; digit pair of byte 3
        shl     eax, 16
        mov     bl, cl
        mov     ax, word ptr [tb+ebx*2]         ; digit pair of byte 2
;done
; tb: two ASCII hex digits for every byte value 00h..FFh (256 * 2 bytes).
tb:
db "000102030405060708090A0B0C0D0E0F"
db "101112131415161718191A1B1C1D1E1F"
db "202122232425262728292A2B2C2D2E2F"
db "303132333435363738393A3B3C3D3E3F"
db "404142434445464748494A4B4C4D4E4F"
db "505152535455565758595A5B5C5D5E5F"
db "606162636465666768696A6B6C6D6E6F"
db "707172737475767778797A7B7C7D7E7F"
db "808182838485868788898A8B8C8D8E8F"
db "909192939495969798999A9B9C9D9E9F"
db "A0A1A2A3A4A5A6A7A8A9AAABACADAEAF"
db "B0B1B2B3B4B5B6B7B8B9BABBBCBDBEBF"
db "C0C1C2C3C4C5C6C7C8C9CACBCCCDCECF"
db "D0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF"
db "E0E1E2E3E4E5E6E7E8E9EAEBECEDEEEF"
db "F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF"
Been pretty busy celebrating birthday (and hangovers), visiting family, and assembling a new box... when I get back from work tomorrow, I should be able to run the code on an AMD64x2 4400+ :D
Lingo, I couldn't compile your file; it needs the 'timers.asm' macro file. I will search for it on the forum. Execution ends when r22's routine is reached (I've got an AMD Duron, not a P4), but I saw this on my screen before the crash:
And your last posted file showed that:
f0dder, after that, you will be able of starting to program using VB environment.
:D
Dword2Hex : 18 clocks; Result: 1C2B3A4B
dw2hex_ex : 30 clocks; Result: 1C2B3A4B
d2H-lingo : 25 clocks; Result: 1C2B3A4B
MMX_Lingo : 9 clocks; Result: 1C2B3A4B
And your last posted file showed that:
word2Hex1: 23 clocks; Result: 1C2B3A4B
word2Hex : 23 clocks; Result: 1C2B3A4B
wH->Jeff : 84 clocks; Result: 1C2B3A4B
2H-lingo : 26 clocks; Result: 1C2B3A4B
w2hex_ex : 26 clocks; Result: 1C2B3A4B
I should be able to run the code on an AMD64x2 4400+
f0dder, after that, you will be able of starting to program using VB environment.
:D
Hehe Kecol... I still have a P4 2.53, P4-cel-1.7, P3-1300 (linux) and a PII-350 to test stuff on... so I won't revert to sloppy code. It's just nice having a powerful machine to develop on.
Eduardo,
"but is the simpliest"
I don't understand why you think
your part of code ( I assume you are lazy to post
the working proc) is simplest ...
and simplest for whom: for you, for the people
without assembly knowledge or for other assembly coders
here...
You can learn more about "simplest" Dword2Hex code here:
http://www.masmforum.com/simple/index.php?topic=2974.0
and here:
http://www.asmcommunity.net/board/index.php?topic=3789.0
Kecol,
Thank you for testing. :)
You can get MASM32 with all files and macros (incl. timers.asm) here:
www.masm32.com
"...on my screen before crashing:"
to avoid crash just try to update your CPU (as f0dder did) :)
f0dder,
"It's just nice having a powerful machine to develop on .."
.. and to test my fast algos too. :)
Regards,
Lingo
"but is the simpliest"
I don't understand why you think
your part of code ( I assume you are lazy to post
the working proc) is simplest ...
and simplest for whom: for you, for the people
without assembly knowledge or for other assembly coders
here...
You can learn more about "simplest" Dword2Hex code here:
http://www.masmforum.com/simple/index.php?topic=2974.0
and here:
http://www.asmcommunity.net/board/index.php?topic=3789.0
Kecol,
Thank you for testing. :)
You can get MASM32 with all files and macros (incl. timers.asm) here:
www.masm32.com
"...on my screen before crashing:"
to avoid crash just try to update your CPU (as f0dder did) :)
f0dder,
"It's just nice having a powerful machine to develop on .."
.. and to test my fast algos too. :)
Regards,
Lingo
lingo, the working proc is a little different:
As you can see, it transforms an array of 2 or more bytes into a hex string. The part I posted earlier just takes a dword in ecx and puts the result in eax:edx; it isn't a proc, only a "part of code" that you can put in a macro or something. I don't see any reason to write a proc whose call takes more time to execute than the code itself.
Simplest, for me, means something you don't spend more than five minutes writing because it is easy.
Anyway, your MMX code takes only 8 clocks on my XP 1800+.
;-----------------------------------------------------------------------
; bin2hex2 - convert an array of bytes to uppercase hex text through a
;            512-byte table of digit pairs.
; In:    src = input bytes
;        dst = output buffer, 2*len bytes are written (no terminator)
;        len = number of input bytes; 0 writes nothing
; Clobb: EAX, ECX, EDX, flags (EBX, EDI, ESI are preserved via USES)
;
; The addressing operands of the original loop were lost in the captured
; text, so the loop is rewritten here: an odd leading byte is converted on
; its own, then the rest is handled two bytes per iteration. This also
; covers len = 0 and len = 1, for which the visible "shr ecx,1 ... dec
; ecx / jnz" sequence would have wrapped the counter.
; The USES list is written space-separated, as MASM requires.
;-----------------------------------------------------------------------
bin2hex2 proc uses ebx edi esi, src:DWORD, dst:DWORD, len:DWORD
        ;with a 512 bytes ascii table
        mov     edi, dst
        mov     esi, src
        mov     ecx, len
        xor     ebx, ebx                        ; ebx = byte index, only bl is rewritten
        test    ecx, 1
        jz      pairs
        mov     bl, [esi]                       ; odd length: leading byte alone
        inc     esi
        mov     dx, word ptr [tb+ebx*2]
        mov     [edi], dx
        add     edi, 2
pairs:
        shr     ecx, 1                          ; ecx = number of remaining byte pairs
        jz      done
lstart:
        mov     ax, [esi]                       ; fetch two input bytes at once
        add     esi, 2
        mov     bl, al
        mov     dx, word ptr [tb+ebx*2]
        mov     [edi], dx                       ; digits of the first byte
        mov     bl, ah
        mov     dx, word ptr [tb+ebx*2]
        mov     [edi+2], dx                     ; digits of the second byte
        add     edi, 4
        dec     ecx
        jnz     lstart
done:
        ret
; tb: two ASCII hex digits for every byte value 00h..FFh (256 * 2 bytes).
tb:
db "000102030405060708090A0B0C0D0E0F"
db "101112131415161718191A1B1C1D1E1F"
db "202122232425262728292A2B2C2D2E2F"
db "303132333435363738393A3B3C3D3E3F"
db "404142434445464748494A4B4C4D4E4F"
db "505152535455565758595A5B5C5D5E5F"
db "606162636465666768696A6B6C6D6E6F"
db "707172737475767778797A7B7C7D7E7F"
db "808182838485868788898A8B8C8D8E8F"
db "909192939495969798999A9B9C9D9E9F"
db "A0A1A2A3A4A5A6A7A8A9AAABACADAEAF"
db "B0B1B2B3B4B5B6B7B8B9BABBBCBDBEBF"
db "C0C1C2C3C4C5C6C7C8C9CACBCCCDCECF"
db "D0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF"
db "E0E1E2E3E4E5E6E7E8E9EAEBECEDEEEF"
db "F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF"
bin2hex2 endp
As you see it transform a array of 2 or more bytes to a string of hex, the part of code i post just get a dword in ecx and put the result in eax:edx, it isn't a proc, only "part of code", you can put it in a macro or something, i don't see any reason to make a proc that the call will take more time to execute than the code.
Simpliest for me is something you don't spend more than five minutes writting it because it is easy.
Anyway, your MMX code take only 8 clocks on my XP 1800+.
Eduardo,
Many thanks for testing :)
Regards,
Lingo
Many thanks for testing :)
Regards,
Lingo
bench.exe
...early in the timings, the thread switched execution from core #1 to core #2 and back again shortly after.
dword2h.exe
...and now it's time to sleep :)
E:\temp\Dw2Hex>bench
2687 hutch
2344 lingo
2515 BiteRider
2579 hutch
2343 lingo
2547 BiteRider
2578 hutch
2469 lingo
2594 BiteRider
2578 hutch
2531 lingo
2453 BiteRider
2657 hutch
2406 lingo
2469 BiteRider
2671 hutch
2469 lingo
2625 BiteRider
2688 hutch
2328 lingo
2453 BiteRider
2578 hutch
2547 lingo
2531 BiteRider
-------
Results
-------
2627 hutch average
2429 lingo average
2523 BiteRider average
...early in the timings, the thread switched execution from core #1 to core #2 and back again shortly after.
dword2h.exe
E:\temp\Dw2Hex>dword2h.exe
Please terminate any high-priority tasks and press ENTER to begin.
Dword2Hex Tests:
Dword2Hex1: 15 clocks; Result: 1C2B3A4B
Dword2Hex : 16 clocks; Result: 1C2B3A4B
DwH->Jeff : 66 clocks; Result: 1C2B3A4B
d2H-lingo : 24 clocks; Result: 1C2B3A4B
dw2hex_ex : 26 clocks; Result: 1C2B3A4B
Press ENTER to exit...
...and now it's time to sleep :)