What is the most efficient way to implement a window procedure that handles the messages being sent to its windows?
I'm mostly concerned about speed.

Thanks
--
RolfT
Posted on 2001-09-09 13:32:19 by rolft
Hmm... well, as far as I can tell the best way would be to code it in C, using switch-case to check which message is being sent. Something like this:


/* Window procedure skeleton: dispatches on the message id with a switch.
 * Handled messages break out of the switch and return 0; every other
 * message is forwarded to DefWindowProc and its result is returned.
 * The "your code" lines and WM_whatevermsg are placeholders to fill in. */
LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
{
switch (message)
{
case WM_CREATE:
{
your code
}
break;
case WM_whatevermsg:
{
your code
}
break;

/* Anything not listed above gets the default processing. */
default: return DefWindowProc(hWnd, message, wParam, lParam);
}
/* Reached only via the break of a handled case. */
return 0;
}


But if you want to code in asm only then i would suggest using bitrake's switch-case macro. Search for it on this forum and you shall find it.
Posted on 2001-09-10 03:29:41 by MovingFulcrum
Because your app receives a message even when the mouse merely moves, most of the time your message handler runs through without doing anything special. Secondly, a conditional jump that is not taken costs only about one cycle, as opposed to about three cycles (not counting cache misses!) when it is taken.

So my conclusion to do a fast message-loop is that: (It isn't a regular proto - but who needs this real ?)



;-----------------------------------------------------------------------
; WndProc - frameless window procedure (stdcall, four DWORD arguments).
; No prologue is built, so arguments are addressed relative to esp;
; on entry [esp+8] holds uMsg.
; In most cases the incoming message is not one of ours, so it falls
; straight through the compare chain without a single taken jump.
; NOTE(review): the WP_* handler bodies are elided in the post; since no
; frame exists, presumably each must end with "ret 16" - confirm.
;-----------------------------------------------------------------------
WndProc:
        mov     eax, [esp+8]            ; eax = uMsg

        cmp     eax, WM_PAINT
        je      WP_Paint

        cmp     eax, WM_CLOSE
        je      WP_Close

        cmp     eax, WM_DESTROY
        je      WP_Destroy

        cmp     eax, WM_CREATE
        je      WP_Create

; *** default processing ***
; The argument frame is already exactly what DefWindowProc expects, so no
; invoke (which would only copy the stack contents) is needed. We also have
; nothing left to do after it returns, hence a jmp instead of a call:
; DefWindowProc returns directly to our caller.
@@:     jmp     DefWindowProc

WP_Paint:   ...
WP_Create:  ...
WP_Close:   ...
WP_Destroy: ...



Greetings, CALEB
Posted on 2001-09-10 14:21:00 by Caleb
Agree'd... I receintly built myself a very basic template *.asm that is simular to this design: (This creates an Empty, White Window, for custom controls etc.).
.386

.model flat,stdcall
option casemap: none

include \masm32\include\gdi32.inc
include \masm32\include\kernel32.inc
include \masm32\include\user32.inc
include \masm32\include\windows.inc
include \masm32\include\_Macros_.inc

includelib \masm32\lib\gdi32.lib
includelib \masm32\lib\kernel32.lib
includelib \masm32\lib\user32.lib

; --==============================================

WinMain PROTO :DWORD,:DWORD,:DWORD,:DWORD
WndProc PROTO :DWORD,:DWORD,:DWORD,:DWORD
Paint_Proc PROTO :DWORD,:DWORD

; This is supporting equates for the CreateBlankWindow Macro..
;500 ICON MOVEABLE PURE LOADONCALL DISCARDABLE "MAINICON.ICO"
IDI_MAINICON equ 500
THIS_WIDTH equ 300
THIS_HEIGHT equ 200

; --==============================================
.data

.data?
hInstance dd ?
hIcon dd ?
CommandLine dd ?

.code
; --==============================================
;-----------------------------------------------------------------------
; start - process entry point.
; Caches the command line and the module handle in globals, runs
; WinMain and passes its return value to ExitProcess.
;-----------------------------------------------------------------------
start:
        ; The two queries are independent of each other.
        invoke  GetCommandLine
        mov     CommandLine, eax

        invoke  GetModuleHandle, NULL
        mov     hInstance, eax

        ; eax still holds the module handle just stored in hInstance.
        invoke  WinMain, eax, NULL, CommandLine, SW_SHOWDEFAULT

        invoke  ExitProcess, eax        ; exit code = WinMain result


; --==============================================
;-----------------------------------------------------------------------
; WinMain - creates the main window and runs the message loop.
; In:   hInst, hPrevInst, CmdLine, CmdShow (stdcall, one DWORD each).
;       The window is shown with SW_SHOWNORMAL; CmdShow is not consulted.
; Out:  eax = msg.wParam of the message that ended the loop,
;       or -1 if GetMessage reported an error.
;-----------------------------------------------------------------------
WinMain proc hInst :DWORD, hPrevInst :DWORD,
             CmdLine :DWORD, CmdShow :DWORD

        LOCAL wc   :WNDCLASSEX          ; used by the CreateBlankWindow expansion
        LOCAL hWnd :DWORD
        LOCAL msg  :MSG

        mov hWnd, CreateBlankWindow( "AClass", "CTRL-DEL NAME", WndProc )

        invoke  ShowWindow, hWnd, SW_SHOWNORMAL
        invoke  UpdateWindow, hWnd

        ;===================================
        ; Loop until PostQuitMessage is sent
        ;===================================

StartLoop:
        invoke  GetMessage, ADDR msg, NULL, 0, 0
        test    eax, eax
        jz      ExitLoop                ; 0  -> quit message retrieved
        cmp     eax, -1
        je      LoopFailed              ; -1 -> error: msg is not valid, do
                                        ;       not dispatch it or loop again
        invoke  TranslateMessage, ADDR msg
        invoke  DispatchMessage, ADDR msg
        jmp     StartLoop

ExitLoop:
        mov     eax, msg.wParam
        ret

LoopFailed:
        ; eax is still -1 here and is returned as the failure code.
        ret
WinMain endp


; --==============================================
;-----------------------------------------------------------------------
; WndProc - window procedure of the blank template window.
; In:   hWnd, uMsg, wParam, lParam (stdcall, one DWORD each)
; Out:  eax = 0 for messages handled here (exit through @end),
;       otherwise the result of DefWindowProc (exit through @return).
;
; Exit labels:
;   @return  message NOT handled -> default processing
;   @end     message handled     -> return 0, DefWindowProc is skipped
;-----------------------------------------------------------------------
WndProc proc hWnd:DWORD, uMsg:DWORD, wParam:DWORD, lParam:DWORD

        IF_MSG WM_DESTROY,     Do_destroy
        IF_MSG WM_CREATE,      Do_create
        IF_MSG WM_PAINT,       Do_paint
        IF_MSG WM_COMMAND,     Do_command
        IF_MSG WM_LBUTTONDOWN, Do_LButtonDown

@return: ; ###################################################
        invoke  DefWindowProc, hWnd, uMsg, wParam, lParam
        ret

@end:
        ; Handled: return 0 directly. Jumping on to @return from here would
        ; run DefWindowProc anyway and overwrite the zero in eax.
        xor     eax, eax
        ret

Do_destroy: ; ################################################

        invoke  PostQuitMessage, NULL
        jmp     @end

Do_create: ; #################################################

        jmp     @end

Do_paint: ; ##################################################

        ; Nothing is painted yet, so the message is left to DefWindowProc.
        ; Switch this to "jmp @end" once real painting code is added here.
        jmp     @return

Do_command: ; ################################################

        jmp     @end

Do_LButtonDown: ; ############################################

        ; Shows how to split lParam into coordinates. The values are not
        ; used below; save them before the SendMessage call if needed.
        mov     eax, lParam
        mov     edx, eax                ; x point
        and     edx, 0ffffh
        shr     eax, 16                 ; y point

        invoke  SendMessage, hWnd, WM_CLOSE, NULL, NULL

        jmp     @end

WndProc endp
; --==============================================
end start


I built this to work with two MACRO's, one for creating the Window, and the other for the message loop:


;-----------------------------------------------------------------------
; CreateBlankWindow - macro function: registers a window class and
; creates a topmost popup window centred on the screen.
;   CName   quoted text for the class name
;   CDisp   quoted text for the window title
;   Prc     window procedure stored in the class
; The expansion refers to names that must exist where it is used:
; a WNDCLASSEX called wc, a value called hInst, and the globals
; hInstance and hIcon, plus the equates IDI_MAINICON, THIS_WIDTH and
; THIS_HEIGHT.
; Result: eax = return value of CreateWindowEx.
; ebx holds the x position across the second GetSystemMetrics call; it
; is saved and restored here so the enclosing procedure keeps its ebx.
;-----------------------------------------------------------------------
CreateBlankWindow MACRO CName:REQ, CDisp:REQ, Prc:REQ
.data
szThisClassName db CName,0
szThisDisplayName db CDisp,0
.code

        push    ebx                     ; callee-saved register, used below

        mov wc.cbSize,SIZEOF WNDCLASSEX
        mov wc.style,CS_HREDRAW or CS_VREDRAW
        mov wc.lpfnWndProc, Prc
        mov wc.cbClsExtra,0
        mov wc.cbWndExtra,0
        mov wc.hbrBackground,COLOR_WINDOW+1
        mov wc.lpszMenuName,0
        mov wc.lpszClassName,offset szThisClassName

        invoke LoadIcon,hInst, IDI_MAINICON
        mov hIcon,eax
        mov wc.hIcon,eax
        mov wc.hIconSm,eax

        invoke LoadCursor,0,IDC_ARROW
        mov wc.hCursor,eax

        push hInstance
        pop wc.hInstance

        invoke RegisterClassEx,addr wc

        ; x = (screen width - THIS_WIDTH) / 2
        invoke GetSystemMetrics,SM_CXSCREEN
        sub eax, THIS_WIDTH
        shr eax,1
        mov ebx,eax

        ; y = (screen height - THIS_HEIGHT) / 2
        invoke GetSystemMetrics,SM_CYSCREEN
        sub eax, THIS_HEIGHT
        shr eax,1

invoke CreateWindowEx,WS_EX_TOPMOST,addr szThisClassName,addr szThisDisplayName,WS_POPUP or\
WS_SYSMENU,ebx,eax,THIS_WIDTH,THIS_HEIGHT,0,0,hInst,0

        pop     ebx                     ; eax (the window handle) is untouched
EXITM <eax>
ENDM

; IF_MSG WMsg, DoJmp
; Emits a compare of uMsg against WMsg and a branch to DoJmp on equality.
; An operand named uMsg must be in scope wherever the macro is expanded.
IF_MSG MACRO WMsg:REQ, DoJmp:REQ
        cmp     uMsg, WMsg
        je      DoJmp           ;; je is the same instruction as jz
ENDM


You dont *need* the "CreateBlankWindow" macro, but i hate seeing all that crap. But the IF_MSG macro is more to the point of what Caleb was getting at.

As well, the @end, and @return are placed for proper exiting, @end == msg handled, @return == message not handled.


If you can use it, enjoy...

NaN
Posted on 2001-09-10 17:14:19 by NaN
Correct me if I am wrong, but with switch-case the binary tree that gets created allows fewer jumps. NaN and Caleb, in your methods one has to pass through all the WM_ messages until the matching one (or the end of the list) is reached, so obviously more jumps would be executed.
Posted on 2001-09-11 03:47:45 by MovingFulcrum
rolft,

From programming Windows code for over 10 years: Microsoft designed the WndProc callback in a C switch format. Many of the guys who wrote assembler some time back used what were called dispatchers, but they are not as efficient as a switch-block style of processing.

If you use the MASM .IF block syntax you get an efficient technique that is much faster than the rate at which messages are sent to the window, and that can be nested as deep as you need without nightmare collections of jumps.

What you will find with a Window's callback WndProc is that clarity and understanding what it does properly is more important than trying to find speed where it does not exist. The current genius in OOP programming is to avoid C style switch blocks because they are supposed to be too complicated to use but this is nonsense, there is a simple method for handling this type of tree structure that is error free if you understand it properly.

In C you make an opening bracket followed by a closing bracket and write the switches inbetween them, in Basic you use Select Case followed by End Select, in MASM you use .IF followed by .ENDIF.

.IF uMsg == WM_COMMAND

.ELSEIF uMsg == WM_CREATE

.ENDIF

This syntax is clear, much faster than windows messaging and is easily extendable in both length and nesting. Despatchers produce nightmares in comparison.

Regards,

hutch@pbq.com.au
Posted on 2001-09-11 04:09:40 by hutch--
A point to notice, you are never going to get a FAST windows message PROC, sure you could go out of your way to not make a slow one.

Any gains you get from extra effort in jumps and switching will maybe speed your proc up by 0.1%. Windows' own overhead in processing messages is so much greater than anything you can optimize at that level.
Posted on 2001-09-11 04:34:44 by huh
Hutch,

Been a long while since I heard reference to dispatchers in a language. Oh them good old days!;)
Posted on 2001-09-11 06:02:12 by bcraven
To HUH:

Yes that's true - the Windows-Overhang makes all tries to optimize the message-loop senseless. But it's a good exercise in optimization ;) !

To MovingFulcrum:

A switchcase will not produce fewer jumps...

But what I write was more compact and so due to caching it's faster because all jumps are in a small block. If you do a Switch-Case the block expands and more cache-prefetching is needed, so the processor needs more time to process the message-loop !

Also with proceeded jumps some other cpu-optimizations will fail, too !

Greetings, CALEB
Posted on 2001-09-11 06:33:40 by Caleb
Speaking of speed, Caleb's logic is the fastest in this case, and the
.IF uMsg ==
.ELSEIF uMsg ==
style produces the worst possible code.
If anybody cares, I can explain it in detail.
As for clarity, please have a look - to me this is much clearer and shorter than .IF/.ELSEIF:


.586
.model flat,stdcall
option casemap:none
include C:\masm32\include\windows.inc
include C:\masm32\include\user32.inc
include C:\masm32\include\kernel32.inc
includelib kernel32.lib
includelib user32.lib
; WM_CASE reg, <name list>
; For every name in the list emits
;       cmp reg, WM_<name>
;       je  @@<name>
; so a label @@<name> must exist for each listed message.
WM_CASE MACRO reg, msgs
    FOR msg, <msgs>
        cmp     reg, WM_&msg
        je      @@&msg
    ENDM
ENDM
.code
start:

;-----------------------------------------------------------------------
; WndProc - compare/branch dispatcher built with WM_CASE.
; In:   hWnd, uMsg, wParam, lParam (stdcall)
; Out:  eax = DefWindowProc result for messages not listed in WM_CASE;
;       for listed messages, whatever the @@<name> handler leaves in eax.
;-----------------------------------------------------------------------
WndProc proc hWnd,uMsg,wParam,lParam

        mov     eax,uMsg                ; WM_CASE compares against eax
        WM_CASE eax,<CREATE,COMMAND,SIZE,PAINT,CLOSE>

        ; No match: every je above fell through.
        invoke  DefWindowProc,hWnd,eax,wParam,lParam
        ret                             ; return DefWindowProc's result
                                        ; instead of falling into @@CREATE
@@CREATE:
        xor     eax,eax
        ret
        ; The handlers below are empty stubs: eax still holds the
        ; message number when they return.
@@COMMAND:
        ret
@@SIZE:
        xor     eax,eax
        ret
@@PAINT:
        ret
@@CLOSE:
        ret
WndProc endp
end start
Posted on 2001-09-22 12:15:25 by The Svin
Svin, my switch case macro does better for many branches. (here)
Posted on 2001-09-22 12:20:56 by bitRAKE
I have a simple question:
Did you time a WINDOW CALLBACK proc with ~10 to 20 cases
using your macro. And compared with this simple pass though
logic?
If you did I want to see results and testing proc.
I doubt.
You can recommend your macro in many cases as very usefull but
not in wndproc.
For a simple reason - the first jcc that is taken will flush all the pipelines.
How many clocks will it take if uMsg holds something which is not
one of "our" cases and we just need to pass it on to DefWindowProc?
Using:
cmp
je
cmp
je
We need just NumberOfOurCases clocks.
Because there is a 99% probability that none of the jumps will be predicted as taken, and in that case each cmp/je pair executes in one clock.
Prove me wrong - write test and we'll see.
We need to remember what is WndProc, it's called by system.
Posted on 2001-09-22 12:41:32 by The Svin
I'm going to take a somewhat different view here. I think the fact that DefWindowProc IS executed so often makes it a good candidate for optimization. Most applications don't process the majority of messages sent to them. Doesn't it help overall system performance if we can give control back to the "system" as quickly as possible?

Caleb brings up a good point about keeping all the cmp/je code close together, since spreading it out can increase cache and BPB (branch prediction buffer) conflicts. But I don't think branch prediction is that much of a factor, since we've still got to run through the entire compare sequence each time anyway. Using je vs. jne doesn't make a difference since they would both be predicted equally.

Beyond saving a clock here and there, I don't think there is any way of finding great performance improvements, since we still need to pick out the messages we want to process, one way or another. And yes, considering the general windows overhead, it is like spiting into the ocean. But considering how often we spit DefWindowProc, we'll have a few gallons in a short time. So again, doesn't it help system performance if we can give a few clocks back to the OS and/or another application? Doesn't it possibly make OUR next message available that much sooner?

My vote goes to Caleb for the quickest way in and out, and it is pretty clear. IF/ELSE still has a place, one level down, as in WM_COMMAND processing. :)
Posted on 2001-09-22 19:09:00 by S/390
During this discussion I remember my days coding on AMIGA. Well, my friends and me fought for every cycle subroutines spend on. And if the system takes too much, well, we shut it off :grin: and the complete machine belongs to our code. Nice days ...

Today on PC this is impossible due to the fact that there are less than none informations about how to code the devices, even the graphics-cards (today, noone will code VGA or VESA !)

Greetings, CALEB
Posted on 2001-09-22 19:42:22 by Caleb
. I think the fact that DefWindowProc IS executed so often makes it a good candidate for optimization.

What did you mean? To write your own user32.dll? :)
Doesn't it help overall system performance if we can give control back to the "system" as quickly as possible?

Yes it does.
And though out sequence cmp je cmp je serves both to purposes
to give control back to system if it's not our message and find right jump if it's ours.
But I don't think branch prediction is that much of a factor, since we've still got to run the entire loop each time anyway. Using je vs. jne doesn't make a difference since they would both be predicted equally.

You're deadly wrong here. Do simple step by step analyze of different scenarios and clock testing.
Beyond saving a clock here and there, I don't think there is any way of finding great performance improvements, since we still need to pick out the messages we want to process, one way or another.

Words, words, words...
I did get what you meant.
IF/ELSE still has a place, one level down, as in WM_COMMAND processing.

In WM_COMMAND we don't need it at all. We can calculate right jmp with two opecodes even if we have 1000 items in menu.
================

OK, it is my last post in the thread.
If anybody wants to know thruth - the anybody can do some practical work to figure it out - logical analyze going though bare disassebled code and writing test apps.
Here is last hints on the topic look:
Compile, watch, test, think


.586
.model flat,stdcall
option casemap:none
include C:\masm32\include\windows.inc
include C:\masm32\include\user32.inc
include C:\masm32\include\kernel32.inc
includelib kernel32.lib
includelib user32.lib
; WM_CASE reg, <name list>
; Expands, once per listed name, to a compare of reg with WM_<name>
; followed by a je to the label @@<name>.
WM_CASE MACRO reg, msgs
    FOR msg, <msgs>
        cmp     reg, WM_&msg
        je      @@&msg
    ENDM
ENDM
.data
.data?
.code
; Two dispatchers over the same four messages, placed back to back so
; their generated code can be compared in a disassembler. Both expect
; the message number in eax and run the same body for each message.
start:

; ---- variant 1: .IF / .ELSEIF chain ---------------------------------
.IF eax == WM_CREATE
        add     ecx,ecx
        ret
.ELSEIF eax == WM_DESTROY
        add     edx,edx
        ret
.ELSEIF eax == WM_PAINT
        xor     ecx,ecx
        ret
.ELSEIF eax == WM_COMMAND
        ret
.ENDIF

; ---- variant 2: WM_CASE compare/branch list -------------------------
; The macro takes the register first, then the list of message names.
        WM_CASE eax,<CREATE,DESTROY,PAINT,COMMAND>
; NOTE(review): with no match execution falls into @@CREATE here; in a
; real window procedure the default handling would sit at this point.

@@CREATE:  add     ecx,ecx
           ret
@@DESTROY: add     edx,edx
           ret
@@PAINT:   xor     ecx,ecx             ; shares the ret of @@COMMAND

@@COMMAND: ret



        call    ExitProcess
end start

There are 2 piece of code wich do the same thing.
Second is
1. More Clear
2. Less size
3. In callback proc it'll run at least twice faster.

To understand it you need:
1. Learn about what is callback.
2. Learn about branch prediction, especially how jumps are handled the first time, when they are not yet in the BPB.
3. Analyse it only in disassembled mode.
Good luck.
Posted on 2001-09-22 20:39:40 by The Svin
I understand what your saying Svin, and I'm fairly sure your right. What do you think about the scasd method?
;;

;; MAIN WINDOW MESSAGES
;;
.CONST
MainMsg DWORD WM_NOTIFY, WM_COMMAND, WM_MENUSELECT,
WM_SIZING, WM_ENTERSIZEMOVE, WM_WINDOWPOSCHANGED,
WM_MOUSEMOVE, WM_LBUTTONDOWN, WM_LBUTTONUP, WM_RBUTTONDOWN,
WM_KEYDOWN, WM_CANCELMODE,
WM_ERASEBKGND, WM_ACTIVATE,
WM_CREATE, WM_CLOSE, WM_DESTROY,
WM_SYSCOLORCHANGE, WM_SETTINGCHANGE
DWORD MainWM_NOTIFY, MainWM_COMMAND, MainWM_MENUSELECT,
MainWM_SIZING, MainWM_ENTERSIZEMOVE, MainWM_WINDOWPOSCHANGED,
MainWM_MOUSEMOVE, MainWM_LBUTTONDOWN, MainWM_LBUTTONUP, MainWM_RBUTTONDOWN,
MainWM_KEYDOWN, MainWM_CANCELMODE,
MainWM_ERASEBKGND, MainWM_ACTIVATE,
MainWM_CREATE, MainWM_CLOSE, MainWM_DESTROY,
MainWM_SYSCOLORCHANGE, MainWM_SETTINGCHANGE

;Equates used to simplify references to window procedure parameters.
;They are valid inside MainWND after its single "push edi": the first +4
;skips the saved edi, the second term is the offset from the return
;address. stdcall pushes the arguments right to left, so hWnd is the
;argument closest to the return address and lParam the farthest.
hWnd   TEXTEQU <esp+4+4>
uMsg   TEXTEQU <esp+4+8>
wParam TEXTEQU <esp+4+12>
lParam TEXTEQU <esp+4+16>

.CODE
;-----------------------------------------------------------------------
; MainWND - table-driven window procedure (stdcall, 4 DWORD arguments).
; Scans MainMsg for uMsg with repne scasd. On a hit, calls the handler
; stored at the same index in the second DWORD list that follows
; MainMsg. A handler returns with the carry flag set to request default
; processing. Messages not in the table, and handlers that set carry,
; are tail-jumped to DefWindowProc.
; Clobbers: eax, ecx, flags (edi is saved and restored).
;-----------------------------------------------------------------------
MainWND PROC
        push    edi
        mov     eax,[uMsg]
        mov     ecx,LENGTHOF MainMsg
        mov     edi,OFFSET MainMsg
        repne   scasd                   ; ZF=1 when eax was found
        je      Process
Default:
        pop     edi
        jmp     [DefWindowProc]

; ELSE process this message possibly setting carry flag for default processing
ALIGN 4

Process:
        ; scasd left edi one dword past the match, hence the -4; adding
        ; SIZEOF MainMsg moves from the message list to the handler list.
        call    DWORD PTR [edi+(SIZEOF MainMsg-4)]
        jc      Default
Return:
        pop     edi
        ret     10h                     ; drop the four DWORD arguments
MainWND ENDP
Maybe, we could prefetch the data to speed it up. :)
Posted on 2001-09-22 22:18:54 by bitRAKE
Of course, one more approach is to create arrays of WM_ values and of jumps
to handlers, and scan through the array.
But here is what we should remember:
1. It won't make the code faster; we may use the technique just to decrease size.
2. Using scasd in this case is the worst possible way to scan.
The fastest way of scanning would be to write a loop.
And the fastest way of looping here would be a design with
"fiction point" (sentinel) logic.
With this logic you:
1. Don't need a counter at all, which makes the loop faster.
2. Can unroll the loop as much as you want, which gives you additional speed.
Let you have an array of structures of WM_s and handlers
.data
...
jmptbl dd WM_CREATE,wm_create
dd WM_COMMAND,wm_command
and so on
and in the code:
.code
...
wm_create:
...
wm_command:
and so on
Now put at the and of jmptbl
....
wm_def dd 0,default
and in the code
default:
invoke DefWindowProc,hWnd,eax,wParam,lParam
Whatever message you have we insert it into wm_def dword
So now you don't need counter.
; Counter-less scan of jmptbl, a list of (message, handler) dword pairs
; that ends with the wm_def pair. The current message is first written
; into wm_def, so the compare is guaranteed to hit at the latest on that
; final pair, whose handler is "default".
; (The bracketed memory operands were missing from the posted text and
; are restored here.)
WndProc proc hWnd,uMsg,wParam,lParam
        mov     eax,uMsg
        mov     ecx,offset jmptbl
        mov     wm_def,eax              ; plant the sentinel
@@:     cmp     [ecx],eax               ; message field of the current pair
        lea     ecx,[ecx+8]             ; next pair; lea leaves the flags alone
        jne     @B
        jmp     dword ptr [ecx-4]       ; handler field of the matching pair
do I need show how you can unroll it? :)
Posted on 2001-09-22 23:27:45 by The Svin
Originally posted by The Svin do I need show how you can unroll it? :)
Please, do :)
Posted on 2001-09-23 00:11:37 by bitRAKE
Hi Svin,

I think we're arguing the same point. :)

But I will take a minute to reply to a couple of your comments.

"What did you mean? To write your own user32.dll?"

No, what I'm saying is that when we do run DefWindowProc, let's get there as quickly as possible.

"In WM_COMMAND we don't need it at all. We can calculate right jmp with two opecodes even if we have 1000 items in menu."

This is a good point, and true if you structure your resources properly. All I'm saying is that there are cases where IF/ELSE programming can be useful, just like any other programming technique.

As far as branch prediction goes, I think it's a moot point in this case. Windows is a multi-tasking OS, so if it task-switches between sending us messages, chances are our entries in the BPB will be clobbered. The PII/III have a maximum of 512 BPB entries. Just normal Windows overhead without a task switch may be enough to step on the BPB. I guess we could test to see how true this is.

If it is true that we suffer the "first time penalty" nearly every time, then cmp/je/cmp/je is by far the best choice, since it is predicted to fall thru the first time (actually first 2 times) on the PII/III if it is a forward reference, which it is in our example. According to Agner Fog, a misprediction costs between 10 and 20 clocks on a PII/III.

In any case, we are in complete agreement that cmp/je/cmp/je is the better of the two. :)
Posted on 2001-09-23 00:42:04 by S/390
Hi CoderZ!

I'm a newbie about win32 stuff, I'm a bit rusted in ASM and I have the worst english possible, so excuse me if my post is inapropiate for this thread or wathever .:)

I think there is other efficient ways for checking those WM_msgs, and I try to show you my aproachs.
The WM_msgs are, in fact, numbers, and looking in the windows.inc I think that a way to perform these checks is in "partitions": (sorry for the code but Im writing this from the top of my head)

; First-level split on the high byte of the message number.
; NOTE(review): the source operand of the first mov was lost in the
; posted text; [uMsg] is assumed here - confirm.
        mov     eax,[uMsg]      ; take the uMsg
        test    ah,ah
        jz      Part00
        cmp     ah,01h          ; check the High Byte
        jz      Part01
        cmp     ah,02h
        jz      Part02
        cmp     ah,03h
        jz      Part03
; NOTE(review): a message whose high byte is above 03h falls through
; into Part00 from here.

Part00:
....check here the messages between 0000h and 00FFh - ie: WM_NULL,WM_CREATE.....WM_NCMBUTTONDBLCLK
Part01:
....check here the messages between 0100h and 01FFh - ie: WM_KEYFIRST...WM_CTLCOLORSTATIC
etc.

You can test the nibbles for more precision (in Part00: test al,0F0h, jnz ....etc.
I think this works like a binary tree ;)
This method has sense when you need to check a lot of messages and are "separated" between the values.

The other way (the best, I think) is using a lookup table:

.data

JumpTable dd offset EOCheckTable ;WM_NULL
dd offset InitialDial ;WM_CREATE
dd offset EOCheckTable ;WM_DESTROY
dd offset EOCheckTable ;WM_MOVE
dd offset EOCheckTable
dd offset EOCheckTable ;WM_SIZE
dd offset EOCheckTable ;WM_ACTIVATE
dd offset EOCheckTable ;WM_SETFOCUS
dd offset EOCheckTable ;WM_KILLFOCUS
dd offset EOCheckTable
dd offset EOCheckTable ;WM_ENABLE
............
............
...........
dd offset EOCheckTable ;WM_USER
........

.code

...........
;put this for check

; NOTE(review): the source operand of the first mov was lost in the
; posted text; [uMsg] is assumed here - confirm.
        mov     eax,[uMsg]              ;get uMsg
        cmp     eax,400h                ;check the upper boundary
        ja      UserCommand             ;process the message above 400h
        call    ProcMsg                 ;dispatch through JumpTable
        test    eax,eax                 ;handlers return 0 when they processed it
        jne     MessageNotProcesed

.............




; ProcMsg - jump to the handler stored in JumpTable for the message in eax.
; In:   eax = uMsg; the caller has already limited it to 0..400h
; The handler's own ret returns to ProcMsg's caller.
; (The memory operand of the jmp was missing from the posted text.)
ProcMsg:        add     eax,eax
                add     eax,eax                 ; uMsg * 4
                add     eax,offset JumpTable    ; add the base of the table
                jmp     dword ptr [eax]         ; jump to the correct message routine


EOCheckTable: mov eax,1 ;return true if the message is not processed
ret

;Process the WM_CREATE msg
InitialDial: MsgBox NULL,"Initializing","NSSys",MB_OK ;a msgbox sample
..........
.........
xor eax,eax
ret

EOCheckTable is the "default-not-process-message" routine. Obviously, you must write the complete table putting the right offsets when you want to process some message, and you can make things like this if you dont want to write all the table:

; One dword per message number 0..400h (401h entries, about 4 KB).
; dup counts dword entries, not bytes.
JumpTable dd 110h dup (offset EOCheckTable)     ; 0000h..010Fh
          dd offset InitialDial                 ; 0110h = WM_INITDIALOG
          dd 2F0h dup (offset EOCheckTable)     ; 0111h..0400h

or doing at runtime whith a simple prog that fill the correct address of the routines you want.

The only drawback of this method is that you must spend about 4096 bytes on the table - one dword for each message number up to 400h (but nowadays I think that is not a serious problem ;)

You can combine the two methods too; checking the "partitions" and doing jumptables.

Well, I hope that you find this useful.

ByeZ

PS: I dont remember if there is some penalties in pairing or caching using "Jmp dword ptr " but its 4:12 AM and Im tired and I dont want to read the agner help now...:grin:
Posted on 2001-09-23 02:15:10 by NightShade