Sorry if this isn't the right board for inline asm in C++ =) If there isn't a right board, then oh shucks =/

Anyway, I'm a pretty newbie assembly coder and I was trying to write a binary search in assembly. I don't know any groovy tricks for optimizing code. Anyway, here's one version of the function:



int binary_search_asm
(int *Array, unsigned int first, unsigned int last, int key)
{
int retval;
__asm
{
mov edx, first ; edx == first
mov ebx, last ; ebx == last

BeginL:
cmp edx, ebx ; cmp edx and ebx
ja EndBad

mov eax, edx ; eax == first
add eax, ebx ; eax == first + last
shr eax, 1 ; eax == (first + last) >> 1

mov esi, [Array + eax * 4]; esi == sortedArray[mid]
cmp key, esi ; cmp key and esi

je Equal ; _if (key == edx) jmpto Equal
jl LessThan ; _if (key < edx) jmpto LessThan

mov edx, eax ; first = mid
inc edx ; first = mid + 1
jmp BeginL ; loop

LessThan:
mov ebx, eax ; last = mid
dec ebx ; last = mid - 1
jmp BeginL ; loop

EndBad:
mov retval, -1 ; value not found
jmp EndL

Equal:
mov retval, eax ; value found, retval = eax, end

EndL:
}

return retval;
}


This is quicker on my comp than the C++ version



int binary_search_ (int *Array, int first, int last, int key)
{
while (first <= last)
{
int mid = (first + last) >> 1;
if (key > Array[mid])
first = ++mid;
else if (key < Array[mid])
last = --mid;
else
return mid;
}

return -1;
}


There's a specific thing I want to do to the inline asm version, and that is make it naked so I don't have to use the retval integer. I changed it to the following:



__declspec (naked) int binary_search_asm
(int *Array, unsigned int first, unsigned int last, int key)
{
__asm
{
mov edx, first ; edx == first
mov ebx, last ; ebx == last

BeginL:
cmp edx, ebx ; cmp edx and ebx
ja EndBad

mov eax, edx ; eax == first
add eax, ebx ; eax == first + last
shr eax, 1 ; eax == (first + last) >> 1

mov esi, [Array + eax * 4]; esi == sortedArray[mid]
cmp key, esi ; cmp key and esi

je Equal ; _if (key == edx) jmpto Equal
jl LessThan ; _if (key < edx) jmpto LessThan

mov edx, eax ; first = mid
inc edx ; first = mid + 1
jmp BeginL ; loop

LessThan:
mov ebx, eax ; last = mid
dec ebx ; last = mid - 1
jmp BeginL ; loop

EndBad:
mov eax, -1

Equal:
ret
}
}


adding the __declspec (naked) and removing the retval ordeal. I'm assuming that ret returns the contents of eax.

The problem is that the first version works fine, but the __declspec (naked) version always seems to return -1, even though it shouldn't... I'm probably missing something. Can someone help me? Is there a different way to do this?

Also, if anyone can tell me any optimizations I could make, please tell me. I want to get this code as fast as can be, while still writing it myself, so I can learn.

Thanks! (and once again, sorry if it's in the wrong place)
Posted on 2003-03-02 01:16:09 by rmullen3

The problem is that the first version works fine, but the __declspec (naked) version always seems to return -1, even though it shouldn't... I'm probably missing something. Can someone help me? Is there a different way to do this?
If you look in a debugger, maybe you will see that the params passed on the stack are not being moved into the registers correctly - due to no stack frame being set-up. Then again, maybe I am wrong - compilers do so many things different than what I expect.

This isn't the fastest, but it might give you some ideas:
; divide and conquer approach to finding:

mov esi, first
mov edi, last
mov edx, Array
mov eax, key
jmp _2

_0: dec edi
_1: inc esi
_2: cmp esi, edi
lea ecx, [esi + edi]
jge notfound
shr ecx, 1 ; (a+b)/2
cmp [edx + ecx*4], eax
xchg ecx, esi ; top half
jc _1
lea ecx, [ecx - 1]
mov edi, esi ; bottom half
mov esi, ecx
jne _0
found:
mov eax, edi ; key number of found
jmp EndL
notfound:
cmp eax, [edx + esi*4]
mov eax, -1
jne EndL
mov eax, esi ; key number of found
EndL:
(hope I didn't break it with all my edits from the original)
Notice there are only 2 or 3 branches per loop.

Is this code really a bottleneck in your application?
Posted on 2003-03-02 01:45:53 by bitRAKE
I really like your example. Unforunatly, it was slower than my orig. code. But I think the way you did it could possibly be a lot faster than mine, cutting down on branches.

I liked this line the most:


lea ecx, [esi + edi]


I didn't think to use load eff. address, that was a little boost in speed.

Also, looking through the Quake 1 source I found out what I was doing wrong. I needed to get things from the stack differently. Here's my new (much much faster) code:



__inline __declspec (naked) int binary_search_asm
(unsigned int first, unsigned int last, int key, int *Array)
{
__asm {
mov ecx, dword ptr [esp + 4] ; ecx = first
mov ebx, dword ptr [esp + 8] ; ebx = last
mov edi, dword ptr [esp + 12] ; edi = key
mov edx, dword ptr [esp + 16] ; edx = Array
BeginL:
cmp ecx, ebx ; cmp ecx and ebx
ja EndBad

lea eax, [ecx + ebx]
shr eax, 1 ; eax == (first + last) >> 1

cmp edi, [edx + eax * 4]

je Equal ; _if (key == ecx) jmpto Equal
jl LessThan ; _if (key < ecx) jmpto LessThan

mov ecx, eax ; first = mid
inc ecx ; first = mid + 1
jmp BeginL ; loop
LessThan:
mov ebx, eax ; last = mid
dec ebx ; last = mid - 1
jmp BeginL ; loop
EndBad:
mov eax, -1 ; not found
Equal:
ret
}
}


bitrake, if you could perhaps break down the structure of your code for me a bit? Assembly coded by others is still cryptic to me. Also, any other optimization tips would be great. Like, is there a better register for doing certain things in this code that I'm missing?

So far I've gotten 10000000 searches on a 6-element array down from 1.60 to 0.85 seconds which is a nice increase. As for being a bottleneck, no it isn't. I'm not even using this code. I'm just trying to write a really fast binary search =)

Thanks for your help!!
Posted on 2003-03-02 13:28:30 by rmullen3
__inline __declspec (naked) int binary_search_asm 

(unsigned int first, unsigned int last, int key, int *Array)
{
__asm {
mov ecx, dword ptr [esp + 4] ; ecx = first
mov ebx, dword ptr [esp + 8] ; ebx = last
mov edi, dword ptr [esp + 12] ; edi = key
mov edx, dword ptr [esp + 16] ; edx = Array
BeginL:
cmp ecx, ebx ; cmp ecx and ebx
lea eax, [ecx + ebx]
ja EndBad
shr eax, 1 ; eax == (first + last) >> 1

cmp edi, [edx + eax * 4]

jl LessThan ; _if (key < ecx) jmpto LessThan

lea ecx, [eax + 1] ; first = mid + 1
jne BeginL ; loop
ret
LessThan:
lea ebx, [eax - 1] ; last = mid - 1
jmp BeginL ; loop
EndBad:
mov eax, -1 ; not found
ret
}
}
IMHO, there is little speed to gain in this code. Another point is to remove the "JE Equal" because this branch is only taken on the end. I've also used two more LEA's. I haven't tested the above code, but it would be nice to know if it does a little better.

Next we move some code around:
__inline __declspec (naked) int binary_search_asm 

(unsigned int first, unsigned int last, int key, int *Array)
{
__asm {
mov ecx, dword ptr [esp + 4] ; ecx = first
mov ebx, dword ptr [esp + 8] ; ebx = last
mov edi, dword ptr [esp + 12] ; edi = key
mov edx, dword ptr [esp + 16] ; edx = Array
jmp BeginL

EndBad:
mov eax, -1 ; not found
ret

ALIGN 16 ; Does this work?

LessThan:
lea ebx, [eax - 1] ; last = mid - 1
BeginL:
cmp ecx, ebx ; cmp ecx and ebx
lea eax, [ecx + ebx]
ja EndBad
shr eax, 1 ; eax == (first + last) >> 1

cmp edi, [edx + eax * 4]

jl LessThan ; _if (key < ecx) jmpto LessThan

lea ecx, [eax + 1] ; first = mid + 1
jne BeginL ; loop
ret
}
}
See how we are getting close to the code I posted first (above) - not that it is any better. :)
Posted on 2003-03-02 13:51:52 by bitRAKE
A 6 element array may not be very representative for real uses of the binary search. I did some tests with your code, bitRAKE's and my own (which is bitRAKE's code but slightly modified) and your code seems to be slower than the C version.
Test results on an athlonTB 1400:
Array size 10 with 80,000,000 iterations:


C version: 5088 ms
rmullen3's asm version: 5247 ms
bitRAKE's asm version: 2243 ms
Thomas's asm version: 2013 ms

Array size 1,000 with 80,000,000 iterations:

C version: 8453 ms
rmullen3's asm version: 8722 ms
bitRAKE's asm version: 3896 ms
Thomas's asm version: 3485 ms

Array size 1,000,000 with 80,000,000 iterations:

C version: 12808 ms
rmullen3's asm version: 12959 ms
bitRAKE's asm version: 7691 ms
Thomas's asm version: 6549 ms


And my own version:


__inline __declspec (naked) int bs_asm2
(unsigned int first, unsigned int last, int key, int *Array)
{
__asm {
push esi
push edi
push ebx

mov ecx, dword ptr [esp + 4 + 3*4] ; ecx = first
mov ebx, dword ptr [esp + 8 + 3*4] ; ebx = last
mov edi, dword ptr [esp + 12+ 3*4] ; edi = key
mov edx, dword ptr [esp + 16+ 3*4] ; edx = Array

lea eax, [ecx + ebx]
jmp BeginL

EndBad:
pop ebx
pop edi
pop esi
mov eax, -1 ; not found
ret

ALIGN 16 ; Does this work?

LessThan:
lea ebx, [eax - 1]
lea eax, [ecx + eax - 1]
BeginL:
cmp ecx, ebx
ja EndBad
shr eax, 1

cmp edi, [edx + eax * 4]

jl LessThan
lea ecx, [eax + 1]
lea eax, [ebx + ecx + 1]

jne BeginL

lea eax, [ecx - 1]

pop ebx
pop edi
pop esi
ret
}
}


Don't forget to push/pop registers when you use __declspec(naked). C++ won't do this for you.

Thomas

edit: moved the post to algorithms & source code.
Posted on 2003-03-02 15:42:55 by Thomas
Nice change, Thomas. :) Does the "__inline" do anything? Can the complier inline this code with it having multiple exit points? Isn't there an error in the second part of the loop?
		jl LessThan

lea ecx, [eax + 1]
lea eax, [ebx + [COLOR=darkred]EAX[/COLOR] + 1]
Posted on 2003-03-02 16:06:14 by bitRAKE
You are right, I don't think that small of array is a good test. Still, testing on a large array, with all optimizations turned off, I tested all versions. (However!! I may not be testing right. The code I use to time code is at the bottom of my post. I don't think it's very good, it's certainly not precise. If anyone could provide me with a better version please do!)



Array size 1000, searched over 8,000,000 iterations:

Orig. C++ version: 3.9 seconds
Last version I wrote: 1.65 seconds
bitrake's version: 1.98 seconds
Thomas's version: 1.43 seconds



Bear in mind, I have all optimizations turned off. My computer is an AMD-K6-2 at 450 mhz, using VC++, and my timing code as I said may not be very good.

However, the main thing: I'm learning a lot here! I'll spend the next hour studying both both of you guy's versions.

Also, thomas, by not pushing and popping can some serious errors occur? And why do you have to add 3*4 all those times?

Thanks for your help both bitrake and thomas!

Anyway, here's the timer code:



#include <sys\types.h>
#include <sys\timeb.h>

class stopwatch
{
public:
__inline stopwatch ()
{
_ftime(&t1);
}

__inline void Reset (void)
{
_ftime(&t1);
}

__inline double Reading (void)
{
_ftime(&t2);
return ((t2.time - t1.time) +
(t2.millitm - t1.millitm)*0.001);
}

private:
struct _timeb t1;
struct _timeb t2;
};
Posted on 2003-03-02 16:35:17 by rmullen3
bitRAKE: Yes that's a bug. Should work fine if you change that one. And I don't think inline does anything if you use declspec(naked). You wouldn't be able to get the parameters from esp+offset then because they might be registers (the compiler knows what's where when it inlines C functions). Inlining in C is much more complicated than for example expanding macros in asm. In C++ you can inline any function you like (no matter how many returns you have) and the compiler will still figure out the best way to inline (although there usually is a restriction on the amount of nesting). The compiler can never figure out how to inline your asm code so it probably will ignore the inline.

Bear in mind, I have all optimizations turned off. My computer is an AMD-K6-2 at 450 mhz, using VC++, and my timing code as I said may not be very good.

I had all optimizations on, to give the C version a fair chance. It also makes the loop more efficient so the timing can be more accurate, although the overhead isn't that big.

Also, thomas, by not pushing and popping can some serious errors occur?

Yes definitely. I believe if you use just _asm (not declspec(naked)) or something VC detects which registers you use and save them but if you write the whole function yourself (naked) then you need to save esi/edi/ebx/ebp. VC assumes you do so your code will probably crash (or worse: produce wrong results without crashing) if you don't. At least I couldn't run your code without saving the registers.

And why do you have to add 3*4 all those times?

At the start of the function, the parameter is at . However if you use the push opcode the stack pointer changes. 4 is substracted from esp everytime you push something. So after three pushes, esp is 12 less than it was before. That's why a correction of number_of_pushes * 4 is needed to still get the right offset.

Here's the full test code I used:


[size=9]
#include <iostream>
#include <string>
using namespace std;

#define WIN32_MEAN_AND_LEAN
#include <windows.h>

const int ARRAY_SIZE = 1*1000*1000;
const int TEST_COUNT = 80*1000*1000;

int bs_c (int *Array, unsigned int first, int last, int key)
{
while (first <= last)
{
int mid = (first + last) >> 1;
if (key > Array[mid])
first = ++mid;
else if (key < Array[mid])
last = --mid;
else
return mid;
}

return -1;
}

__inline __declspec (naked) int binary_search_asm
(unsigned int first, unsigned int last, int key, int *Array)
{
__asm {
push esi
push edi
push ebx
mov ecx, dword ptr [esp + 4+ 3*4] ; ecx = first
mov ebx, dword ptr [esp + 8+ 3*4] ; ebx = last
mov edi, dword ptr [esp + 12+ 3*4] ; edi = key
mov edx, dword ptr [esp + 16+ 3*4] ; edx = Array
BeginL:
cmp ecx, ebx ; cmp ecx and ebx
ja EndBad

lea eax, [ecx + ebx]
shr eax, 1 ; eax == (first + last) >> 1

cmp edi, [edx + eax * 4]

je Equal ; _if (key == ecx) jmpto Equal
jl LessThan ; _if (key < ecx) jmpto LessThan

mov ecx, eax ; first = mid
inc ecx ; first = mid + 1
jmp BeginL ; loop
LessThan:
mov ebx, eax ; last = mid
dec ebx ; last = mid - 1
jmp BeginL ; loop
EndBad:
mov eax, -1 ; not found
Equal:
pop ebx
pop edi
pop esi

ret
}
}
__inline __declspec (naked) int bs_br
(unsigned int first, unsigned int last, int key, int *Array)
{
__asm {
push esi
push edi
push ebx

mov ecx, dword ptr [esp + 4 + 3*4] ; ecx = first
mov ebx, dword ptr [esp + 8+ 3*4] ; ebx = last
mov edi, dword ptr [esp + 12+ 3*4] ; edi = key
mov edx, dword ptr [esp + 16+ 3*4] ; edx = Array
jmp BeginL

EndBad:
pop ebx
pop edi
pop esi
mov eax, -1 ; not found
ret

ALIGN 16 ; Does this work?

LessThan:
lea ebx, [eax - 1] ; last = mid - 1
BeginL:
cmp ecx, ebx ; cmp ecx and ebx
lea eax, [ecx + ebx]
ja EndBad
shr eax, 1 ; eax == (first + last) >> 1

cmp edi, [edx + eax * 4]

jl LessThan ; _if (key < ecx) jmpto LessThan

lea ecx, [eax + 1] ; first = mid + 1
jne BeginL ; loop

pop ebx
pop edi
pop esi
ret
}
}

__inline __declspec (naked) int bs_asm2
(unsigned int first, unsigned int last, int key, int *Array)
{
__asm {
push esi
push edi
push ebx

mov ecx, dword ptr [esp + 4 + 3*4] ; ecx = first
mov ebx, dword ptr [esp + 8 + 3*4] ; ebx = last
mov edi, dword ptr [esp + 12+ 3*4] ; edi = key
mov edx, dword ptr [esp + 16+ 3*4] ; edx = Array

lea eax, [ecx + ebx]
jmp BeginL

EndBad:
pop ebx
pop edi
pop esi
mov eax, -1 ; not found
ret

ALIGN 16 ; Does this work?

LessThan:
lea ebx, [eax - 1]
lea eax, [ecx + eax - 1]
BeginL:
cmp ecx, ebx
ja EndBad
shr eax, 1

cmp edi, [edx + eax * 4]

jl LessThan
lea ecx, [eax + 1]
lea eax, [ebx + eax + 1]

jne BeginL

lea eax, [ecx - 1]

pop ebx
pop edi
pop esi
ret
}
}

int main(int argc, char* argv[])
{
int *arr = new int[ARRAY_SIZE];
int i;

for (i=0;i<ARRAY_SIZE;i++)
arr[i] = i;

DWORD t;

t = GetTickCount();
for (i=0;i<TEST_COUNT;i++)
bs_c(arr, 0, ARRAY_SIZE-1, 3);
t = GetTickCount() - t;
cout << "C version: " << t << " ms\n";


t = GetTickCount();
for (i=0;i<TEST_COUNT;i++)
binary_search_asm(0, ARRAY_SIZE-1, 3, arr);
t = GetTickCount() - t;
cout << "rmullen3's asm version: " << t << " ms\n";

t = GetTickCount();
for (i=0;i<TEST_COUNT;i++)
bs_br(0, ARRAY_SIZE-1, 3, arr);
t = GetTickCount() - t;
cout << "bitRAKE's asm version: " << t << " ms\n";


t = GetTickCount();
for (i=0;i<TEST_COUNT;i++)
bs_asm2(0, ARRAY_SIZE-1, 3, arr);
t = GetTickCount() - t;
cout << "Thomas's asm version: " << t << " ms\n";

delete[] arr;
return 0;
}
[/SIZE]


Thomas
Posted on 2003-03-02 17:05:00 by Thomas
rmullen3,
Some time ago I used similar code with my WndProc:



.data
ALIGN 4 ;
MainMessages DD WM_LBUTTONDOWN, OnMouseMove_1 ; WM_LBUTTONDOWN=201h
DD WM_MOUSEMOVE, OnMouseMove ; WM_MOUSEMOVE=200h
DD WM_TIMER, OnTimer ; WM_TIMER = 113h
DD WM_COMMAND, OnCommand ; WM_COMMAND=111h
DD WM_NCLBUTTONDOWN, OnNclbuttondown ; 0A1h
DD WM_DISPLAYCHANGE, OnDisplayChange ; WM_DISPLAYCHANGE= 7Eh
DD WM_GETMINMAXINFO, OnGetminmaxinfo ; 24h
DD WM_PAINT, OnPaint ; WM_PAINT=0Fh
DD WM_SIZE, OnSize ; WM_SIZE=05h
DD WM_DESTROY, OnDestroy ; WM_DESTROY=02h
DD WM_CREATE, OnCreate ; WM_CREATE=01h
nMessages DD ($-MainMessages)/8 ;
;...............................................................;

.code

;...............................................;
;Align 16 ;
WndProc: ; above 10 sorted messages only!!
cmp dword ptr [esp+8], 201h ; [esp+8]-> uMsg max message
ja DefWindowProc ; out of range?
[B];Binary dword search in sorted array [/B]...........;
mov edx, [esp+8] ; edx=uMsg
xor ecx, ecx ; top <= 0
push ebx ; save register as required by Windows
mov eax, nMessages ; eax=number of messages to do
[B];Align 16[/B] ; !!! Loop must be 16 aligned !!!
lowM: ;
mov ebx, eax ; ebx=>last
add eax, ecx ; ecx=>first
checkM: ;
shr eax, 1 ; divide by 2
cmp ecx, ebx ; done if first == last
jnc WndProc_1 ; not found => jmp to DefWindowProc
cmp [MainMessages+8*eax], edx ; messages are in descending order
;cmp edx, [MainMessages+8*eax] ; messages are in ascending order
jc lowM ;
db 3Eh ; prefix ds:
lea ecx, [eax+1] ; ecx => first
lea eax, [ebx+eax+1] ; ebx=> last
jne checkM ; if not equal loop again
[B];End Binary dword search in sorted array [/B].......;
push esp ; save registers as required by Windows
push ebp ;
push esi ; call correct procedure for the message
push edi ;
call dword ptr [MainMessages+ecx*8-8+4] ;
pop edi ;
pop esi ; [esp+4+24]=hwnd,[esp+8+24]=umsg,
pop ebp ; [esp+12+24]=wparam,[esp+16+24]=lparam
pop esp ;
WndProc_1: ;
pop ebx ; restore registers
jnc DefWindowProc ; if carry set=don?t call DefWinProc eax=exit ode
ret 16 ; ret 16 -> clears the stack from 4 parameters

Loop In memory:
004011D0 8B D8 mov ebx, eax ; D0
004011D2 03 C1 add eax, ecx ; D1
004011D4 D1 E8 shr eax, 1; D0
004011D6 3B CB cmp ecx, ebx ; D1
004011D8 73 13 jae 004011ED ; D0
004011DA 39 14 C5 18 26 40 00 cmp dword ptr [eax*8+402618h], edx ; D0
004011E1 72 ED jb 004011D0 ; D1
004011E3 3E 8D 48 01 lea ecx, ds:[eax+1] ; D0
004011E7 8D 44 18 01 lea eax, [eax+ebx+1] ; D0
004011EB 75 E7 jne 004011D4 ; D1
004011ED
;...............................................;


As you see it is very close to Thomas's decision but MUST be faster

Regards,
Lingo
Posted on 2003-03-02 20:45:49 by lingo12
Your code is confusing to me, lingo. Sorry, I'm not skilled enough to seperate the general search part of it from the other Window like things =) But it looks really interesting! I'll spend some time with it. If it's faster, awesome, I must get max speed =D

I have one more question for now, concerning these lines of code:



shr eax, 1

cmp edi, [edx + eax * 4]


Why doesn't either of following work:



shr eax, 1
shl eax, 2

cmp edi, [edx + eax]




cmp edi, [edx + eax * 2]


Because if you're first dividing eax by 2 (shr eax, 1) then multiplying it by 4, aren't you just multiplying by 2?

Thanks!
Posted on 2003-03-02 21:12:01 by rmullen3
"Why doesn't either of following work:"
Easy! Just substitute eax with: 0,1, 2, 3, 4, 5....and compute it.
Lets edx=1000=const

"Why must it be faster?"
Easy! Just count D0 in both loops ; D0 => CPU clock (expected)

For PPro, PII and PIII here is Thomas's code in memory:


Align 16
LessThan:
[B]004030D0[/B] 8D 58 FF lea ebx, [eax-1] ; D0
004030D3 8D 44 08 FF lea eax, [eax+ecx-1] ; D0
BeginL:
[B]004030D7[/B] 3B CB cmp ecx, ebx ; D1 D0
004030D9 77 10 ja EndBad (4030EBh) ; D0 D1
004030DB D1 E8 shr eax, 1 ; D0
004030DD 3B 3C 82 cmp edi, dword ptr [edx+eax*4];D0
[B]004030E0[/B] 7C EE jl LessThan (4030D0h) ; D0
004030E2 8D 48 01 lea ecx, [eax+1] ; D0
[B]004030E5[/B] 8D 44 19 01 lea eax, [ecx+ebx+1] ; D0
004030E9 75 EC jne BeginL (4030D7h) ; D1
EndBad:

Regards,
Lingo
Posted on 2003-03-02 22:06:09 by lingo12
rmullen3,
I have question about your code, too:


LessThan:
mov ebx, [B]eax[/B] ; last = mid
dec ebx ; last = mid - 1
jmp BeginL

What will be happen if we are at LessThan label and eax = 0?

Regards,
Lingo
Posted on 2003-03-02 23:19:26 by lingo12
;MACRO based on [b]lingo12[/b]'s code:


bs12 MACRO item:REQ, key:REQ, Array:REQ, first:REQ, last:REQ, EndBad:REQ
LOCAL _low, _check

; item : register to receive result index to key item
; key : register of value of key to find
; Array : OFFSET or register pointer to array in memory
; first : register index of first Array item
; last : register index of last Array item
; EndBad: if key is not found jump to this label

mov item, last

ALIGN 16

_low: mov last, item
add item, first
_check: shr item, 1
cmp first, last ; done if first == last
jnc EndBad
cmp [Array + 4*item], key ; descending order
;cmp key, [Array + 4*item] ; ascending order
jc _low
lea first, [item + 1]
lea item, [last + item + 1]
jne _check

lea item, [first - 1] ; found key index
ENDM


bs12 edi, esi, ebx, eax, edx, Not_Found
; EDI is the key, [ebx+edi*4] = esi
Posted on 2003-03-03 00:48:36 by bitRAKE