I was thinking about int approximations of square roots, and thought about the current guess and check methods and how they would fair speed wise with a 64bit integer.
Is very fast, BUT you can't CVT 64bit integers into double FP values. Using the FPU would just make the function slow. Also because it's SIGNED integer the highest value you can do is 7FFFFFFFh.
Here's a square root for all unsigned 32bit values.
Surprisingly it's only 2x slower than the SSE version (after the 15% speed increase from the unroll).
Also in 64bit world the function can be easily modified to get the int sqrt of 64bit integers.
I usually like to comment my code, but unless your familiar with the ambiguous math involved it really wouldn't help much. It relies on the bond and bit patterns between square root, and remaining numbers.
With the extra registers on a 64bit system this function has a lot of room for optimization.
BEFORE SOMEONE ASKS - Why would you need an integer square root? I don't know! :P
IntSqrtSSE:
mov eax,
cvtsi2sd xmm0,eax
sqrtsd xmm0,xmm0
cvtsd2si eax,xmm0
ret 4
Is very fast, BUT you can't CVT 64bit integers into double FP values. Using the FPU would just make the function slow. Also because it's SIGNED integer the highest value you can do is 7FFFFFFFh.
Here's a square root for all unsigned 32bit values.
IntSqrt:
push esi
push edi
mov esi, ; IN number to find sqr root of
xor ecx,ecx ;zero out
xor eax,eax ;
mov edi,10h ;loop counter , use 32 for 64bit int
mov edx,esi
.LP:
shr edx,1eh
shl eax,1
lea ecx,
lea edx, ;2x + 1 part of the binomial expand of (x+1)^2
shl esi,2
cmp edx,ecx
ja .SKIP
sub ecx,edx
inc eax
.SKIP:
mov edx,esi
;unroll 15% speed boost
shr edx,1eh
shl eax,1
lea ecx,
lea edx,
shl esi,2
cmp edx,ecx
ja .SKIP1
sub ecx,edx
inc eax
.SKIP1:
mov edx,esi
sub edi,2
jnz .LP
pop edi
pop esi
ret 4
Surprisingly it's only 2x slower than the SSE version (after the 15% speed increase from the unroll).
Also in 64bit world the function can be easily modified to get the int sqrt of 64bit integers.
I usually like to comment my code, but unless your familiar with the ambiguous math involved it really wouldn't help much. It relies on the bond and bit patterns between square root, and remaining numbers.
With the extra registers on a 64bit system this function has a lot of room for optimization.
BEFORE SOMEONE ASKS - Why would you need an integer square root? I don't know! :P
Why would you need an integer square root?
i have 2 386-based pc without fpu :)
Replace shl eax,1 with add eax,eax.
This should improve pairability and speed !
This should improve pairability and speed !
i found this by tran , written a loong time ago...
ftp://hornet.madtracker.org/mirrors/hornet/code/demosrc/demos/as.zip
quote: (end of file)
; Hey you... Yeah, you reading this message... Ignore this.
;¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦
; Square root
; In:
; EAX - number to take root of
; Out:
; EAX - root
;¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦
sqrtbasetbl db 0,1,4,9,16,25,36,49,64,81,100,121,144,169,196,225
_sqrt:
pushad
mov ebp,eax
bsr ebx,eax
jnz short sqrtf0
xor ebx,ebx
sqrtf0:
shr ebx,3
lea eax,
mov cl,32
sub cl,al
rol ebp,cl
mov eax,ebp
movzx eax,al
mov edi,offset sqrtbasetbl
mov ecx,10h
sqrtl0:
scasb
je short sqrtl0d
jb short sqrtl0d2
loop sqrtl0
inc edi
sqrtl0d2:
dec edi
inc cl
sqrtl0d:
movzx edx,byte ptr
dec cl
xor cl,0fh
mov edi,ecx
mov ecx,ebx
jecxz short sqrtdone
sub eax,edx
sqrtml:
shld eax,ebp,8
rol ebp,8
mov ebx,edi
shl ebx,5
xor edx,edx
mov esi,eax
div ebx
rol edi,4
add edi,eax
add ebx,eax
sqrtf2:
imul eax,ebx
mov edx,eax
mov eax,esi
sub eax,edx
jc short sqrtf1
loop sqrtml
sqrtdone:
mov ,edi
popad
ret
sqrtf1:
dec ebx
dec edi
movzx eax,bl
and al,1fh
jmp sqrtf2
ftp://hornet.madtracker.org/mirrors/hornet/code/demosrc/demos/as.zip
quote: (end of file)
; Hey you... Yeah, you reading this message... Ignore this.
;¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦
; Square root
; In:
; EAX - number to take root of
; Out:
; EAX - root
;¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦
sqrtbasetbl db 0,1,4,9,16,25,36,49,64,81,100,121,144,169,196,225
_sqrt:
pushad
mov ebp,eax
bsr ebx,eax
jnz short sqrtf0
xor ebx,ebx
sqrtf0:
shr ebx,3
lea eax,
mov cl,32
sub cl,al
rol ebp,cl
mov eax,ebp
movzx eax,al
mov edi,offset sqrtbasetbl
mov ecx,10h
sqrtl0:
scasb
je short sqrtl0d
jb short sqrtl0d2
loop sqrtl0
inc edi
sqrtl0d2:
dec edi
inc cl
sqrtl0d:
movzx edx,byte ptr
dec cl
xor cl,0fh
mov edi,ecx
mov ecx,ebx
jecxz short sqrtdone
sub eax,edx
sqrtml:
shld eax,ebp,8
rol ebp,8
mov ebx,edi
shl ebx,5
xor edx,edx
mov esi,eax
div ebx
rol edi,4
add edi,eax
add ebx,eax
sqrtf2:
imul eax,ebx
mov edx,eax
mov eax,esi
sub eax,edx
jc short sqrtf1
loop sqrtml
sqrtdone:
mov ,edi
popad
ret
sqrtf1:
dec ebx
dec edi
movzx eax,bl
and al,1fh
jmp sqrtf2
Posted a realy shorter implementation there:
http://www.asmcommunity.net/board/index.php?topic=25521.msg186474#msg186474
http://www.asmcommunity.net/board/index.php?topic=25521.msg186474#msg186474