I would like the algo experts to see if this code can be improved.
It was just an exercise on 64 bit math, that i have tried to optimize as much as possible.
Nothing very usefull, but i think many people don't manage 64 bit integer math, and it could be helpfull.

Have fun




;======================================================================
; File: pr.asm
; Desc: Prints prime numbers starting from input number (64 bit).
; Auth: Towers
; Date: 04/30/2002
;
; Usage: pr 12345678 ( any number, greater than 12 )
;
;-----------------------------------------------------------------------
; Notes:
; Win32 Assembly console project.
; To compile and link with masm32, use:
;
; c:\masm32\bin\ml /c /coff pr.asm
; c:\masm32\bin\LINK -subsystem:console -entry:_start pr.obj
;
;=================================================

;-----------------------------------------------------------------------
; DIRECTIVES
;-----------------------------------------------------------------------

.586
.MODEL flat, stdcall
option casemap:none

;-----------------------------------------------------------------------
; INCLUDES
;-----------------------------------------------------------------------
.nolist
include \masm32\include\Windows.inc
include \masm32\include\masm32.inc
include \masm32\include\kernel32.inc
include \masm32\include\user32.inc

includelib \masm32\lib\user32.lib
includelib \masm32\lib\masm32.lib
includelib \masm32\lib\kernel32.lib
.list

.DATA

only_num BYTE "Only digits from 0 to 9",13,10,0
msg_12 BYTE "Input must be grater than 12",13,10,0
msg BYTE "Prime numbers starting at: "
cmdline BYTE " ",13,10,0
msg4 BYTE ".....................", 13, 10,0
numlo DWORD 0
numhi DWORD 0
numten DWORD 10
root DWORD 0
itera DWORD 20

.CODE

_start PROC

INVOKE GetCommandLineA

@@: mov dl,[eax] ; eax= pointer to command line
cmp dl," "
je espacio ; search for space to start reading
inc eax ; the number entered
jmp @B

espacio:
;------------------------------------
; copy from command line into buffer
;------------------------------------

lea edi,[cmdline] ; buffer to place the ASCII number

inc eax ; start reading after space
sub ecx,ecx
@@: mov dl,[eax+ecx] ; move from command line ....
cmp dl,0h
je finstr
mov [edi+ecx],dl ; ... into buffer, until 0 found
inc ecx ; ECX has number of ascii digits
jmp @B ; found in the buffer

;---------------------------------------------
; get numbers from buffer, checking range 0:9
; and then transform to 64 bit binary
; starts with # of digits in ECX
;----------------------------------------------
finstr:
lea esi,[cmdline]
sub ebx,ebx
@@: mov bl,[esi]
cmp bl,39h
ja msg_num
cmp bl,30h
jb msg_num ; if out of range display msg
and bl,0fh

mov eax,numlo ; this is a 64 bit multiply
mul numten ; by ten
mov numlo,eax ; first multiply lower 32 bits
push edx ; save upper 32 bits from mul
mov eax,numhi
mul numten ; multiply hi 32 bits
mov numhi,eax
pop edx
add numhi,edx ; add the upper bits from 1st mul

add numlo,ebx ; add in the next digit
adc numhi,0 ; from the buffer
inc esi ;
loop @B ; loops until all digits processed

;----------------------------------------------------
; check if resulting 64 bit number is greater than 12
;----------------------------------------------------
mov eax,numhi
cmp eax,0
jnz calc_prime ; numhi must not be zero
mov eax,numlo
cmp eax,13 ; and numlo must be greater than 12
jb msg12 ; print msg saying "must be greater than 12"

;----------------------------------------------------
; start calculating if number is prime
; if number is even (last bit =0 ) decrement
; because i want to start with an odd number
; (even numbers are never prime!)
;----------------------------------------------------

calc_prime: invoke StdOut,addr msg
finit

mov eax,numlo
and eax,1 ; check last bit to see if number is even
cmp eax,1
je primero ; if odd start calculating

sub numlo,1 ; if even decrement
sbb numhi,0

;------------------------------------
; to calculate primes i will divide by different numbers
; only upto SQRT(number)
; if no factors found upto SQRT, no factors will be found.
; to check if ODD number is prime, divide by
; 3,5, then 7,11, 13,17, 19,23 pattern +2 +4 +2+ 4+ 2+4....
; if no factors found --> print prime
;------------------------------------

; Calcula root del numero
otrop:
add numlo,2
adc numhi,0

primero: fild QWORD PTR numlo
fsqrt
fistp root ; found SQRT(number)


sub edx,edx ; 64 bit divide by 3
mov ebx,3
mov eax,numhi
div ebx
mov eax,numlo
div ebx
cmp edx,0
je otrop ; edx = 0 means no remainder, factor found
; so not prime, try with next odd number
; adding to present number

sub edx,edx ; 64 bit divide by 5
mov ebx,5
mov eax,numhi
div ebx
mov eax,numlo
div ebx
cmp edx,0
je otrop ; edx = 0 means no remainder, factor found

mov ebx,5
masdos:
sub edx,edx
add ebx,2 ; start the +2 +4 +2 +4 sequence
mov eax,numhi ; looking for factors
div ebx
mov eax,numlo
div ebx
cmp edx,0
je otrop

sub edx,edx
add ebx,4 ; +4
mov eax,numhi
div ebx
mov eax,numlo
div ebx
cmp edx,0
je otrop
cmp ebx,root ; keep looking for factors upto ROOT
jb masdos ; +2 ......

;---------------------------------------------------
; no factors found - let's print the PRIME number
;---------------------------------------------------
mov ebx,numlo
mov ecx,numhi

lea edi,[msg4+20] ; we have to print backwards
mov esi,10 ; decoding the 64 bit binary to ASCII
@@:
sub edx,edx
mov eax,ecx
div esi ; divide numhi by 10
mov ecx,eax

mov eax,ebx
div esi ; then num lo by 10 using
mov ebx,eax ; first remainder in edx
add dl,30h ; new remainder plus 30h is the digit
mov [edi],dl ; move into output print buffer
dec edi ; decrement position in buffer
cmp eax,0
jne @B ; keep looping until no digits left

INVOKE StdOut,addr msg4 ; print the prime
dec itera ; decrement number of primes
jz fin ; until 20 have been found

jmp otrop ; try to find another

;---------------------------------------------

msg_num:
invoke StdOut,addr only_num
jmp fin

msg12:
invoke StdOut,addr msg_12
jmp fin

fin: INVOKE ExitProcess,
0 ; Result code for parent process

_start ENDP

PUBLIC _start

END

Posted on 2002-04-30 18:10:09 by towers
One optimization can be to change your sub ecx,ecx by xor ecx,ecx. The same with your sub ebx,ebx

I think you can also change cases like
mov eax,numhi
cmp eax,0
jnz calc_prime
d

by

mov eax,numhi
dec eax
js calc_prime


I think cases such as this:
cmp eax,1
je primero
can be changed by:

dec eax
jz primero
....
....
primero:
inc eax ;If you really need to keep old EAX value


I also saw a:
sub numlo,1
that can be changed by:
dec word ptr

OK. This is what I saw at the moment. Just correct me if I have mistakes.

Bueno, mucha suerte con el ejercicio. :grin:

Saludos.
Posted on 2002-04-30 19:51:58 by CodeLover

One optimization can be to change your sub ecx,ecx by xor ecx,ecx. The same with your sub ebx,ebx

And why this is better ?
:)

cmp edx,0

use
test edx,edx
speed the same but shorter. as option
or edx,edx
also set ZF if edx = 0
Posted on 2002-04-30 20:23:47 by The Svin
Sorry, xor reg,reg is not better than sub reg,reg
:stupid:
Posted on 2002-04-30 20:51:55 by CodeLover
I guess there are not many comments.:(
I can change a sub for a dec, or a cmp for a test, but that doesn't improve inner loop or the algorithm.

I was asking if maybe there is a better way to do the 64 math, such as the divides in the inner loop than are called millions of times for big primes.

Any ideas ?
Posted on 2002-05-01 15:22:04 by towers
I understand you and promisse to have a close look later.
Can you send a copy of sourse as asm file or archive,
I faild to copy it from screen.
Posted on 2002-05-01 15:38:43 by The Svin
Many, many years ago I was "challenged" to beat some routines that calculated n prime numbers as quickly as possible. At that time I didn't have a girlfriend so I *had* free time ;) thus I accepted.

The CPU was a 65xx.

I mumbled mumbled a bit and found what to me seemed the fastest solution or, at least, was hundreds times faster than the other proposed ones, at least in the given conditions of the challenge, so I've been happy with this and didn't try to improve it.

In short how it worked is: I create/use an array of "flags", where a flag can mean "prime number" or "not prime number". Each flag is 1 bit wide. So if bit 2 of this big array of bits is clear, then it means that the number 2 is a prime number. If bit 18 is set, then it means than number 18 is not a prime number. I start with the whole array initialized as "prime number", as if they all were prime numbers.

Then I scan 8 (64 today with MMX) bits at a time, and (to simplify the concept) OR each even bit. That will make "non prime number" every multiple of 2. I then go back and do this every 3 bits.. using a mask like this:

00100100 10010010 01001001

i.e. every multiple of 3 will be marked as "non prime number".

<edit>It's implicitly clear (but I better write it explicitly maybe) that then we continue this process with 5, etc..
In substance we skipped 4 because we found it (thanks to the previous computations) to be already a "non prime number", and thus there was no need to check for its multiples.. being them certainly "non prime numbers" as well. That's why the went directly to 5.. and so on
</edit>

It's fast.. it's just OR instructions, for the most.

Today CPU's have caches, so you better work on a block at a time, where the block isn't bigger than a certain size.

Moreover, in my implementation I made things even 2 times as fast, by exploiting the fact that all even numbers aren't prime anyway.. and there are further clever optimizations possible. It was lotsa years ago.. I wish I could find the 65xx source of it somewhere, I've got a box full of papers in the cellar.

I hope that helps.
Posted on 2002-05-01 18:03:08 by Maverick
Maverick, I've done the same algo on 680x0, x86, but not MMX.
Hutch, has a version on his web page - not by me.
Posted on 2002-05-01 19:28:17 by bitRAKE
Well, I didn't know.. to me (at that time) it seemed the more straightforward and "low level", thus best solution.
Posted on 2002-05-02 04:11:30 by Maverick
We learn similarly - that is very cool.
Posted on 2002-05-02 07:14:45 by bitRAKE
Thanks The Svin.
Glad to hear that you can still communicate with the world.

Here is the asm code as attachment.


I know that this is kind of "brute force" calculation of prime,
and that there are more elegant ways of doing it.
I'll try another method next time.

As i said this was more of an experiment to see if all my 64 bit math was working OK, I had a few problems with it, but now it seems to be working perfect.

Primes upto 15 digits are very fast, with 20 digits it takes some seconds to find each prime (i'm working with a PIII 750).
Posted on 2002-05-02 08:45:31 by towers
Maverick

The biggest number i can handle with my algorithm is aprox 9.22E18 (9220000000000000000).

First prime found using this number as input is
9220000000000000039

If i want to create a bit array and start to check off primes how would i start to manage this. I would need 1E18 bytes or 1000000000 gigabytes!!!

There must be another (simple) way for checking big primes.
Posted on 2002-05-02 13:59:20 by towers

We learn similarly - that is very cool.
It happened more times and, yup, it's someway an indication that things can't be made faster. ;)

BTW: I looked at Hutch site but couldn't find the x86 routine. Do you have a direct link please?
Posted on 2002-05-02 16:57:58 by Maverick

BTW: I looked at Hutch site but couldn't find the x86 routine. Do you have a direct link please?
http://www.movsd.com/source.htm
(Third from the bottom, called primes.zip)

I did a rewrite some time back, with a great speed increase over this one, but have since lost that HD - it is still sitting here in the hopes of reading it one day. :(

towers, how serious are you about primes?
http://www.mersenne.org/source.htm
...large chunk of the source is ASM - what else could it be. :)
http://www.mersenne.org/math.htm
...this page is a good read.
Posted on 2002-05-02 17:07:18 by bitRAKE

Maverick

The biggest number i can handle with my algorithm is aprox 9.22E18 (9220000000000000000).

First prime found using this number as input is
9220000000000000039

If i want to create a bit array and start to check off primes how would i start to manage this. I would need 1E18 bytes or 1000000000 gigabytes!!!

There must be another (simple) way for checking big primes.
Maybe you could check some book on number theory on your local library.. or use those wonderful internet search engines. ;)
Posted on 2002-05-02 17:17:53 by Maverick

http://www.movsd.com/source.htm
(Third from the bottom, called primes.zip)
Thank you. :)
I did a rewrite some time back, with a great speed increase over this one, but have since lost that HD - it is still sitting here in the hopes of reading it one day. :(
Hard disk crashes are one of the worst things that can happen.. I've solved my problem (i.e. "I can't live making backups every day") by having two PC's, one at my house and one at my gf's house, which are "cloned".. so I just update this or that depending where I worked that half day, and if one dies I've a full working copy in the other house. :)
Now, a earthquake that destroys both computers.. hmm. :tongue:
Posted on 2002-05-02 17:23:11 by Maverick
Rare find on P4 testing by ASM programmer:
http://www.mersenne.org/gimps/p4notes.doc
...I found it a good read.
Posted on 2002-05-02 17:26:08 by bitRAKE
I gave it a quick look.. interesting.. when I have some time I'll read it well.

BTW: you probably++ already know, but Intel nicely sends real (paper) book versions of its CPU books. Just got 248966-04 (Intel Pentium-IV and Intel Xeon Processor Optimization Reference Manual). The 3 books on the IA32 are backordered, and should arrive soon. Paper is much better than PDF, can bring it with me when I'm in holiday (NO! Assuming I'll have any holiday anyway). :)
Posted on 2002-05-02 18:05:01 by Maverick
Maverick, don't you have to call Intel by phone to get them to send
you the manuals? Or is there some online ordering page?
Posted on 2002-05-02 18:12:08 by f0dder
I did it via WEB.. maybe it was available only temporarily, dunno. Worth a check anyway! Sorry, don't have any link.. I was just browsing, found the link, clicked, wrote my address and voilat. ;)

PS: What I recall was a button with a text like "Order a hardcopy of this document".
Posted on 2002-05-02 18:16:36 by Maverick