I would like the algo experts to see if this code can be improved.

It was just an exercise on 64 bit math, that i have tried to optimize as much as possible.

Nothing very usefull, but i think many people don't manage 64 bit integer math, and it could be helpfull.

Have fun

It was just an exercise on 64 bit math, that i have tried to optimize as much as possible.

Nothing very usefull, but i think many people don't manage 64 bit integer math, and it could be helpfull.

Have fun

```
```

;======================================================================

; File: pr.asm

; Desc: Prints prime numbers starting from input number (64 bit).

; Auth: Towers

; Date: 04/30/2002

;

; Usage: pr 12345678 ( any number, greater than 12 )

;

;-----------------------------------------------------------------------

; Notes:

; Win32 Assembly console project.

; To compile and link with masm32, use:

;

; c:\masm32\bin\ml /c /coff pr.asm

; c:\masm32\bin\LINK -subsystem:console -entry:_start pr.obj

;

;=================================================

;-----------------------------------------------------------------------

; DIRECTIVES

;-----------------------------------------------------------------------

.586

.MODEL flat, stdcall

option casemap:none

;-----------------------------------------------------------------------

; INCLUDES

;-----------------------------------------------------------------------

.nolist

include \masm32\include\Windows.inc

include \masm32\include\masm32.inc

include \masm32\include\kernel32.inc

include \masm32\include\user32.inc

includelib \masm32\lib\user32.lib

includelib \masm32\lib\masm32.lib

includelib \masm32\lib\kernel32.lib

.list

.DATA

only_num BYTE "Only digits from 0 to 9",13,10,0

msg_12 BYTE "Input must be grater than 12",13,10,0

msg BYTE "Prime numbers starting at: "

cmdline BYTE " ",13,10,0

msg4 BYTE ".....................", 13, 10,0

numlo DWORD 0

numhi DWORD 0

numten DWORD 10

root DWORD 0

itera DWORD 20

.CODE

_start PROC

INVOKE GetCommandLineA

@@: mov dl,[eax] ; eax= pointer to command line

cmp dl," "

je espacio ; search for space to start reading

inc eax ; the number entered

jmp @B

espacio:

;------------------------------------

; copy from command line into buffer

;------------------------------------

lea edi,[cmdline] ; buffer to place the ASCII number

inc eax ; start reading after space

sub ecx,ecx

@@: mov dl,[eax+ecx] ; move from command line ....

cmp dl,0h

je finstr

mov [edi+ecx],dl ; ... into buffer, until 0 found

inc ecx ; ECX has number of ascii digits

jmp @B ; found in the buffer

;---------------------------------------------

; get numbers from buffer, checking range 0:9

; and then transform to 64 bit binary

; starts with # of digits in ECX

;----------------------------------------------

finstr:

lea esi,[cmdline]

sub ebx,ebx

@@: mov bl,[esi]

cmp bl,39h

ja msg_num

cmp bl,30h

jb msg_num ; if out of range display msg

and bl,0fh

mov eax,numlo ; this is a 64 bit multiply

mul numten ; by ten

mov numlo,eax ; first multiply lower 32 bits

push edx ; save upper 32 bits from mul

mov eax,numhi

mul numten ; multiply hi 32 bits

mov numhi,eax

pop edx

add numhi,edx ; add the upper bits from 1st mul

add numlo,ebx ; add in the next digit

adc numhi,0 ; from the buffer

inc esi ;

loop @B ; loops until all digits processed

;----------------------------------------------------

; check if resulting 64 bit number is greater than 12

;----------------------------------------------------

mov eax,numhi

cmp eax,0

jnz calc_prime ; numhi must not be zero

mov eax,numlo

cmp eax,13 ; and numlo must be greater than 12

jb msg12 ; print msg saying "must be greater than 12"

;----------------------------------------------------

; start calculating if number is prime

; if number is even (last bit =0 ) decrement

; because i want to start with an odd number

; (even numbers are never prime!)

;----------------------------------------------------

calc_prime: invoke StdOut,addr msg

finit

mov eax,numlo

and eax,1 ; check last bit to see if number is even

cmp eax,1

je primero ; if odd start calculating

sub numlo,1 ; if even decrement

sbb numhi,0

;------------------------------------

; to calculate primes i will divide by different numbers

; only upto SQRT(number)

; if no factors found upto SQRT, no factors will be found.

; to check if ODD number is prime, divide by

; 3,5, then 7,11, 13,17, 19,23 pattern +2 +4 +2+ 4+ 2+4....

; if no factors found --> print prime

;------------------------------------

; Calcula root del numero

otrop:

add numlo,2

adc numhi,0

primero: fild QWORD PTR numlo

fsqrt

fistp root ; found SQRT(number)

sub edx,edx ; 64 bit divide by 3

mov ebx,3

mov eax,numhi

div ebx

mov eax,numlo

div ebx

cmp edx,0

je otrop ; edx = 0 means no remainder, factor found

; so not prime, try with next odd number

; adding to present number

sub edx,edx ; 64 bit divide by 5

mov ebx,5

mov eax,numhi

div ebx

mov eax,numlo

div ebx

cmp edx,0

je otrop ; edx = 0 means no remainder, factor found

mov ebx,5

masdos:

sub edx,edx

add ebx,2 ; start the +2 +4 +2 +4 sequence

mov eax,numhi ; looking for factors

div ebx

mov eax,numlo

div ebx

cmp edx,0

je otrop

sub edx,edx

add ebx,4 ; +4

mov eax,numhi

div ebx

mov eax,numlo

div ebx

cmp edx,0

je otrop

cmp ebx,root ; keep looking for factors upto ROOT

jb masdos ; +2 ......

;---------------------------------------------------

; no factors found - let's print the PRIME number

;---------------------------------------------------

mov ebx,numlo

mov ecx,numhi

lea edi,[msg4+20] ; we have to print backwards

mov esi,10 ; decoding the 64 bit binary to ASCII

@@:

sub edx,edx

mov eax,ecx

div esi ; divide numhi by 10

mov ecx,eax

mov eax,ebx

div esi ; then num lo by 10 using

mov ebx,eax ; first remainder in edx

add dl,30h ; new remainder plus 30h is the digit

mov [edi],dl ; move into output print buffer

dec edi ; decrement position in buffer

cmp eax,0

jne @B ; keep looping until no digits left

INVOKE StdOut,addr msg4 ; print the prime

dec itera ; decrement number of primes

jz fin ; until 20 have been found

jmp otrop ; try to find another

;---------------------------------------------

msg_num:

invoke StdOut,addr only_num

jmp fin

msg12:

invoke StdOut,addr msg_12

jmp fin

fin: INVOKE ExitProcess,

0 ; Result code for parent process

_start ENDP

PUBLIC _start

END

One optimization can be to change your

I think you can also change cases like

by

I think cases such as this:

cmp eax,1

je primero

can be changed by:

I also saw a:

sub numlo,1

that can be changed by:

dec word ptr

OK. This is what I saw at the moment. Just correct me if I have mistakes.

Bueno, mucha suerte con el ejercicio. :grin:

Saludos.

*sub ecx,ecx*by**xor ecx,ecx**. The same with your*sub ebx,ebx*I think you can also change cases like

*d*

mov eax,numhi

cmp eax,0

jnz calc_primemov eax,numhi

cmp eax,0

jnz calc_prime

by

mov eax,numhi

dec eax

js calc_primemov eax,numhi

dec eax

js calc_prime

I think cases such as this:

cmp eax,1

je primero

can be changed by:

dec eax

jz primero

....

....

primero:

inc eax ;If you really need to keep old EAX value

dec eax

jz primero

....

....

primero:

inc eax ;If you really need to keep old EAX value

I also saw a:

sub numlo,1

that can be changed by:

dec word ptr

OK. This is what I saw at the moment. Just correct me if I have mistakes.

Bueno, mucha suerte con el ejercicio. :grin:

Saludos.

One optimization can be to change your sub ecx,ecx by xor ecx,ecx. The same with your sub ebx,ebx

And why this is better ?

:)

cmp edx,0

use

test edx,edx

speed the same but shorter. as option

or edx,edx

also set ZF if edx = 0

Sorry, xor reg,reg is not better than sub reg,reg

:stupid:

:stupid:

I guess there are not many comments.:(

I can change a sub for a dec, or a cmp for a test, but that doesn't improve inner loop or the algorithm.

I was asking if maybe there is a better way to do the 64 math, such as the divides in the inner loop than are called millions of times for big primes.

Any ideas ?

I can change a sub for a dec, or a cmp for a test, but that doesn't improve inner loop or the algorithm.

I was asking if maybe there is a better way to do the 64 math, such as the divides in the inner loop than are called millions of times for big primes.

Any ideas ?

I understand you and promisse to have a close look later.

Can you send a copy of sourse as asm file or archive,

I faild to copy it from screen.

Can you send a copy of sourse as asm file or archive,

I faild to copy it from screen.

Many, many years ago I was "challenged" to beat some routines that calculated n prime numbers as quickly as possible. At that time I didn't have a girlfriend so I *had* free time ;) thus I accepted.

The CPU was a 65xx.

I mumbled mumbled a bit and found what to me seemed the fastest solution or, at least, was hundreds times faster than the other proposed ones, at least in the given conditions of the challenge, so I've been happy with this and didn't try to improve it.

In short how it worked is: I create/use an array of "flags", where a flag can mean "prime number" or "not prime number". Each flag is 1 bit wide. So if bit 2 of this big array of bits is clear, then it means that the number 2 is a prime number. If bit 18 is set, then it means than number 18 is not a prime number. I start with the whole array initialized as "prime number", as if they all were prime numbers.

Then I scan 8 (64 today with MMX) bits at a time, and (to simplify the concept) OR each even bit. That will make "non prime number" every multiple of 2. I then go back and do this every 3 bits.. using a mask like this:

00100100 10010010 01001001

i.e. every multiple of 3 will be marked as "non prime number".

It's fast.. it's just OR instructions, for the most.

Today CPU's have caches, so you better work on a block at a time, where the block isn't bigger than a certain size.

Moreover, in my implementation I made things even 2 times as fast, by exploiting the fact that all even numbers aren't prime anyway.. and there are further clever optimizations possible. It was lotsa years ago.. I wish I could find the 65xx source of it somewhere, I've got a box full of papers in the cellar.

I hope that helps.

The CPU was a 65xx.

I mumbled mumbled a bit and found what to me seemed the fastest solution or, at least, was hundreds times faster than the other proposed ones, at least in the given conditions of the challenge, so I've been happy with this and didn't try to improve it.

In short how it worked is: I create/use an array of "flags", where a flag can mean "prime number" or "not prime number". Each flag is 1 bit wide. So if bit 2 of this big array of bits is clear, then it means that the number 2 is a prime number. If bit 18 is set, then it means than number 18 is not a prime number. I start with the whole array initialized as "prime number", as if they all were prime numbers.

Then I scan 8 (64 today with MMX) bits at a time, and (to simplify the concept) OR each even bit. That will make "non prime number" every multiple of 2. I then go back and do this every 3 bits.. using a mask like this:

00100100 10010010 01001001

i.e. every multiple of 3 will be marked as "non prime number".

**<edit>***It's implicitly clear (but I better write it explicitly maybe) that then we continue this process with 5, etc..*

In substance we skipped 4 because we found it (thanks to the previous computations) to be already a "non prime number", and thus there was no need to check for its multiples.. being them certainly "non prime numbers" as well. That's why the went directly to 5.. and so onIn substance we skipped 4 because we found it (thanks to the previous computations) to be already a "non prime number", and thus there was no need to check for its multiples.. being them certainly "non prime numbers" as well. That's why the went directly to 5.. and so on

**</edit>**It's fast.. it's just OR instructions, for the most.

Today CPU's have caches, so you better work on a block at a time, where the block isn't bigger than a certain size.

Moreover, in my implementation I made things even 2 times as fast, by exploiting the fact that all even numbers aren't prime anyway.. and there are further clever optimizations possible. It was lotsa years ago.. I wish I could find the 65xx source of it somewhere, I've got a box full of papers in the cellar.

I hope that helps.

**Maverick**, I've done the same algo on 680x0, x86, but not MMX.

**Hutch**, has a version on his web page - not by me.

Well, I didn't know.. to me (at that time) it seemed the more straightforward and "low level", thus best solution.

We learn similarly - that is very cool.

Thanks The Svin.

Glad to hear that you can still communicate with the world.

Here is the asm code as attachment.

I know that this is kind of "brute force" calculation of prime,

and that there are more elegant ways of doing it.

I'll try another method next time.

As i said this was more of an experiment to see if all my 64 bit math was working OK, I had a few problems with it, but now it seems to be working perfect.

Primes upto 15 digits are very fast, with 20 digits it takes some seconds to find each prime (i'm working with a PIII 750).

Glad to hear that you can still communicate with the world.

Here is the asm code as attachment.

I know that this is kind of "brute force" calculation of prime,

and that there are more elegant ways of doing it.

I'll try another method next time.

As i said this was more of an experiment to see if all my 64 bit math was working OK, I had a few problems with it, but now it seems to be working perfect.

Primes upto 15 digits are very fast, with 20 digits it takes some seconds to find each prime (i'm working with a PIII 750).

Maverick

The biggest number i can handle with my algorithm is aprox 9.22E18 (9220000000000000000).

First prime found using this number as input is

9220000000000000039

If i want to create a bit array and start to check off primes how would i start to manage this. I would need 1E18 bytes or 1000000000 gigabytes!!!

There must be another (simple) way for checking big primes.

The biggest number i can handle with my algorithm is aprox 9.22E18 (9220000000000000000).

First prime found using this number as input is

9220000000000000039

If i want to create a bit array and start to check off primes how would i start to manage this. I would need 1E18 bytes or 1000000000 gigabytes!!!

There must be another (simple) way for checking big primes.

We learn similarly - that is very cool.

BTW: I looked at Hutch site but couldn't find the x86 routine. Do you have a direct link please?

BTW: I looked at Hutch site but couldn't find the x86 routine. Do you have a direct link please?

(Third from the bottom, called primes.zip)

I did a rewrite some time back, with a great speed increase over this one, but have since lost that HD - it is still sitting here in the hopes of reading it one day. :(

**towers**, how serious are you about primes?

http://www.mersenne.org/source.htm

...large chunk of the source is ASM - what else could it be. :)

http://www.mersenne.org/math.htm

...this page is a good read.

Maverick

The biggest number i can handle with my algorithm is aprox 9.22E18 (9220000000000000000).

First prime found using this number as input is

9220000000000000039

If i want to create a bit array and start to check off primes how would i start to manage this. I would need 1E18 bytes or 1000000000 gigabytes!!!

There must be another (simple) way for checking big primes.

Thank you. :)

Now, a earthquake that destroys both computers.. hmm. :tongue:I did a rewrite some time back, with a great speed increase over this one, but have since lost that HD - it is still sitting here in the hopes of reading it one day. :(

Hard disk crashes are one of the worst things that can happen.. I've solved my problem (i.e. "I can't live making backups every day") by having two PC's, one at my house and one at my gf's house, which are "cloned".. so I just update this or that depending where I worked that half day, and if one dies I've a full working copy in the other house. :)
Now, a earthquake that destroys both computers.. hmm. :tongue:

**
**

Rare find on P4 testing by ASM programmer:

http://www.mersenne.org/gimps/p4notes.doc

...I found it a good read.

http://www.mersenne.org/gimps/p4notes.doc

...I found it a good read.

I gave it a quick look.. interesting.. when I have some time I'll read it well.

BTW: you probably++ already know, but Intel nicely sends real (paper) book versions of its CPU books. Just got 248966-04 (Intel Pentium-IV and Intel Xeon Processor Optimization Reference Manual). The 3 books on the IA32 are backordered, and should arrive soon. Paper is much better than PDF, can bring it with me when I'm in holiday (NO! Assuming I'll have any holiday anyway). :)

BTW: you probably++ already know, but Intel nicely sends real (paper) book versions of its CPU books. Just got 248966-04 (Intel Pentium-IV and Intel Xeon Processor Optimization Reference Manual). The 3 books on the IA32 are backordered, and should arrive soon. Paper is much better than PDF, can bring it with me when I'm in holiday (NO! Assuming I'll have any holiday anyway). :)

Maverick, don't you have to call Intel by phone to get them to send

you the manuals? Or is there some online ordering page?

you the manuals? Or is there some online ordering page?

I did it via WEB.. maybe it was available only temporarily, dunno. Worth a check anyway! Sorry, don't have any link.. I was just browsing, found the link, clicked, wrote my address and voilat. ;)

PS: What I recall was a button with a text like "Order a hardcopy of this document".

PS: What I recall was a button with a text like "Order a hardcopy of this document".