woah, nice algo!! This is one of those problems that looks easy but you have to sit down and think about it.
Posted on 2003-05-05 16:10:08 by x86asm
x86asm,
"This is one of those problems that looks easy but you have
to sit down and think about it."

We have already correct algorithm (in C) and
it is not a big deal to make our proc fastest...Here:


; Work out what the day of the week is for a given date
; Speed optimized x86 assembly (PIII) by Lingo
;
; C Algorithm by Tomohiko Sakamoto
; int dayofweek(int d, int m, int y) {
; static int t[] = { 0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4 };
; y -= m < 3;
; return ( y + y/4 - y/100 + y/400 + t[m-1] + d) % 7;
; }
; year = [1, 9999+]
; month = [1, 12]
; day = [1, 31]

DayOfWeekL PROTO year:DWORD, month:DWORD, day:DWORD

.data
Align 4
monthdata dword 0,3,2,5,0,3,5,1,4,6,2,4
.code
OPTION PROLOGUE:NONE ; turn it off
OPTION EPILOGUE:NONE ;
Align 16 ; Align 16 before the proc
DayOfWeekL PROC year:DWORD, month:DWORD, day:DWORD
mov ecx, [esp+8] ; ecx=month
mov edx, [esp+4] ; edx=year
cmp ecx, 3 ; test if month is < 3
sbb edx, 0 ; if month < 3 then dec edx->years
mov ecx, [monthdata + ecx*4-4] ; ecx-> monthdata[m-1]
mov eax, edx ; eax=edx=years
imul edx, 0A3D8h ; dividing edx=years by 100 == imul edx, 0A3D8h / shr edx, 22
add ecx, [esp+12] ; add day; ecx= monthdata[m-1] + day
add ecx, eax ; ecx-> monthdata[m-1] + day + years
shr eax, 2 ; eax= years/4
add eax, ecx ; eax= monthdata[m-1] + day + years + years/4
mov ecx, [esp] ; ecx->return address
mov [esp+12], ecx ; ecx->return address to right place
shr edx, 22 ; end of dividing ; edx=years/100
sub eax, edx ; eax= monthdata[m-1] +day + years + years/4 - years/100
shr edx, 2 ; edx=years/400
add eax, edx ; eax= monthdata[m-1] +day + years + years/4 - years/100 + years/400
mov edx, 24924925h ; dividing eax by 7 == mul edx, eax ; 24924925h=dword (1/7 * 2^35)
mov ecx, eax ; ecx= monthdata[m-1] + day + years + years/4 - years/100 + years/400
mul edx ; dividing eax by 7 ->result in edx
mov eax, ecx ; eax= monthdata[m-1] + day + years + years/4 - years/100 + years/400
lea ecx, [edx+edx*2] ; multiplying edx by 7 ->
lea edx, [ecx+edx*4] ; ->result in edx
add esp, 3*4 ; clearing the stack from 3 dword parameters
sub eax, edx ; eax= (monthdata[m-1] + day + years + years/4 - years/100 + years/400) % 7
ret ; faster return then return 3*4
DayOfWeekL ENDP ;
OPTION PROLOGUE:PROLOGUEDEF ; turn back on the defaults
OPTION EPILOGUE:EPILOGUEDEF ;
[B]Notes:[/B]
- Dividing by 100 -> imul edx, 0A3D8h / shr edx, 22 works
for all edx in range 0 to AAB2h(43 698)
- Dividing by 7 -> mov edx, 24924925h / mul edx works
for all eax in range 0 to 55555559h(1 431 655 769)

The proc is 80 bytes long and can execute in 19 clocks.
Let's analyze it in memory:


[B]- Instruction fetch and decoding (11 clocks)[/B]
To mark where each ifetch block begins we need [B]Align 16[/B] before
the proc (we have to know where the first ifetch block begins):

The first ifetch block begins at 4012E0h. It ends with an unfinished
instruction at 4012F0h and the next block will begin at the beginning
of this instruction (4012EEh).
The second ifetch block begins at 4012EEh. It ends with an unfinished
instruction at 4012FEh and the next block will begin at the beginning
of this instruction (4012FD).
The third ifetch block begins at 4012FDh. It ends with an unfinished
instruction at 40120Dh and the next block will begin at the beginning
of this instruction (40130B).
The fourth block begins at 40130Bh. It ends with an unfinished
instruction at 40131Bh and the next block will begin at the beginning
of this instruction (401319).
The fifth block begins at 401319h. It ends with an unfinished
instruction at 401329h and the next block will begin at the beginning
of this instruction (401327).
The sixth block begins at 401327h and covers the rest of the proc.

We write D0 as expected decoder at 4012E0h, 4012EEh, 4012FDh, 40130Bh,
401319h and 401327.

Three of the instructions generate 2 mops[B]
(sbb edx, 0 / add ecx, dword ptr [esp+0Ch] / mov dword ptr [esp+0Ch], ecx)[/B]
and one instruction generates 4 mops [B](ret)[/B]. They must go into decoder D0.

Every ifetch block (without the second) contained two decode blocks (see below)
The number of clock cycles it takes to decode this is the number of D0
instructions, which are 11.
We haven't loop so that no further analysis of instruction fetching is needed.

We can reduce decode clock cycles by manipulating instruction lengths,
but in this case it is useless because fetch and decoding are not the bottleneck.

[B]Align 16[/B]
[B]4012E0[/B] 8B4C2408 mov ecx, dword ptr [esp+8] ;[B]D0[/B] 1mop
4012E4 8B542404 mov edx, dword ptr [esp+4] ;D1 1mop
4012E8 83F903 cmp ecx, 3 ;D2 1mop
4012EB 83DA00 sbb edx, 0 ;[B]D0[/B] 2mops

[B]4012EE[/B] 8B0C8D448B4000 mov ecx, dword ptr [ecx*4+408B44h] ;[B]D0[/B] 1mop
4012F5 8BC2 mov eax, edx ;D1 1mop
4012F7 69D2D8A30000 imul edx, edx, 0A3D8h ;D2 1mop

[B]4012FD[/B] 034C240C add ecx, dword ptr [esp+0Ch] ;[B]D0[/B] 2mops
401301 03C8 add ecx, eax ;D1 1mop
401303 C1E802 shr eax, 2 ;D2 1mop
401306 03C1 add eax, ecx ;[B]D0[/B] 1mop
401308 8B0C24 mov ecx, dword ptr [esp] ;D1 1mop

[B]40130B[/B] 894C240C mov dword ptr [esp+0Ch], ecx ;[B]D0[/B] 2mops
40130F C1EA16 shr edx, 16h ;D1 1mop
401312 2BC2 sub eax, edx ;D2 1mop
401314 C1EA02 shr edx, 2 ;[B]D0[/B] 1mop
401317 03C2 add eax, edx ;D1 1mop

[B]401319[/B] BA25499224 mov edx, 24924925h ;[B]D0[/B] 1mop
40131E 8BC8 mov ecx, eax ;D1 1mop
401320 F7E2 mul eax, edx ;D2 1mop
401322 8BC1 mov eax, ecx ;[B]D0[/B] 1mop
401324 8D0C52 lea ecx, [edx+edx*2] ;D1 1mop

[B]401327[/B] 8D1491 lea edx, [ecx+edx*4] ;[B]D0[/B] 1mop
40132A 83C40C add esp, 0Ch ;D1 1mop
40132D 2BC2 sub eax, edx ;D2 1mop
40132F C3 ret ;[B]D0[/B] 4mops
Length: 40132Fh-4012E0h+1=50h=80 bytes ;Total: [B]11 D0[/B]s


[B]- Register stalls[/B]
No register is read in this proc without being written to at least
a few clock cycles before, so there can be no register read stalls.

[B]- Execution (19 clocks)[/B]
Counting the mops for the different ports we get:
port 0 or 1: 12 mops
port 0 only: 7 mops
port 2: 5 mops
port 3: 1 mop
port 4: 1 mop
Assuming that the mops that can go to either port 0 or 1 are distributed
optimally, the execution time will be 10 clocks, but we have dependency
chains and ports 0 and 1 are saturated, so
the execution time will be 14 clocks(see below).

[B]Note:[/B]
A series of instructions where each instruction depends on the result of
the preceding one is called a dependency chain. Long dependency chains
prevent out-of-order and parallel execution.

4012E0 8B4C2408 mov ecx, dword ptr [esp+8] ;1mop p2 rESPwECX

4012E4 8B542404 mov edx, dword ptr [esp+4] ;1mop p2 rESPwEDX
4012E8 83F903 cmp ecx, 3 ;1mop p01 rECXwF

4012EB 83DA00 sbb edx, 0 ;2mops p01 rwEDXrwF
4012EE 8B0C8D448B4000 mov ecx, dword ptr [ecx*4+408B44h];1mop p2 rwECX

4012F5 8BC2 mov eax, edx ;1mop p01 rEDXwEAX
4012F7 69D2D8A30000 imul edx, edx, 0A3D8h ;1mop p0 rwEDXwF

4012FD 034C240C add ecx, dword ptr [esp+0Ch] ;2mops p01 p2 rESPrwECXwF

401301 03C8 add ecx, eax ;1mop p01 rEAXrwECXwF
401303 C1E802 shr eax, 2 ;1mop p0 rwEAXwF

401306 03C1 add eax, ecx ;1mop p01 rECXrwEAXwF
401308 8B0C24 mov ecx, dword ptr [esp] ;1mop p2 rESPwECX

40130B 894C240C mov dword ptr [esp+0Ch], ecx ;2mops p3 p4 rECXrESP
40130F C1EA16 shr edx, 16h ;1mop p0 rwEDXwF

401312 2BC2 sub eax, edx ;1mop p01 rEDXrwEAX
401314 C1EA02 shr edx, 2 ;1mop p0 rwEDXwF

401317 03C2 add eax, edx ;1mop p01 rEDXrwEAXwF
401319 BA25499224 mov edx, 24924925h ;1mop p01 wEDX

40131E 8BC8 mov ecx, eax ;1mop p01 rEAXwECX
401320 F7E2 mul eax, edx ;1mop p0 rwEAXrwEDXwF

401322 8BC1 mov eax, ecx ;1mop p01 rECXwEAX
401324 8D0C52 lea ecx, [edx+edx*2] ;1mop p0 rEDXwECX

401327 8D1491 lea edx, [ecx+edx*4] ;1mop p0 rECXrwEDX
40132A 83C40C add esp, 0Ch ;1mop p01 rwESPwF

40132D 2BC2 sub eax, edx ;1mop p01 rEDXrwEAX

40132F C3 ret

We must add 8 clocks since the slow imul and mul instructions have a delay of
4 clocks each. The execution time will be 14 clocks + 8 = 22 clocks

Five instructions after [B]imul[/B] (3 clocks) are executed "for free" because they
are in the 4 clocks delay of [B]imul edx, edx, 0A3D8h[/B] and don't depend
on [B]imul[/B] instruction.

The expected execution time will be 14 clocks + 8 - 3 = 19 clocks.


[B]- Retirement (10 clocks)[/B]
The time needed for retirement is the number of mops divided by 3, and rounded
up to nearest integer. This gives 28/3 =10 clocks for retirement.

In conclusion, the proc can execute in 19 clocks.


Regards,
Lingo
Posted on 2003-05-09 21:09:34 by lingo12
lingo12, the most wonderful part of your algos are the explainations! :alright:
(I have saved the whole thing and posted in the snippet library - here -)

Takes 12 cycles on an Athlon. Hard to get faster. :)

; Divide by 100

imul edx, eax, 147AFh ; valid until C800 (51 200)
shr edx, 23

; Divide by 7
imul edx, eax, 12493h ; valid until E000 (57 344)
shr edx, 19
Posted on 2003-05-09 22:03:42 by bitRAKE
Thank you bitRAKE,

"; Divide by 100
imul edx, eax, 147AFh ; valid until C800 (51 200)
shr edx, 23"


imul EDX, EDX, 0A3D8h / SHR EDX,22
This free EAX register and I use it for "parallel" execution of next code.


"; Divide by 7
imul edx, eax, 12493h ; valid until E000 (57 344)
shr edx, 19 "


with:
"mov edx, 24924925h
mul edx"
I just avoid SHR EDX, b instruction in the end.

Regards,
Lingo
Posted on 2003-05-10 07:56:09 by lingo12
lingo12, yes I saw the use of MUL in the end to save the the SHR - there are already enough dependancies there - creative choice.
imul EDX, EDX, 0A3D8h / SHR EDX,22
This free EAX register and I use it for "parallel" execution of next code.
I did not know this was a problem on P3 - does not appear to effect the execution speed on Athlon.

Save a byte?:
	add	eax, edx

shl edx, 3
add esp, 3*4
sub eax, edx
ret
I don't think the proc works for Year=0 and Month < 3.
Posted on 2003-05-10 08:58:34 by bitRAKE
bitRAKE,
"I don't think the proc works for Year=0 and Month < 3."

Just use years between 1 to 9999

It is old "war" -> Millennium Start = 2000 or 2001?
http://www.hermetic.ch/cal_stud/beattie.htm

If you want to use year 0 you can add
"adc edx, 0" after "sbb edx,0", but it is just more code...
  mov  ecx, eax                   ; ecx= monthdata[m-1]  + day + years + years/4 - years/100 + years/400

mul edx ; dividing eax by 7 ->result in edx[B]
lea eax, [edx+ecx] ; add n/7 to eax(n)
shl edx, 3 ; edx= n/7 * 8[/B]
; mov eax, ecx ; eax= monthdata[m-1] + day + years + years/4 - years/100 + years/400
; lea ecx, [edx+edx*2] ; multiplying edx by 7 ->
; lea edx, [ecx+edx*4] ; ->result in edx
add esp, 3*4 ; clearing the stack from 3 dword parameters
sub eax, edx ; eax= (monthdata[m-1] + day + years + years/4 - years/100 + years/400) % 7
ret ; faster return then return 3*4


Cute idea!!! I'll use it...Thanks again!


Regards,
Lingo
Posted on 2003-05-10 15:50:02 by lingo12
There is another formula for calculating day of a week. It follows:
w = k + [2,6m - 0,2] - 2C + Y + [ C / 4 ] + [Y / 4] (mod 7)

k:day
m:month
Y=yyyy
C=yyyy

[x] means floor[x] i.e. biggest integer not bigger than x
Posted on 2003-09-30 09:24:33 by inFinie
Hi,

lingo12 (May 10th, 2003 09:50 PM) > Just use years between 1 to 9999
bitRAKE (March 22nd, 2002 03:25 AM) > year = [0, 9999+]

In 1582 Pope Gregory XIII introduced the Gregorian calendar to replace Julius Caesar's Julian calendar.
i.e.
...
October 16 1582
October 15 1582
October 4 1582
October 3 1582
...
Posted on 2003-10-03 00:34:11 by P2M
FYI, the algorithm attributed to Tomohiko Sakamoto is originally due to Gauss.
Posted on 2003-10-05 13:54:28 by Poimander

FYI, the algorithm attributed to Tomohiko Sakamoto is originally due to Gauss.

Poimander, could I ask you an offtopic, personal question:
I admire your comands of The English Language.
It's a rare thing nowdays, espacially for the American.
Could you tell us, please, what's your educational background?
You sound as a hell of a scolar :)
Posted on 2003-10-25 06:18:42 by The Svin
Hi Svin, I'm definitely not a scholar. I'm just a student of asm.
Posted on 2003-10-28 20:23:04 by Poimander
I've been just curious about where you were graduated :)
Posted on 2003-10-30 08:31:55 by The Svin
Bla-bla-bla



; Tested for D:M:Y=[1:3:1900-28:2:2100]

dataseg
align 4
tbl dd 6,2,1,4,6,2,4,0,3,5,1,3

codeseg
proc DayOfWeek day,month,year
mov edx,[month]
mov eax,[year]
mov ecx,[day]
cmp edx,3
sbb eax,0
add ecx,[tbl+4*(edx-1)]
mov edx,24924925h
imul eax,1461
shr eax,2
add eax,ecx
mov ecx,eax
imul edx
lea eax,[edx+ecx]
shl edx,3
sub eax,edx
ret
endp
Posted on 2003-11-02 10:10:24 by Nexo
If you're only interested in 20th and 21th century dates you can use the following
simplified routines:


;-----------------------------------------------------------------------------
;
;  DayOfWeek: Simplified code routine for dates in the range: 1899<year<2100
;
;  Modify the Gaussian calendrical algorithm using the fact that
;  floor(1/400*year)-floor(1/100*year) = -15 = -1 mod (7) for the 20th
;  and 21th centuries.
;
;-----------------------------------------------------------------------------

Align 16
DayOfWeek PROC year:DWORD, month:DWORD,day:DWORD
.DATA
cData DB 0,3,2,5,0,3,5,1,4,6,2,4
.CODE
    mov eax, month  ; eax=month
    mov ecx, year     ; ecx=year
    cmp eax, 3
    sbb ecx, 0         ; year -= (month<3)
   
    movzx eax, BYTE PTR cData
   
    add eax, ecx          ; eax = year + cData
    add eax, day          ; eax = year + day + cData
    shr ecx, 2              ; ecx = year/4
    add eax, ecx          ; eax = year/4 + year + day + cData
    dec eax                 ; eax = -1 + year/4 + year + day + cData
   
    mov ecx, 7
    xor edx, edx
    div ecx
    mov eax, edx            ; eax = (-1 + year/4 + year + day + cData)%7

    ret
DayOfWeek ENDP

;-----------------------------------------------------------------------------

Align 16
DayOfWeek2 PROC USES ebx year:DWORD, month:DWORD,day:DWORD
    mov eax, month     ; eax=month
    mov ebx, year        ; ecx=year
    cmp eax, 3
    sbb ebx, 0             ; year -= (month<3)

    mov eax, 00000000011011010110110010101100b ;06D6CACh
    mov edx, 00000000110011110011000011000000b ;0CF30C0h
    mov ecx, month
    dec ecx
    shl ecx, 1
    shr eax, cl
    shr edx, cl
    and eax, 3
    and edx, 3
    add eax, edx      ; eax = (0CF30C0h>>(2*(month-1)))&3+(06D6CACh>>(2*(month-1)))&3
                            ;     = cData
                           
    add eax, ebx      ; eax = year + cData
    add eax, day      ; eax = year + day + cData
    shr ebx, 2          ; ebx = year/4
    add eax, ebx      ; eax = year/4 + year + day + cData
    dec eax             ; eax = -1 + year/4 + year + day + cData

    mov ebx, 7
    xor edx, edx
    div ebx
    mov eax, edx      ; eax = (-1 + year/4 + year + day + cData)%7

    ret
DayOfWeek2 ENDP

;-----------------------------------------------------------------------------
Posted on 2005-09-18 14:58:20 by Poimander
There is the following routine as well:

;-----------------------------------------------------------------------------
;
; DayOfWeek: Simplified code routine for dates in the range: 1899<year<2100
;
;            Modify the Gaussian calendrical algorithm using the fact that
;            floor(1/400*year)-floor(1/100*year) = -15 = -1 mod (7) for the 20th
;            and 21th centuries.
;
;-----------------------------------------------------------------------------

Align 16     
DayOfWeek3 PROC year:DWORD, month:DWORD, day:DWORD
.DATA
cData DB 0,3,2,5,0,3,5,1,4,6,2,4
.CODE
    mov eax, month        ; eax=month
    mov ecx, year           ; ecx=year
    cmp eax, 3
    sbb ecx, 0                ; year -= (month<3)
   
    movzx eax, BYTE PTR cData
   
    lea ecx, ; ecx = 5*year-4
    shr ecx, 2                  ; ecx = floor(5*year/4) - 1
    add eax, ecx              ; eax = floor(5*year/4) - 1 + cData
    add eax, day              ; eax = day + floor(5*year/4) - 1 + cData

    mov ecx, 7
    xor edx, edx
    div ecx
    mov eax, edx             ; eax = (day + floor(5*year/4) - 1 + cData)%7

    ret
DayOfWeek3 ENDP

Posted on 2005-09-18 18:09:47 by Poimander
In the interest of thoroughness there is one additional routine I should mention:

;-----------------------------------------------------------------------------
;
; DayOfWeek: Simplified code routine for dates in the range: 1899<year<2100
;
; Modify the Gaussian calendrical algorithm using the fact that
; floor(1/400*year)-floor(1/100*year) = -15 = -1 mod (7) for the 20th
; and 21th centuries.
;-----------------------------------------------------------------------------

Align 16     
DayOfWeek4 PROC year:DWORD, month:DWORD, day:DWORD
;.DATA
;cData DB 0,3,2,5,0,3,5,1,4,6,2,4
.CODE
    mov eax, month         ; eax = month
    mov ecx, year          ; ecx = year
    cmp eax, 3
    sbb ecx, 0             ; year -= (month<3)

    movzx eax, BYTE PTR cData

    lea ecx,    ; ecx = 5*year
    and ecx, not 3         ; ecx = 5*year - (5*year mod(4)) = 4*floor(5*year/4)
    lea eax, ; eax = -1 + 8*floor(5*year/4) + cData 
    add eax, day           ; eax = -1 + 8*floor(5*year/4) + cData + day
   
    mov ecx, 7
    xor edx, edx
    div ecx
    mov eax, edx           ; eax = (-1 + floor(5*year/4) + cData + day)%7

    ret
DayOfWeek4 ENDP
Posted on 2005-09-19 21:37:36 by Poimander
I'm using Opera br. and can not scroll vertically all that is typed between "code" statements in this version on forum - no verticall scroll bar in the "code" window.
Posted on 2005-09-20 17:11:35 by The Svin
The Svin,

FYI, I am using Opera 8.02 and I do get a vertical scroll bar in the "code" area. By the way, the latest version of Opera is now free, no ads.

    Opera


Posted on 2005-09-20 18:37:27 by Greg