Hi all !
Here is my RC6 implementation in MASM.
Any optimisation tips & ideas - welcome! ;)



include Kernel32.inc
includelib Kernel32.lib

; RC6 parameters for 32-bit words.
; NROUNDS is the round count; the round-key table allocated by RC6_BuildKey
; holds 2*NROUNDS+4 dwords.
; P32 and Q32 are the key-schedule constants: RC6_BuildKey seeds the table
; with P32 and adds Q32 for every following entry.
; NOTE(review): the values match the standard RC5/RC6 magic constants
; (derived from e and the golden ratio) - confirm against the RC6 spec.
NROUNDS equ 20
P32 equ 0B7E15163h
Q32 equ 09E3779B9h

; RC6_BLOCK - one 16-byte cipher block, laid out as four consecutive
; 32-bit words. RC6_Encrypt / RC6_Decrypt transform it in place.
RC6_BLOCK struct
    _A  DWORD ?                 ; word at offset 0
    _B  DWORD ?                 ; word at offset 4
    _C  DWORD ?                 ; word at offset 8
    _D  DWORD ?                 ; word at offset 12
RC6_BLOCK ends

; All_Stage1 - shared first half of an encryption/decryption round:
;     Result = (Value * (2*Value + 1)) rol 5        (arithmetic mod 2^32)
; computed as 2*Value^2 + Value.
; In:    Value  = 32-bit source operand (read twice)
; Out:   Result = 32-bit destination operand
; Clobb: eax, edx (high half of the widening mul), flags
All_Stage1 macro Value, Result
    mov     eax, Value
    mul     eax                 ; edx:eax = Value^2, only eax is used
    add     eax, eax            ; eax = 2*Value^2
    add     eax, Value          ; eax = Value*(2*Value + 1)
    rol     eax, 5
    mov     Result, eax
endm

; Enc_Stage2 - second half of an encryption round for one block word:
;     Operand = ((Operand xor XorOp) rol RolOp) + S[2*i + Off/4]
; In:    Operand = block word, updated in place
;        XorOp   = value xor-ed in, RolOp = rotate count (low 5 bits used)
;        Off     = byte offset inside the round's key pair:
;                  0 selects S[2*i], 4 selects S[2*i+1]
;        esi     = base of the round-key table S
;        ebx     = round number i
; Clobb: eax, ecx, flags
; NOTE(review): the round-key operand was missing from the posted source
; ("add eax," with nothing after it). It is reconstructed here from the RC6
; round function and the 0/4 Off arguments used at the call sites.
Enc_Stage2 macro Operand, XorOp, RolOp, Off
    mov     eax, Operand
    mov     ecx, RolOp
    xor     eax, XorOp
    rol     eax, cl
    add     eax, [esi+ebx*8+Off]    ; + S[2*i] or S[2*i+1]
    mov     Operand, eax
endm

; Dec_Stage2 - second half of a decryption round for one block word,
; the inverse of the matching encryption step:
;     Operand = ((Operand - S[2*i + Off/4]) ror RorOp) xor XorOp
; In:    Operand = block word, updated in place
;        XorOp   = value xor-ed in, RorOp = rotate count (low 5 bits used)
;        Off     = byte offset inside the round's key pair:
;                  0 selects S[2*i], 4 selects S[2*i+1]
;        esi     = base of the round-key table S
;        ebx     = round number i
; Clobb: eax, ecx, flags
; NOTE(review): the round-key operand was missing from the posted source
; ("sub eax," with nothing after it). It is reconstructed here from the RC6
; round function and the 0/4 Off arguments used at the call sites.
Dec_Stage2 macro Operand, XorOp, RorOp, Off
    mov     eax, Operand
    sub     eax, [esi+ebx*8+Off]    ; - S[2*i] or S[2*i+1]
    mov     ecx, RorOp
    ror     eax, cl
    xor     eax, XorOp
    mov     Operand, eax
endm

; SwapVectors - parallel four-word assignment:
;     (dst1, dst2, dst3, dst4) = (src1, src2, src3, src4)
; All four sources are loaded into registers before any destination is
; written, so the source and destination operands may overlap (the callers
; use this to rotate the four block words).
; Clobb: eax, ecx, edx. ebx is saved and restored around its use.
SwapVectors macro dst1, dst2, dst3, dst4, src1, src2, src3, src4
    push    ebx
    mov     eax, src1           ; read phase
    mov     ebx, src2
    mov     ecx, src3
    mov     edx, src4
    mov     dst1, eax           ; write phase
    mov     dst2, ebx
    mov     dst3, ecx
    mov     dst4, edx
    pop     ebx
endm

.code
;-----------------------------------------------------------------------
; RC6_BuildKey - expand a user key into the RC6 round-key table.
;
; In:    pUserKey   -> user key bytes
;        UserKeyLn  =  key length in bytes (0 .. RC6_KEYBUF_DWORDS*4)
;        ppRoundKey -> DWORD that receives the address of a newly
;                      allocated table of 2*NROUNDS+4 round-key dwords
;                      (release it with RC6_BurnKey)
; Out:   *ppRoundKey = table address, or 0 when the key is too long for
;                      the scratch buffer or an allocation fails
; Clobb: none - all general registers are preserved by pushad/popad
;
; Changes against the posted version:
;   * memory operands that were missing from the posting are restored
;     (NOTE(review): reconstructed from the RC6 key schedule - confirm
;     against the author's original file)
;   * a zero-length key no longer ends in a division by zero: the key is
;     treated as one zero word
;   * GlobalAlloc failures and over-long keys are reported through a NULL
;     *ppRoundKey instead of writing through a bad pointer
;   * the whole scratch copy of the key is wiped before it is freed,
;     not just its first 64 dwords
;-----------------------------------------------------------------------
RC6_KEYBUF_DWORDS   equ 256             ; size of the scratch key copy L[]
RC6_SKEY_DWORDS     equ 2*NROUNDS+4     ; size of the round-key table S[]

RC6_BuildKey proc pUserKey:PDWORD, UserKeyLn:DWORD, ppRoundKey:PDWORD
    local v:DWORD
    local A:DWORD
    local B:DWORD
    local i:DWORD
    local j:DWORD
    local DWUserKeyLn:DWORD
    local pDWUserKey:DWORD
    pushad

    mov     edi, ppRoundKey
    mov     DWORD ptr [edi], 0          ; caller sees NULL unless we finish
    cmp     UserKeyLn, RC6_KEYBUF_DWORDS*4
    ja      BuildKey_exit               ; key would overflow the scratch copy

    invoke  GlobalAlloc, GMEM_FIXED or GMEM_ZEROINIT, RC6_SKEY_DWORDS*4
    test    eax, eax
    jz      BuildKey_exit
    mov     edi, ppRoundKey
    mov     [edi], eax                  ; *ppRoundKey = S

    invoke  GlobalAlloc, GMEM_FIXED or GMEM_ZEROINIT, RC6_KEYBUF_DWORDS*4
    .if eax == 0
        ; No scratch buffer: give the (still all-zero) table back.
        mov     edi, ppRoundKey
        invoke  GlobalFree, DWORD ptr [edi]
        mov     edi, ppRoundKey
        mov     DWORD ptr [edi], 0
        jmp     BuildKey_exit
    .endif
    mov     pDWUserKey, eax

    ; L[] = user key bytes, zero padded (the buffer is zero-initialised).
    mov     esi, pUserKey
    mov     edi, pDWUserKey
    mov     ecx, UserKeyLn
    rep movsb

    ; c = number of key words = ceil(UserKeyLn / 4), at least 1.
    mov     eax, UserKeyLn
    add     eax, 3
    shr     eax, 2
    .if eax == 0
        inc     eax                     ; empty key: one zero word
    .endif
    mov     DWUserKeyLn, eax

    mov     edi, ppRoundKey
    mov     edi, [edi]                  ; edi -> S[]
    mov     esi, pDWUserKey             ; esi -> L[]

    ; S[0] = P32, S[k] = S[k-1] + Q32
    mov     DWORD ptr [edi], P32
    mov     ecx, 1
    .repeat
        mov     eax, [edi+ecx*4-4]
        add     eax, Q32
        mov     [edi+ecx*4], eax
        inc     ecx
    .until ecx > 2*NROUNDS+3

    ; v = 3 * max(2*NROUNDS+4, c); ecx == 2*NROUNDS+4 after the loop above.
    cmp     ecx, DWUserKeyLn
    cmovb   ecx, DWUserKeyLn            ; unsigned: both values are counts
    lea     ecx, [ecx+ecx*2]
    mov     v, ecx

    xor     eax, eax
    mov     A, eax
    mov     B, eax
    mov     i, eax
    mov     j, eax
    mov     ebx, 1                      ; ebx = mixing pass number, 1..v
    .repeat
        ; A = S[i] = (S[i] + A + B) rol 3
        mov     edx, i
        mov     eax, [edi+edx*4]
        add     eax, A
        add     eax, B
        rol     eax, 3
        mov     [edi+edx*4], eax
        mov     A, eax

        ; B = L[j] = (L[j] + A + B) rol (A + B)
        mov     edx, j
        mov     ecx, A
        add     ecx, B
        mov     eax, [esi+edx*4]
        add     eax, ecx
        rol     eax, cl                 ; only the low 5 bits of A+B count
        mov     [esi+edx*4], eax
        mov     B, eax

        ; i = (i + 1) mod (2*NROUNDS+4); i is always below the modulus,
        ; so a compare-and-reset replaces the division.
        mov     eax, i
        inc     eax
        .if eax >= RC6_SKEY_DWORDS
            xor     eax, eax
        .endif
        mov     i, eax

        ; j = (j + 1) mod c
        mov     eax, j
        inc     eax
        .if eax >= DWUserKeyLn
            xor     eax, eax
        .endif
        mov     j, eax

        inc     ebx
    .until ebx > v

    ; Wipe the complete scratch copy of the key before releasing it.
    mov     edi, pDWUserKey
    mov     ecx, RC6_KEYBUF_DWORDS
    xor     eax, eax
    rep stosd
    invoke  GlobalFree, pDWUserKey

BuildKey_exit:
    popad
    ret
RC6_BuildKey endp

;-----------------------------------------------------------------------
; RC6_BurnKey - overwrite a round-key table with zeros and free it.
;
; In:    pRoundKey -> table of 2*NROUNDS+4 dwords obtained from
;                     RC6_BuildKey; a NULL pointer is ignored
; Out:   eax = result of GlobalFree (unspecified for a NULL pointer)
; Clobb: eax, ecx, edx, flags
;
; EDI is now saved with "uses edi": the posted version overwrote it, and
; Win32 callers expect EDI to survive a call. A NULL pointer (which
; RC6_BuildKey can hand back on failure) is no longer written through.
;-----------------------------------------------------------------------
RC6_BurnKey proc uses edi pRoundKey:PDWORD
    mov     edi, pRoundKey
    .if edi != 0
        mov     ecx, (2*NROUNDS+4)
        xor     eax, eax
        rep stosd                       ; zero every round-key dword
        invoke  GlobalFree, pRoundKey
    .endif
    ret
RC6_BurnKey endp

;-----------------------------------------------------------------------
; RC6_Encrypt - encrypt one 16-byte block in place.
;
; In:    pBlock    -> RC6_BLOCK holding the plaintext words A, B, C, D
;        pRoundKey -> round-key table S of 2*NROUNDS+4 dwords
; Out:   *pBlock   =  ciphertext
; Clobb: none - all general registers are preserved by pushad/popad
;
; Register roles: edi -> block, esi -> S, ebx = round number 1..NROUNDS
; (the Enc_Stage2 macro reads esi and ebx to pick the round keys).
;
; NOTE(review): the [edi] / [esi+...] memory operands were missing from
; the posted source and are reconstructed from the RC6 definition:
;     B += S[0]; D += S[1]
;     for i = 1..r:  t = f(B); u = f(D)
;                    A = ((A xor t) rol u) + S[2i]
;                    C = ((C xor u) rol t) + S[2i+1]
;                    (A, B, C, D) = (B, C, D, A)
;     A += S[2r+2]; C += S[2r+3]
;-----------------------------------------------------------------------
RC6_Encrypt proc pBlock:ptr RC6_BLOCK, pRoundKey:PDWORD
    local t:DWORD, u:DWORD
    pushad
    mov     edi, pBlock
    mov     esi, pRoundKey
    assume edi:ptr RC6_BLOCK
    assume esi:ptr DWORD

    ; Pre-whitening.
    mov     eax, [edi]._B
    add     eax, [esi]                      ; B += S[0]
    mov     [edi]._B, eax
    mov     ebx, [edi]._D
    add     ebx, [esi+4]                    ; D += S[1]
    mov     [edi]._D, ebx

    mov     ebx, 1
    .repeat
        All_Stage1 [edi]._B, t
        All_Stage1 [edi]._D, u
        Enc_Stage2 [edi]._A, t, u, 0
        Enc_Stage2 [edi]._C, u, t, 4
        SwapVectors [edi]._A, [edi]._B, [edi]._C, [edi]._D, [edi]._B, [edi]._C, [edi]._D, [edi]._A
        inc     ebx
    .until ebx==NROUNDS+1

    ; Post-whitening.
    mov     eax, [edi]._A
    add     eax, [esi+(2*NROUNDS+2)*4]      ; A += S[2r+2]
    mov     [edi]._A, eax
    mov     ebx, [edi]._C
    add     ebx, [esi+(2*NROUNDS+3)*4]      ; C += S[2r+3]
    mov     [edi]._C, ebx

    assume edi:nothing
    assume esi:nothing
    popad
    ret
RC6_Encrypt endp

;-----------------------------------------------------------------------
; RC6_Decrypt - decrypt one 16-byte block in place (inverse of
; RC6_Encrypt with the same round-key table).
;
; In:    pBlock    -> RC6_BLOCK holding the ciphertext words A, B, C, D
;        pRoundKey -> round-key table S of 2*NROUNDS+4 dwords
; Out:   *pBlock   =  plaintext
; Clobb: none - all general registers are preserved by pushad/popad
;
; Register roles: edi -> block, esi -> S, ebx = round number NROUNDS..1
; (the Dec_Stage2 macro reads esi and ebx to pick the round keys).
;
; NOTE(review): the [edi] / [esi+...] memory operands were missing from
; the posted source and are reconstructed from the RC6 definition:
;     C -= S[2r+3]; A -= S[2r+2]
;     for i = r..1:  (A, B, C, D) = (D, A, B, C)
;                    u = f(D); t = f(B)
;                    C = ((C - S[2i+1]) ror t) xor u
;                    A = ((A - S[2i])   ror u) xor t
;     D -= S[1]; B -= S[0]
;-----------------------------------------------------------------------
RC6_Decrypt proc pBlock:ptr RC6_BLOCK, pRoundKey:PDWORD
    local t:DWORD, u:DWORD
    pushad
    mov     edi, pBlock
    mov     esi, pRoundKey
    assume edi:ptr RC6_BLOCK
    assume esi:ptr DWORD

    ; Undo the post-whitening.
    mov     eax, [edi]._C
    sub     eax, [esi+(2*NROUNDS+3)*4]      ; C -= S[2r+3]
    mov     [edi]._C, eax
    mov     ebx, [edi]._A
    sub     ebx, [esi+(2*NROUNDS+2)*4]      ; A -= S[2r+2]
    mov     [edi]._A, ebx

    mov     ebx, NROUNDS
    .repeat
        SwapVectors [edi]._A, [edi]._B, [edi]._C, [edi]._D, [edi]._D, [edi]._A, [edi]._B, [edi]._C
        All_Stage1 [edi]._D, u
        All_Stage1 [edi]._B, t
        Dec_Stage2 [edi]._C, u, t, 4
        Dec_Stage2 [edi]._A, t, u, 0
        dec     ebx
    .until ebx==0

    ; Undo the pre-whitening.
    mov     eax, [edi]._D
    sub     eax, [esi+4]                    ; D -= S[1]
    mov     [edi]._D, eax
    mov     ebx, [edi]._B
    sub     ebx, [esi]                      ; B -= S[0]
    mov     [edi]._B, ebx

    assume edi:nothing
    assume esi:nothing
    popad
    ret
RC6_Decrypt endp

; Discard the helper macros so their names do not leak into code that
; follows this section.
purge All_Stage1, Enc_Stage2, Dec_Stage2, SwapVectors


Posted on 2005-08-17 05:34:47 by Bohdan
It seems like you're only handling one block at a time? It might be advantageous to supply routines that work on an arbitrary number of bytes, including CBC or some other chaining mode.
Posted on 2005-08-17 06:26:56 by f0dder
bohdan,
i posted my implementation a while ago (here: http://www.asmcommunity.net/board/index.php?topic=21136.0). didn't really try to optimise it, so you could go ahead and run a comparison ;)
Posted on 2005-08-17 15:38:24 by Tola
Hi Tola and f0dder, thanks for the replies.

I found that something like
  lea edi,
is VERY slow on P4, is it normal?

My implementation test results (1 000 000 iterations (Encrypt+Decrypt) on same 16 bytes of data)
1404 mSec on P3-800 i815EP (running on 900MHz 150FSB) and
1515 mSec on P4-2400 i845G !!!!!!!!!  :) :) :)

Is it possible to put some MMX/SSE here?
I'm just a beginner, sorry for annoying.
Posted on 2005-08-23 04:31:30 by Bohdan
You aren't annoying anybody, and posting code like this is a (Y) :)

I haven't had much time for, or interest in, working on encryption lately, and especially not on optimizing encryption.

I would suggest that, when you benchmark, you encrypt a large buffer, greater than your L2 cache size - a 4 MB buffer would probably do fine. That should provide more realistic results.
Posted on 2005-08-23 05:16:39 by f0dder

I found that something like
  lea edi,
is VERY slow on P4, is it normal?

Yes, on a P4. It was mentioned in the optimising manual by Intel.

Of course it is possible to post some MMX/SSE code...
Posted on 2005-08-23 06:16:57 by roticv