I'm looking for the fastest unsigned integer array sorter.
If anyone has an implementation of QuickSort to test against this RadixSort I'd be very interested in the results.

It's not fully optimized yet, but the implementation is solid and still very fast.

For those that don't know what radix sort is here's a high level representation of the algorithm. (in Java)

///////////////////////////////
//enhanced radix sort//////////
///////////////////////////////
    private int[] sourceArray;
    private int[] targetArray;
    private int[] counts;
    private int keyLength=4;
    private int[] radix2LUT = {24,16,8,0};
    private void radixSort2(int[] arr)
    {
        counts = new int[256];
        sourceArray = arr;
        targetArray = new int;
        int soFar,keyByte,to,temp,bitShift;       
        int[] tempA;
        //outer loop 4 times because 4words in a 32bit int
        for ( int col=keyLength-1; col>=0; col-- )
        {
            soFar = 0;
            bitShift = radix2LUT;//either 0,8,16 or 24
            //zeroing out the count[0to255]=0
            for (int x = 0; x<256; x=x+8)
            {//unrolled for loop speedup
                counts = counts = counts = counts =
                counts = counts = counts = counts = 0;
            }
            //counting how many of each key
            //this is what avoids using queues
            for ( int i=0; i<sourceArray.length; i++ )
            {
                counts[((sourceArray >> bitShift) & 0xFF)]++;
            }
            //manipulate the count's into indexes into a shadow array
            for ( int i=0; i<counts.length; i=i+8 )
            {//unrolled for speed
                temp = counts;counts = soFar;soFar += temp;
                temp = counts;counts = soFar;soFar += temp;
                temp = counts;counts = soFar;soFar += temp;
                temp = counts;counts = soFar;soFar += temp;
                temp = counts;counts = soFar;soFar += temp;
                temp = counts;counts = soFar;soFar += temp;
                temp = counts;counts = soFar;soFar += temp;
                temp = counts;counts = soFar;soFar += temp;
            }
            //copy and rearrange the array
            for ( int from=0; from<sourceArray.length; from++ )
            {
                keyByte = ((sourceArray >> bitShift) & 0xFF);
                to = counts++;
                targetArray = sourceArray;
            }
            //flip the array pointers around
            tempA = sourceArray;
            sourceArray = targetArray;
            targetArray = tempA;
        }
        //copy the sorted array into the array passed to the method
        //System.arraycopy(sourceArray,0, arr,0, sourceArray.length);
    }


Here's the current fasm project I'm using,
the DATA section is a little messy but the code is straight forward.

I create a 10,000,000 DWORD long array
Fill it with Random 32bit integers
And have benchmarking loops setup to test the speed of RadixSort vs WHATEVER you guys come up with to help me out.

If your going to test your quick sort implementation make sure you call it first in the benchmarking loops (WHY?) because optimized QuickSort algorithms have early stop checks, so if you let the radix sort sort the array first QuickSort will see that it's sorted already and stop early. Or you could just call the FillArray function before the loop starts for both of them.

Without further babbling the code.

format PE console 4.0
entry start
include '%fasminc%\win32a.inc'

section '.data' data readable writeable
result1 db 'Result 1: %d:',0
result2 db 'Result 2: %d:',0
buffer rb 255

align 16
RandomSeed dd 1318699, 1015727, 1235239, 412943

paramA dd 0
paramB dd 0
align 8
dVal dq 4294967295.0

fmtr db 'test: %d',0
tmp dq 0
align 16
fixer dw 0fc01h,0,0,0,0,0,0,0

fmtf db '%d . %d',0
dbgt dq 0

fmth db '%x %x %x',0
str1 dd 0,0,0,0
fhex db '%x  > %x  ',0

ttest db '%x : %x : %x',10,13,0

align 16
sseseed dq 1111111111111111h
        dq 12134567778abbf0h
        dq 0ababcabababdabah
        dq 7654321546174353h
_fmth db ' %X ',0

RandomArray dd 0
TestArray dd 1000BBBBh, 99AABBCCh, 1000BBB1h, 100h, 250h
section '.code' code readable executable
start:

call MakeSeed
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        push    40000000 ;;40million bytes 10million DWORDS
        call    MakeArray
        mov    , eax
        push    10000000 ;;DWORD size
        push    eax ;;ARRAY MEM
        call    FillArray



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        call    ;returns -1
        push    100h
        push    eax
        call   
        call    ;;returns -2
        push    15
        push    eax
        call   

    push 0
    push 0
    push 0
    push 0
    call
;---------------------------------------
    mov esi,01h
    call
    mov edi,eax
    jmp tst1
align 16
tst1:
        push    10000000
        push    dword
        call    RadixSortUint32
    dec esi
    jnz tst1
    push eax
    push _fmth
    call
    call
    sub eax,edi
    push eax
    push result1
    call

    mov esi,01h
    call
    mov edi,eax
    jmp tst2
align 16
tst2:
        push    10000000
        push    dword
        call    RadixSortUint32
    dec esi
    jnz tst2
    push eax
    push _fmth
    call
    call
    sub eax,edi
    push eax
    push result2
    call
;+++++++++++++++++++++++++++++++++++++++++++++
    push 0
    push buffer
    push buffer
    push 0
    call
    push 0
    call


;;IN size
;;IN array addr
RadixSortUint32:
    mov ecx,;;size DWORDs
    push ebp
    push ebx
    push esi
    push edi
    mov esi,;;array
    mov ebp,ecx ;;copy sizeDWORDs
    shl ecx,2;;*4 sizeBYTEs
    push ecx ;;size
    call MakeArray
    mov edx,ebp ;;copy back dword size
    mov edi,eax ;;shadow array
    sub esp,256 * 4 ;;count array
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;ITERATION 1
;;;;;;;;;;;;;;;;;;;;zero count array
    xor eax,eax
    mov ecx,255
  .lpc1:
    mov dword,eax
    mov dword,eax
    mov dword,eax
    mov dword,eax
    mov dword,eax
    sub ecx,5
    jnz .lpc1
    mov dword,eax
;++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;get counts
    mov ecx,edx ;;dword size -1
    dec ecx
  .lpg1:
    movzx eax,byte ;;lowest byte first
    inc dword
    dec ecx
    jns .lpg1

;++++++++++++++++++++++++++++
;;;;;change counts to indexes into shadow array
    xor ecx,ecx
    xor ebx,ebx
  .lpm1:
    mov eax, dword
    mov dword, ebx
    add ebx, eax
    add ecx,1
    cmp ecx,256
    jne .lpm1
;++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;; first copy
    xor ecx,ecx
  .lpr1:
    movzx eax, byte ;; first byte
    mov ebx, dword
    inc dword ;++
    mov eax, dword
    mov dword, eax
    add ecx,1
    cmp ecx,edx
    jne .lpr1
;+++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;; flip shadow and array ptrs
    push esi
    push edi
    pop esi
    pop edi
;++++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;ITERATION 2
;;;;;;;;;;;;;;;;;;;;zero count array
    xor eax,eax
    mov ecx,255
  .lpc2:
    mov dword,eax
    mov dword,eax
    mov dword,eax
    mov dword,eax
    mov dword,eax
    sub ecx,5
    jnz .lpc2
    mov dword,eax
;++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;get counts
    mov ecx,edx ;;dword size -1
    dec ecx
  .lpg2:
    movzx eax,byte ;;2nd lowest byte
    inc dword
    sub ecx,1
    jns .lpg2
;++++++++++++++++++++++++++++
;;;;;change counts to indexes into shadow array
    xor ecx,ecx
    xor ebx,ebx
  .lpm2:
    mov eax, dword
    mov dword, ebx
    add ebx, eax
    add ecx,1
    cmp ecx,256
    jne .lpm2
;++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;; second copy
    xor ecx,ecx
  .lpr2:
    movzx eax, byte ;; second byte
    mov ebx, dword
    inc dword ;++
    mov eax, dword
    mov dword, eax
    add ecx,1
    cmp ecx,edx
    jne .lpr2
;+++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;; flip shadow and array ptrs
    push esi
    push edi
    pop esi
    pop edi
;++++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;ITERATION 3
;;;;;;;;;;;;;;;;;;;;zero count array
    xor eax,eax
    mov ecx,255
  .lpc3:
    mov dword,eax
    mov dword,eax
    mov dword,eax
    mov dword,eax
    mov dword,eax
    sub ecx,5
    jnz .lpc3
    mov dword,eax
;++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;get counts
    mov ecx,edx ;;dword size -1
    dec ecx
  .lpg3:
    movzx eax,byte ;;third lowest byte
    inc dword
    sub ecx,1
    jns .lpg3
;++++++++++++++++++++++++++++
;;;;;change counts to indexes into shadow array
    xor ecx,ecx
    xor ebx,ebx
  .lpm3:
    mov eax, dword
    mov dword, ebx
    add ebx, eax
    add ecx,1
    cmp ecx,256
    jne .lpm3
;++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;; third copy
    xor ecx,ecx
  .lpr3:
    movzx eax, byte ;; third byte
    mov ebx, dword
    inc dword ;++
    mov eax, dword
    mov dword, eax
    add ecx,1
    cmp ecx,edx
    jne .lpr3
;+++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;; flip shadow and array ptrs
    push esi
    push edi
    pop esi
    pop edi
;++++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;ITERATION 4
;;;;;;;;;;;;;;;;;;;;zero count array
    xor eax,eax
    mov ecx,255
  .lpc4:
    mov dword,eax
    mov dword,eax
    mov dword,eax
    mov dword,eax
    mov dword,eax
    sub ecx,5
    jnz .lpc4
    mov dword,eax
;++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;get counts
    mov ecx,edx ;;dword size -1
    dec ecx
  .lpg4:
    movzx eax,byte ;;last byte
    inc dword
    sub ecx,1
    jns .lpg4
;++++++++++++++++++++++++++++
;;;;;change counts to indexes into shadow array
    xor ecx,ecx
    xor ebx,ebx
  .lpm4:
    mov eax, dword
    mov dword, ebx
    add ebx, eax
    add ecx,1
    cmp ecx,256
    jne .lpm4
;++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;; fourth copy
    xor ecx,ecx
  .lpr4:
    movzx eax, byte ;; fourth byte
    mov ebx, dword
    inc dword ;++
    mov eax, dword
    mov dword, eax
    add ecx,1
    cmp ecx,edx
    jne .lpr4
;+++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;; flip shadow and array ptrs
    push esi
    push edi
    pop esi
    pop edi
;++++++++++++++++++++++++++++++++++++
    push edi
    call DeleteArray
    add esp,256*4
    pop edi
    pop esi
    pop ebx
    pop ebp
    ret 8


FillArray:
    push ebx
    push esi
    mov ebx,dword ;; count
    mov esi,dword ;; addr
  .lp:
    dec ebx
    js .end
    call Random32
    mov dword,eax
    jmp .lp
.end:
    pop esi
    pop ebx
    ret 8

MakeArray:
    mov eax,
    push 4 ;read/write
    push 1000h Or 2000h ;;commit | reserve
    push eax ;;size
    push 0  ;;addr
    push -1 ;;current process
    call
    test eax,eax
    jnz .good
    push 0
    call
.good:
    ret 4

DeleteArray:
    mov eax, ;;array addr
    push 8000h ;;release
    push 0 ;;size
    push eax ;;addr
    push -1 ;;this process
    call
    ret 4

Random32:
    push ebx
    mov eax,
    mov ebx,
    mov ecx,
    mov edx,
    shld ebx,eax,1
    adc eax,0
    ror eax,3
    bswap eax
    shld edx,ecx,1
    adc ecx,0
    bswap ecx
    ror ecx,7
    mov ,eax
    mov ,ebx
    mov ,ecx
    mov ,edx
  xor eax,ecx
    pop ebx
    ret 0

SetSeed:
.seed equ esp+4 ;,+8,+12,+16
    movdqu xmm0,[.seed]
    movntdq dqword,xmm0
    ret 16

MakeSeed:
    rdtsc
    mov edx,eax
    call
    mov ecx,eax
    mul edx
    mov ,eax
    xor edx,ecx
    mov ,edx
    bswap ecx
    xor eax,ecx
    mov ,eax
    not edx
    bswap edx
    mul edx
    mov ,eax
    ret 0



section '.idata' import data readable writeable

  library kernel32,'KERNEL32.DLL',\
          user32,'USER32.DLL',\
          wsock32,'WS2_32.DLL',\
          ntdll,'NTDLL.DLL',\
          msvcrt,'msvcrt.dll'
      include  "%fasminc%\apia\kernel32.inc"
      include  "%fasminc%\apia\user32.inc"
      include  "%fasminc%\apia\wsock32.inc"
      import ntdll,\
        NtAllocateVirtualMemory,'NtAllocateVirtualMemory',\
        NtFreeVirtualMemory,'NtFreeVirtualMemory',\
        NtWriteVirtualMemory,'NtWriteVirtualMemory',\
        NtProtectVirtualMemory,'NtProtectVirtualMemory',\
        NtCreateThread,'NtCreateThread',\
        NtClose,'NtClose',\
        InitializeCS,'RtlInitializeCriticalSection',\
        EnterCS,'RtlEnterCriticalSection',\
        LeaveCS,'RtlLeaveCriticalSection',\
        DeleteCS,'RtlDeleteCriticalSection'
  import msvcrt,\
        printf,'printf'

section '.reloc' fixups data discardable
Posted on 2006-04-09 19:03:15 by r22
r22, to make the test fair, you should refill the array between sorts - and make sure the same random sequence is used. Also, correctness testing is nice to have. I've attached a little test framework I did a while ago, and added your RADIX as well as a vanilla C++ one.

Here's the test results from my AMD64x2 4400+:


D:\src\test\sort>driver
sorting test - sorting 10485760 signed numbers, using seed 0xBEEFF00D

*** testing for correctness
testing routine libc qsort()
testing routine hutch asm qsort()              failed at index 7 (953 > 479)
testing routine C trimedian qsort()
testing routine STL std::sort()
testing routine r22 FASM RADIX sort()
testing routine David Garcia C++ radix()

*** testing for SPEED
libc qsort() took 1828 ticks
hutch asm qsort() took 1047 ticks
C trimedian qsort() took 875 ticks
STL std::sort() took 1032 ticks
r22 FASM RADIX sort() took 406 ticks
David Garcia C++ radix() took 656 ticks

Attachments:
Posted on 2006-04-10 04:39:20 by f0dder
If I used words instead of bytes it could probably be ~40% faster. Although the memory requirement would be a little on the high side.

65535*4 + 4 * n bytes

Any suggestions on the most efficient way to make a SIGNED dword version. I haven't really though about it but I guess subtracting 255 from the most significant byte (while sorting) would do it.
Posted on 2006-04-15 21:14:08 by r22
Tested against my modified nrQsortA (original found in masm32), ....

libc qsort() took 10886 ticks
hutch asm qsort() took 2123 ticks
C trimedian qsort() took 2173 ticks
STL std::sort() took 1943 ticks
David Garcia C++ radix() took 1662 ticks
r22 FASM RADIX sort() took 1443 ticks
Ultrano nrQsortA() took 2253 ticks

AthlonXP 2000+, DDR400
Attachments:
Posted on 2006-05-15 05:19:09 by Ultrano
Ultrano, does the AthlonXP support ram running at DDR400, or are you using DDR400 at some lower rate?
Posted on 2006-05-15 05:37:09 by f0dder
During my previous test, the memory was at 200MHz, "normal" timing, but FSB133


Now, tested with 200MHz, "ultra" timing, FSB200:

driver.exe, your build (with vct2k3?)

libc qsort() took 3195 ticks
hutch asm qsort() took 2123 ticks
C trimedian qsort() took 1763 ticks
STL std::sort() took 1993 ticks
r22 FASM RADIX sort() took 1392 ticks
David Garcia C++ radix() took 1792 ticks


udriver.exe, my build (vc6)

testing routine David Garcia C++ radix()                failed at index 1 (20289
> 12264)
testing routine Ultrano unrQsortA()

*** testing for SPEED
libc qsort() took 10906 ticks ; while in your build runs for 3195ms
hutch asm qsort() took 2093 ticks
C trimedian qsort() took 2153 ticks ;while in your build runs for 1763ms
STL std::sort() took 1943 ticks
r22 FASM RADIX sort() took 1412 ticks
David Garcia C++ radix() took 1642 ticks
Ultrano unrQsortA() took 2223 ticks

Posted on 2006-05-15 06:06:52 by Ultrano
Yep, my build is VC2003.

Seems like (hardly surprising) VC2003 has better stdlib and better code generation - the few routines that are faster in VC6 can probably be attributed to multitasking fluctuations? (although driver.exe *does* boost priority, hmm).

It's pretty weirdthat David Garcia's routine fails - I wonder if it's because of a code generation bug in VC6, or perhaps because your system isn't entirely stable at ultra timings?
Posted on 2006-05-15 06:14:24 by f0dder
Garcia's route fails because of VC6 code generation.
After my last post, I changed my system to a Sempron 2200+, labelled as FSB333 in my bill from a year or two (but actually works at 2x200MHz FSB)
My AthlonXP turned out to support only FSB133, any higher value and my PC wouldn't boot at all (had to take the mobo batteries out).

Here're FSB values for some x86 amd cpus:
(K7) Duron - all : 64kB L2, 100MHz

(K7/Thunderbird) Athlon: 256kB L2
700-1400: 100MHz FSB
1000-1400: 133MHz

(Palomino/Thoroughbred) AthlonXP: 256kB L2
1500+ up to 2600+ :133MHz
2600+ and 2700+ : 166MHz FSB

(Barton) AthlonXP: 512kB L2
2500+ up to 3000+ : 166MHz FSB


I've been living in an ancient world XD
Your dualcore x64 must feel nice :D

r22, instead of subtracting 255, I'd add 128.
Posted on 2006-05-15 19:39:29 by Ultrano

Your dualcore x64 must feel nice :D

Indeed it does, took me a lot of hard work (which almost cost me a relationship) to purchase it, but in the end it's been worth it. I'm a bit annoyed at AMD's decision to integrate the memory controller on the CPU, though - it doesn't buy them a lot, and many AMD64's have a bugged controller that will run unstable at DDR400 if you have all four memory slots loaded - mine does seem stable, though. But I can't run my ram at CAS2, have to run it CAS3.
Posted on 2006-05-16 04:33:44 by f0dder