The algorithm itself doesn't get any faster by using MMX, but the processor can do more while it is waiting on the hard drive. My computer very consistently processes a 640MB file in 22 seconds -- that is 30MB/sec - the speed of my hard drive! Furthermore, it does this with the simple byte scanner or the MMX code.

Create a thread for ScanFile passing it the file name, and let your program do other stuff...

Stuff to do before using code:
- add error checking
- select the memory and threads needed

BIGINT UNION

QWORD ?
STRUCT
dwLow DWORD ?
dwHigh DWORD ?
ENDS
BIGINT ENDS



THREAD_BLOCK STRUCT
ol OVERLAPPED <> ; overlapped structure for reads

mem DWORD ? ; address to memory buffer

ThreadId DWORD ? ; thread id
hThread DWORD ? ; thread handle
THREAD_BLOCK ENDS



MAX_SCAN_MEMORY EQU 1024*256 ; memory buffer to use
SCAN_MEMORY_BLOCK EQU 1024*64 ; bytes in blocks to read/scan
MAX_SCAN_BLOCKS EQU MAX_SCAN_MEMORY / SCAN_MEMORY_BLOCK

SCAN_BYTE EQU 10 ; linefeed
SCAN_BYTE_MMX EQU <0A0A0A0A0A0A0A0Ah> ; linefeed



_DATA SEGMENT
iBigFile BIGINT <0> ; 64 bit file length
BigFile_count BIGINT <0> ; number of lines

hBigFile DWORD ?
_DATA ENDS


OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF

BlockComplete PROC dwErrorCode:DWORD, dwNumberOfBytesTransfered:DWORD, lpOverlapped:DWORD
cmp dwErrorCode, 0
jne _X
cmp dwNumberOfBytesTransfered, SCAN_MEMORY_BLOCK
jne _Y
mov edx, lpOverlapped
mov edx, [edx].THREAD_BLOCK.mem


IF 0 ; reference scanner
or eax, -1
dec edx
mov ecx, SCAN_MEMORY_BLOCK
_0: inc eax
dec ecx
js _x
_1: inc edx
cmp BYTE PTR [edx], SCAN_BYTE
je _0
dec ecx
jns _1
_x:
ELSE ; MMX scanner

movq mm7, mxc(<0A>)
pxor mm6, mm6

mov ecx, SCAN_MEMORY_BLOCK / (8*128)
@@:
pxor mm5, mm5
i = 0
WHILE i LT (8*128)
movq mm0, [edx + i + 0]
movq mm1, [edx + i + 8]
movq mm2, [edx + i + 16]
movq mm3, [edx + i + 24]

pcmpeqb mm0, mm7
pcmpeqb mm1, mm7
pcmpeqb mm2, mm7
pcmpeqb mm3, mm7

paddb mm0, mm1
paddb mm2, mm3

paddb mm0, mm2

psubb mm5, mm0 ; total 128*8 max = 1K
i=i+8*4
ENDM

; unpack MM5 to get sum in 1K block
pxor mm0, mm0
psadbw mm5, mm0
paddd mm6, mm5

dec ecx
lea edx, [edx + 128*8]
jne @B

movd eax, mm6
ENDIF


_9: add BigFile_count.dwLow, eax
adc BigFile_count.dwHigh, 0

mov edx, lpOverlapped
; adjust the OVERLAPPED structure to get next file block
add [edx].THREAD_BLOCK.ol.loffset, MAX_SCAN_MEMORY
adc [edx].THREAD_BLOCK.ol.OffsetHigh, 0
ret

_X: ; (just bail)
ret

_Y: ; short block scan...
or eax, -1
dec edx
mov ecx, dwNumberOfBytesTransfered
_y0: inc eax
dec ecx
js _yx
_y1: inc edx
cmp BYTE PTR [edx], SCAN_BYTE
je _y0
dec ecx
jns _y1
_yx: jmp _9

BlockComplete ENDP


OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

ScanFile_Thread PROC lpData:PTR THREAD_BLOCK

mov ebx, [esp + 4] ; lpData
mov esi, [ebx].THREAD_BLOCK.ol.loffset
mov edi, [ebx].THREAD_BLOCK.ol.OffsetHigh
add esi, SCAN_MEMORY_BLOCK
adc edi, 0
; EBX is pointer to THREAD_BLOCK structure which must begin with
; OVERLAPPED structure. EDI:ESI equals end of block offset
jmp _0

_w: ; wait for I/O completion
invoke SleepEx, INFINITE, TRUE

_0: mov eax, SCAN_MEMORY_BLOCK
xor edx, edx

; check if end of block within file
cmp edi, iBigFile.dwHigh
jc _io
jne @F
cmp esi, iBigFile.dwLow
jle _io
@@:
; if not adjust block size
add eax, iBigFile.dwLow
adc edx, iBigFile.dwHigh

; if block size negitive kill thread
sub eax, esi
sbb edx, edi
jne EOF
dec eax
je EOF

_io: add esi, MAX_SCAN_MEMORY
adc edi, 0

invoke ReadFileEx, hBigFile, [ebx].THREAD_BLOCK.mem, eax, ebx, OFFSET BlockComplete
test eax, eax
jne _w

invoke ExitThread, -1 ; file ERROR
retn 4

EOF: invoke ExitThread, 0
retn 4

ScanFile_Thread ENDP


OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF

ScanFile PROC USES ebx esi edi, lpFileName:DWORD

invoke VirtualAlloc, NULL,
MAX_SCAN_MEMORY + MAX_SCAN_BLOCKS * (SIZEOF THREAD_BLOCK),
MEM_COMMIT or MEM_RESERVE,
PAGE_READWRITE
test eax, eax
je mem_ERROR

lea ebx, [eax + MAX_SCAN_MEMORY]
mov edi, eax

;; Open test file
invoke CreateFile, lpFileName, GENERIC_READ, NULL, NULL, OPEN_EXISTING,
FILE_FLAG_OVERLAPPED or FILE_FLAG_NO_BUFFERING or FILE_FLAG_SEQUENTIAL_SCAN, NULL
mov hBigFile, eax

invoke GetFileSizeEx, eax, OFFSET iBigFile

push ebp
xor esi, esi
xor ebp, ebp
_t0: ; fill THREAD_BLOCK
lea eax, [edi + esi]
mov [ebx].THREAD_BLOCK.ol.loffset, esi
mov [ebx].THREAD_BLOCK.ol.OffsetHigh, 0
mov [ebx].THREAD_BLOCK.ol.hEvent, ebp
mov [ebx].THREAD_BLOCK.mem, eax

invoke CreateThread, NULL, 4096, OFFSET ScanFile_Thread, ebx,
NULL, ADDR [ebx].THREAD_BLOCK.ThreadId
mov [ebx].THREAD_BLOCK.hThread, eax

inc ebp
add esi, SCAN_MEMORY_BLOCK
add ebx, SIZEOF THREAD_BLOCK
cmp ebp, MAX_SCAN_BLOCKS
jne _t0
pop ebp

sub ebx, MAX_SCAN_BLOCKS * (SIZEOF THREAD_BLOCK)

;; Wait for threads to finish:

push esp
mov esi, MAX_SCAN_BLOCKS
_w:
;--------------------------------
; Could do something useful here
;--------------------------------

invoke GetExitCodeThread, [ebx].THREAD_BLOCK.hThread, esp
cmp DWORD PTR [esp], STILL_ACTIVE
je _w

cmp DWORD PTR [esp], 0
je @F

; file ERROR!
int 3
; file ERROR!

@@: invoke CloseHandle, [ebx].THREAD_BLOCK.hThread
add ebx, SIZEOF THREAD_BLOCK
dec esi
jne _w
pop eax


invoke CloseHandle, hBigFile
invoke VirtualFree, edi, 0, MEM_RELEASE

mem_ERROR:
ret

ScanFile ENDP
Posted on 2003-06-07 15:14:49 by bitRAKE
I going to port this into SpAsm Syntax.
And play with it in SpAsm.

multi threading and loading blocks, helps in overcoming natural
speed roadblocks in algos.

Large file usage and byte parsing always an interesting algo topic.

Thanks bitRake
Posted on 2003-06-07 18:10:01 by RobotBob
There is an error in the above code - should use:
invoke WaitForSingleObject, [ebx].THREAD_BLOCK.hThread, INFINITE
...instead of continually checking if the thread is active. This error cause the application to grap all the processor resources - now it is down to a couple percent. :) The reference byte scanner takes 11% of processor time, but the MMX code cuts that down to 3%.
Posted on 2003-06-07 20:31:01 by bitRAKE
bitRAKE
Why in function ScanFile_Thread you use ExitThread, instead of
	xor	eax, eax	; file ERROR

dec eax
retn 4

EOF: xor eax, eax
retn 4
Posted on 2003-06-08 21:06:54 by P2M
P2M, I did not know this was equivalent - thank you!
ExitThread is the preferred method of exiting a thread. When this function is called (either explicitly or by returning from a thread procedure)
Another optimization would be to pre-calculate number of loops needed by each thread, and the partial read thread -- which reduces the thread code to:
	OPTION PROLOGUE:NONE

OPTION EPILOGUE:NONE

ScanFile_Thread PROC lpData:PTR THREAD_BLOCK

mov ebx, [esp + 4] ; lpData

_0: invoke ReadFileEx, hBigFile, [ebx].THREAD_BLOCK.mem,
SCAN_MEMORY_BLOCK, ebx, OFFSET BlockComplete
test eax, eax
je Error

; wait for I/O completion
invoke SleepEx, INFINITE, TRUE

dec [ebx].THREAD_BLOCK.blocks
jne _0

xor eax, eax
retn 4

Error: dec eax
retn 4

ScanFile_Thread ENDP
Posted on 2003-06-09 00:43:28 by bitRAKE
bitRAKE
My computer very consistently processes a 640MB file in 22 seconds -- that is 30MB/sec
What computer configuration (hard, soft)?

I try MMF + memchr WinAPI, C, no thread, no ReadFile:
#define WIN32_LEAN_AND_MEAN

#include <windows.h>
#include <stdio.h>
#include <limits.h>

#define INVALID_FILE_SIZE ((DWORD)0xFFFFFFFF)

int main(int argc, char *argv[])
{
DWORD dwStart = GetTickCount(); // = 0;

HANDLE hFile = INVALID_HANDLE_VALUE;
HANDLE hFMap = NULL;
LPVOID pFile = NULL;
char *pStart = NULL;
char *pFind = NULL;
DWORD loSize = 0;
DWORD hiSize = 0;
DWORD dwCount = 0;

if (1 >= argc)
{
printf("Usage %s filename\n", argv[0]);
goto cleanup;
}

hFile = CreateFile(argv[1], GENERIC_READ, FILE_SHARE_READ, NULL,
OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
if (INVALID_HANDLE_VALUE == hFile)
{
printf("CreateFile(\'%s\') failed %d\n", argv[1], GetLastError());
goto cleanup;
}

loSize = GetFileSize(hFile, &hiSize);
if (INVALID_FILE_SIZE == loSize)
{
printf("GetFileSize failed %d\n", GetLastError());
goto cleanup;
}
if (0 != hiSize)
{
printf("Size great then %d\n", UINT_MAX);
goto cleanup;
}
if (0 == loSize)
{
printf("File is empty\n");
goto cleanup;
}

hFMap = CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
if (NULL == hFMap)
{
printf("CreateFileMapping failed %d\n", GetLastError());
goto cleanup;
}
CloseHandle(hFile); hFile = INVALID_HANDLE_VALUE;

pFile = MapViewOfFile(hFMap, FILE_MAP_READ, 0, 0, 0);
if (NULL == pFile)
{
printf("MapViewOfFile failed %d\n", GetLastError());
goto cleanup;
}
CloseHandle(hFMap); hFMap = NULL;

pStart = pFind = (char *)pFile;
do
{
pFind = memchr(pStart, '\n', loSize);
if (NULL != pFind)
{
dwCount++;
loSize -= (pFind - pStart + 1);
pStart = pFind + 1;
}
} while (0 != pStart && NULL != pFind);
if (pFind != pStart)
dwCount++;
printf("%d ends of the lines are found for %d seconds\n",
dwCount, (GetTickCount() - dwStart) / 1000);

cleanup:
if (NULL != pFile) UnmapViewOfFile(pFile);
if (NULL != hFMap) CloseHandle(hFMap);
if (INVALID_HANDLE_VALUE != hFile) CloseHandle(hFile);

return 0;
}


Configuration:
Celeron Tualatin 1300 (13*100MHz) up to 1510 (13*115Mhz)
RAM 320Mb (115Mhz)
HDD 20 Gb UDMA100
W2k sp3
scan file size 746'583'601 byte

Result:
23'394'001 ends of the lines are found for 25 seconds

PS Instead of memchr can be use scasb for ANSI string and scasw for UNICODE string.
Posted on 2003-06-09 20:44:55 by P2M
P2M, my hard drive sucks compared to yours. :)
XP 2500 Barton (at 2.0 Ghz)
1 GB DDR2700 (166Mhz)
EpoX 8RDA+ motherboard
Windows XP
...the weak link is a WD 205AA DMA66 Hard drive. There is currently a problem with the NForce2 IDE driver - it's only working at 33MB/s instead of the rated 66MB/s.

Results are nice! :) How can you deal with files above 2^32 bytes. :)

How did the code above perform?
Posted on 2003-06-09 21:16:16 by bitRAKE
bitRAKE
How can you deal with files above 2^32 bytes.
This is not simply. I use FAT32. Limit file size on FAT32: 2^32 - 1 bytes. (info from Supported File Systems)
I converted FAT32 to NTFS and has changed program.
/* support file above 2^32 bytes*/

#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <stdio.h>
#include <limits.h>

#define INVALID_FILE_SIZE ((DWORD)0xFFFFFFFF)

int main(int argc, char *argv[])
{
DWORD dwStart = GetTickCount();
HANDLE hFile = INVALID_HANDLE_VALUE;
HANDLE hFMap = NULL;
LPVOID pFile = NULL;
char *pStart = NULL;
char *pFind = NULL;
LONGLONG DefMapSize = 7 * (256*1024*1024);

LARGE_INTEGER FileSize, MapOffset;
ULONG64 EolCount;
SIZE_T MapSize;


if (1 >= argc)
{
printf("Usage %s filename\n", argv[0]);
goto cleanup;
}

hFile = CreateFile(argv[1], GENERIC_READ, FILE_SHARE_READ, NULL,
OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
if (INVALID_HANDLE_VALUE == hFile)
{
printf("CreateFile(\'%s\') failed %d\n", argv[1], GetLastError());
goto cleanup;
}

FileSize.QuadPart = 0;
FileSize.LowPart = GetFileSize(hFile, (DWORD *)&FileSize.HighPart);
if (INVALID_FILE_SIZE == FileSize.LowPart)
{
printf("GetFileSize failed %d\n", GetLastError());
goto cleanup;
}
if (0 == FileSize.QuadPart)
{
printf("File is empty\n");
goto cleanup;
}
printf("FileSize is %I64u\n", FileSize.QuadPart);

hFMap = CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
if (NULL == hFMap)
{
printf("CreateFileMapping failed %d\n", GetLastError());
goto cleanup;
}

MapOffset.QuadPart = 0;
EolCount = 0;
while (0 != FileSize.QuadPart)
{
MapSize = (SIZE_T) min(DefMapSize, FileSize.QuadPart);

pFile = MapViewOfFile(hFMap, FILE_MAP_READ,
MapOffset.HighPart, MapOffset.LowPart, MapSize);
if (NULL == pFile)
{
printf("MapViewOfFile failed %d\n", GetLastError());
goto cleanup;
}

FileSize.QuadPart -= MapSize;
MapOffset.QuadPart += MapSize;

pStart = pFind = (char *)pFile;
do
{
pFind = memchr(pStart, '\n', MapSize);
if (NULL != pFind)
{
EolCount++;
MapSize -= (pFind - pStart + 1);
pStart = pFind + 1;
}
} while (0 != MapSize && NULL != pFind);

UnmapViewOfFile(pFile); pFile = NULL;

}
CloseHandle(hFMap); hFMap = NULL;

if (pFind != pStart)
EolCount++;

printf("%I64u ends of the lines are found for %d seconds\n",
EolCount, (GetTickCount() - dwStart) / 1000);

cleanup:
if (NULL != pFile) UnmapViewOfFile(pFile);
if (NULL != hFMap) CloseHandle(hFMap);
if (INVALID_HANDLE_VALUE != hFile) CloseHandle(hFile);

return 0;
}
Result:
FileSize is 4'479'501'601
140'364'001 ends of the lines are found for 185 seconds
Posted on 2003-06-10 19:30:51 by P2M
Hi bitRAKE, I've just been trying to rewrite/understand this code when I came across a small bug. So small it took me hours to track it down. Basically you used FILE_FLAG_NO_BUFFERING which states
File access must be for numbers of bytes that are integer multiples of the volume's sector size. For example, if the sector size is 512 bytes, an application can request reads and writes of 512, 1024, or 2048 bytes, but not of 335, 981, or 7171 bytes.

I imagine the file you tested with happend to be a whole multiple of the sector size in lenght meaning the final block was still safe. In my test file it wasn't and ReadFileEx threw up an error.

Here's my simple fix, You need two additional dwords, SectorSize and SectorMask. Then after
	invoke VirtualAlloc, NULL,

MAX_SCAN_MEMORY + MAX_SCAN_BLOCKS * (SIZEOF THREAD_BLOCK),
MEM_COMMIT or MEM_RESERVE,
PAGE_READWRITE
test eax, eax
je mem_ERROR

lea ebx, [eax + MAX_SCAN_MEMORY]
mov edi, eax

add
	invoke GetDiskFreeSpace,0,eax,offset SectorSize,eax,eax

dec SectorSize
mov eax, SectorSize
xor eax, 0FFFFFFFFh
mov SectorMask, eax

and replace
_io:	add	esi, MAX_SCAN_MEMORY

adc edi, 0

invoke ReadFileEx, hBigFile, [ebx].THREAD_BLOCK.mem, eax, ebx, OFFSET BlockComplete
test eax, eax
jne _w

with
_io:	add 	eax, SectorSize

add esi, MAX_SCAN_MEMORY
and eax, SectorMask
adc edi, 0

invoke ReadFileEx, hBigFile, [ebx].THREAD_BLOCK.mem, eax, ebx, OFFSET BlockComplete
test eax, eax
jne _w
I think this fixs it.
Posted on 2003-07-25 23:07:15 by Eóin



and eax, SectorMask
adc edi, 0


Note I don't understand the code all over, but those instructions could be incorrect. You have to exchange them.
Posted on 2003-07-27 12:35:12 by MazeGen
I believe you are correct, I was clearly a bit hasty. Thanks :)
Posted on 2003-07-27 18:45:25 by Eóin