Hello there :-)
i mess up with the search engine so i ask my beginner question here.
i want to search a huge html file for links an sort them out in a listbox.
what is the best way as beginner to do so ?
i found string search routines in archive here, but i didn't understand :-(
do i have to search the text for the first char, if found get the rest of the word and seek again ?
lets say i search for "href"
so i search for the char "h" and if found i read the next 3 chars after the found postion and then make lstrcmp if "href" found ?
or do i have to search only 4 char text and compare them with "href" ?
sorry if i waste your time reading this bad English, but i don't know where and who to ask, then here.
very nice if you can point me to the right way, thank you.
i mess up with the search engine so i ask my beginner question here.
i want to search a huge html file for links an sort them out in a listbox.
what is the best way as beginner to do so ?
i found string search routines in archive here, but i didn't understand :-(
do i have to search the text for the first char, if found get the rest of the word and seek again ?
lets say i search for "href"
so i search for the char "h" and if found i read the next 3 chars after the found postion and then make lstrcmp if "href" found ?
or do i have to search only 4 char text and compare them with "href" ?
sorry if i waste your time reading this bad English, but i don't know where and who to ask, then here.
very nice if you can point me to the right way, thank you.
heres my folish code, but worked :-)
invoke GlobalAlloc, GPTR, seeksize
mov hmem, eax
invoke InternetOpen, ADDR useragent, INTERNET_OPEN_TYPE_DIRECT, NULL, NULL, 0
mov hopen, eax
invoke InternetOpenUrl, hopen, ADDR url_buff, NULL, 0, INTERNET_FLAG_RELOAD, 0
mov hfile, eax
invoke InternetReadFile, hfile, hmem, seeksize, offset hresult
invoke InternetCloseHandle, hfile
invoke InternetCloseHandle, hopen
;-- search for links
mov esi, hmem ; html buffer
invoke lstrlen,esi
mov f_size,eax ; size of html buffer
@seeknext:
lea edi, szBuffer1
xor ecx, ecx
@1:
cmp f_size,0
je end_seek
mov al, byte ptr
mov byte ptr , al
inc esi
inc edi
dec f_size
cmp al, 'h'
jne @1
;-- get next chars if "h" is found
lea edi, szBuffer2
xor ecx, ecx
mov byte ptr , 'h'
inc edi
mov al, byte ptr
mov byte ptr , al
inc esi
inc edi
dec f_size
mov al, byte ptr
mov byte ptr , al
inc esi
inc edi
dec f_size
mov al, byte ptr
mov byte ptr , al
inc esi
inc edi
dec f_size
mov al, byte ptr
mov byte ptr , al
inc esi
inc edi
dec f_size
;-- compare with "href="
invoke lstrcmp, addr szHttp,addr szBuffer2
.if !eax ; if found get link
inc esi ; remove " or '
inc edi
dec f_size
lea edi, szBuffer3
xor ecx, ecx
@3:
mov al, byte ptr
mov byte ptr , al
inc esi
inc edi
dec f_size
.if al=="'"
mov byte ptr , 0
invoke SendDlgItemMessageA,mainhwnd,2000,LB_ADDSTRING,0,addr szBuffer3 ; insert link in listbox
.elseif al=='"'
mov byte ptr , 0
invoke SendDlgItemMessageA,mainhwnd,2000,LB_ADDSTRING,0,addr szBuffer3 ; insert link in listbox
.else
jmp @3
.endif
;invoke MessageBox,0,addr szBuffer3,addr szBuffer3,0
.endif
jmp @seeknext
end_seek:
invoke GlobalFree, hmem
ret
but why is the website not full loaded when i use:
invoke InternetReadFile, hfile, hmem, seeksize, offset hresult
and buffer: seeksize dd 50000 ;<- amount of chars to read from page
seams the page is only half readet.
i size up the buffer and write the data to txt file, the page is not full loadet.
how can i load the page full ?
do i forgot something ?
invoke GlobalAlloc, GPTR, seeksize
mov hmem, eax
invoke InternetOpen, ADDR useragent, INTERNET_OPEN_TYPE_DIRECT, NULL, NULL, 0
mov hopen, eax
invoke InternetOpenUrl, hopen, ADDR url_buff, NULL, 0, INTERNET_FLAG_RELOAD, 0
mov hfile, eax
invoke InternetReadFile, hfile, hmem, seeksize, offset hresult
invoke InternetCloseHandle, hfile
invoke InternetCloseHandle, hopen
;-- search for links
mov esi, hmem ; html buffer
invoke lstrlen,esi
mov f_size,eax ; size of html buffer
@seeknext:
lea edi, szBuffer1
xor ecx, ecx
@1:
cmp f_size,0
je end_seek
mov al, byte ptr
mov byte ptr , al
inc esi
inc edi
dec f_size
cmp al, 'h'
jne @1
;-- get next chars if "h" is found
lea edi, szBuffer2
xor ecx, ecx
mov byte ptr , 'h'
inc edi
mov al, byte ptr
mov byte ptr , al
inc esi
inc edi
dec f_size
mov al, byte ptr
mov byte ptr , al
inc esi
inc edi
dec f_size
mov al, byte ptr
mov byte ptr , al
inc esi
inc edi
dec f_size
mov al, byte ptr
mov byte ptr , al
inc esi
inc edi
dec f_size
;-- compare with "href="
invoke lstrcmp, addr szHttp,addr szBuffer2
.if !eax ; if found get link
inc esi ; remove " or '
inc edi
dec f_size
lea edi, szBuffer3
xor ecx, ecx
@3:
mov al, byte ptr
mov byte ptr , al
inc esi
inc edi
dec f_size
.if al=="'"
mov byte ptr , 0
invoke SendDlgItemMessageA,mainhwnd,2000,LB_ADDSTRING,0,addr szBuffer3 ; insert link in listbox
.elseif al=='"'
mov byte ptr , 0
invoke SendDlgItemMessageA,mainhwnd,2000,LB_ADDSTRING,0,addr szBuffer3 ; insert link in listbox
.else
jmp @3
.endif
;invoke MessageBox,0,addr szBuffer3,addr szBuffer3,0
.endif
jmp @seeknext
end_seek:
invoke GlobalFree, hmem
ret
but why is the website not full loaded when i use:
invoke InternetReadFile, hfile, hmem, seeksize, offset hresult
and buffer: seeksize dd 50000 ;<- amount of chars to read from page
seams the page is only half readet.
i size up the buffer and write the data to txt file, the page is not full loadet.
how can i load the page full ?
do i forgot something ?
xanthos,
lstrcmp is a case sensitive cmpare API, you need to use a none case sensitive
one like lstrcmpi. This way if the html tags are caped or not it will still work
right, I wrote this real quick and have not tested it but it should work
as this is a simple operation. It is in no way optimized :shock:
This function will search (filesize) bytes for a string(pTo_Find)
searching in hMem$ for the string if it finds it it will return a
pointer into hMem$ where the string was found.
otherwise it returns -1 for not found in hMem$ anywhere. :lol:
Zcoder....
lstrcmp is a case sensitive cmpare API, you need to use a none case sensitive
one like lstrcmpi. This way if the html tags are caped or not it will still work
right, I wrote this real quick and have not tested it but it should work
as this is a simple operation. It is in no way optimized :shock:
This function will search (filesize) bytes for a string(pTo_Find)
searching in hMem$ for the string if it finds it it will return a
pointer into hMem$ where the string was found.
otherwise it returns -1 for not found in hMem$ anywhere. :lol:
;*************************************************************************
; This function will find the string in hMem$ and will search all bytes
; (filesize) for the tring pTo_Find. if it finds it will return a pointer
; to the start of that string. else eax == -1
;*************************************************************************
Find_String proc hMem$:DWORD,filesize:DWORD,pTo_Find:DWORD
LOCAL i_pointer :DWORD
mov esi,hMem$ ;<--- points to the html file data.
mov ,eax
.while filesize > 0
mov ecx,4
mov esi,
mov edi,offset buffer
.while ecx > 0
mov al,
mov ,al
inc edi
inc esi
mov byte ptr,0
dec ecx
.endw
invoke lstrcmpi,addr buffer,pTo_Find
.if eax == 0
jmp Found_It
.endif
inc
dec filesize
.endw
mov eax,-1 ; did not find it.
ret
Found_It:
mov eax, ; return with pointer to the start of the found string
ret
Find_String endp
Zcoder....
xanthos ,
you must loop doing InternetReadFile until it returns TRUE
and lpdwNumberOfBytesRead param == 0
that way you will have read in the whole file.
Zcoder....
you must loop doing InternetReadFile until it returns TRUE
and lpdwNumberOfBytesRead param == 0
that way you will have read in the whole file.
Zcoder....
The problem with HTML is that the standard allows for a LOT of slack in how tags are generated. Case sensivity is just one thing, but you also have to consider/support whitespace, double-quotes or the lack of them, et cetera. If you need to do a fair amount of HTML processing, you should look into regular expressions :)
thank you very much for the help :-)
you are right, this little code now makes it.
InternetReadFile...
mov eax,CurrentDataSize
add eax,Bufferlen
mov CurrentDataSize, eax
.if Bufferlen != 0 <--- check size
invoke WriteFile, fHand, addr Buffer, Bufferlen, addr bwrite, NULL
jmp download
.endif
yes i dont know this before and wonder why only some links are shown up and when i look in html code, is see some start/ends with " or ' so i search for booth.
thanks a lot for the snippet :-)
wish you a nice weekend, posting results the next days in hope of a finish link reader.
you must loop doing InternetReadFile until it returns TRUE
and lpdwNumberOfBytesRead param == 0
and lpdwNumberOfBytesRead param == 0
you are right, this little code now makes it.
InternetReadFile...
mov eax,CurrentDataSize
add eax,Bufferlen
mov CurrentDataSize, eax
.if Bufferlen != 0 <--- check size
invoke WriteFile, fHand, addr Buffer, Bufferlen, addr bwrite, NULL
jmp download
.endif
The problem with HTML is that the standard allows for a LOT of slack in how tags are generated.
yes i dont know this before and wonder why only some links are shown up and when i look in html code, is see some start/ends with " or ' so i search for booth.
This function will search (filesize) bytes for a string(pTo_Find)
searching in hMem$ for the string if it finds it it will return a
pointer into hMem$ where the string was found.
otherwise it returns -1 for not found in hMem$ anywhere.
searching in hMem$ for the string if it finds it it will return a
pointer into hMem$ where the string was found.
otherwise it returns -1 for not found in hMem$ anywhere.
thanks a lot for the snippet :-)
wish you a nice weekend, posting results the next days in hope of a finish link reader.