How do you remove repeating chars?

e.g. I have a string "aaaaaaaaaaaaaaaaaaaaaaaaabcde", and I want to turn it into "abcde".
Posted on 2005-05-08 00:13:55 by audioman
Write a scanner that does a 1 byte look ahead and if the next byte is the same as the current one, don't write the current one to the output buffer.

The basic concept in code is like this.


; Concept sketch only (not a complete routine). Registers are illustrative:
; esi = pointer to the current source byte, edi = pointer to the next output byte.
mov al, [esi]     ; read current byte
mov cl, [esi+1]   ; read the next byte
cmp cl, al
je label          ; next byte repeats the current one: do not write the current one
mov [edi], al     ; bytes differ: copy the current byte to the output buffer
......
label:
Posted on 2005-05-08 01:20:36 by hutch--
I forgot to mention that I only need to remove spaces.

e.g.
"                                  hello " -> "hello"

"test me" -> "test me" (no change)
Posted on 2005-05-08 02:22:56 by audioman
This should do the trick.
Repeated series of input bytes are recorded as a single output byte.
Your single space will be preserved, and it will work for any character.
ie "hellllllllp    meeeeeeeeee" = "help me"
"help me" = "help me"
but also it's not smart by any means:
"cool beer" = "col ber"
"deep wells are creepy" = "dep wels are crepy"

Note that it's perfectly safe to have the output string overwrite the input string, ie, to hand in the same pointer for both params in your calls.
This is because we can be sure that the output string will be as long as, or shorter than, the source string. It will never be longer, and so we can do no harm.
It's up to you if you want to remove the second param and set edi to esi at the start of it.



; StripDuplicates - copy a zero-terminated string, collapsing every run of
; identical adjacent bytes into a single byte ("aaab  c" -> "ab c").
;
;   pStringIn  : pointer to the zero-terminated source string
;   pStringOut : pointer to the destination buffer; it may be the same pointer
;                as pStringIn, because the write position never overtakes the
;                read position
;
; The output is always zero-terminated. ESI and EDI are preserved ("uses");
; AL is clobbered.
StripDuplicates proc uses esi edi pStringIn, pStringOut
    mov esi, pStringIn
    mov edi, pStringOut             ; edi -> where the next output byte goes
    mov al, byte ptr [esi]          ; al = most recent byte seen, not yet written
    .if al != 0                     ; empty input: nothing to copy, just terminate
        inc esi                     ; esi -> first byte we have not checked yet
        .while byte ptr [esi] != 0  ; stop at the source zero terminator
            .if byte ptr [esi] != al
                ; a different byte: emit the pending one and remember the new one
                mov byte ptr [edi], al
                inc edi
                mov al, byte ptr [esi]
            .endif
            inc esi                 ; duplicate or not, advance the read position
        .endw
        ; the byte held in al has been seen but never written: flush it,
        ; otherwise the last character of the string would be lost
        mov byte ptr [edi], al
        inc edi
    .endif
    mov byte ptr [edi], 0           ; write the zero terminator
    ret
StripDuplicates endp



Posted on 2005-05-08 04:26:40 by Homer
Here's some solid code I just whipped up. Trims multiple spaces (and only spaces), and doesn't produce buffer overflows.


; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; void trimspace(char *dst, char *src, unsigned dstlen)
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; Copies src to dst, collapsing every run of consecutive spaces into a single
; space, and never writes more than dstlen bytes to dst. Safety over speed,
; can probably be optimized plenty.
;
; dstlen includes the trailing NUL char, and the destination will always be
; zero terminated - ie, a 2-byte buffer will hold a string of 1 char.
; A dstlen of 0 writes nothing at all.
;
; Leading and trailing runs are reduced to one space, not removed.
;
; stdcall: the callee pops its 12 bytes of arguments (retn 12).
; Preserves esi and ebx; clobbers eax, ecx (cl), edx and the flags.
;
; Register roles inside the loop:
;   eax = read pointer into src       edx = write pointer into dst
;   esi = data bytes that may still be written (dstlen - 1 at loop entry)
;   cl  = byte being copied           bl  = look-ahead byte while skipping
;
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

trimspace proc stdcall a_dst:dword, a_src:dword, a_len:dword
; No prologue is generated, so the arguments are addressed relative to esp.
; The offsets below are valid only after the single "push esi" that follows.
arg_dst    = 8
arg_src    = 12
arg_len    = 16

    push    esi
    mov     esi, [esp+arg_len]
    test    esi, esi
    jz      short @@out             ; dstlen == 0: no room even for the NUL
    cmp     esi, 1
    jnz     short @@longer_than_1
    mov     eax, [esp+arg_dst]      ; dstlen == 1: room for the NUL only
    mov     byte ptr [eax], 0
    pop     esi
    retn    12

@@longer_than_1:
    mov     eax, [esp+arg_src]
    mov     edx, [esp+arg_dst]
    dec     esi                     ; reserve one byte for the terminator
    push    ebx                     ; bl is used as scratch in @@skipspace

@@mainloop:
    mov     cl, [eax]               ; cl = current source byte
    inc     eax
    cmp     cl, ' '
    jnz     short @@space_done      ; not a space: copy it as-is
    cmp     [eax], cl               ; is the following byte a space too?
    jnz     short @@space_done      ; single space: copy it as-is

@@skipspace:
    ; Peek one byte ahead so that eax stops ON the first non-space byte
    ; instead of stepping past it.
    mov     bl, [eax+1]
    inc     eax
    cmp     bl, ' '
    jz      short @@skipspace

@@space_done:
    mov     [edx], cl               ; emit the byte (one space stands for the run)
    inc     edx
    dec     esi
    test    cl, cl
    jz      short @@zterm_out       ; the source terminator has just been copied
    test    esi, esi
    jnz     short @@mainloop        ; still room for another data byte

@@zterm_out:
    mov     byte ptr [edx], 0       ; always terminate, including on truncation
    pop     ebx

@@out:
    pop     esi
    retn    12
trimspace  endp

OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF

EDIT: tabs to spaces since I use 4space tabs.
Attachments:
Posted on 2005-05-08 07:31:06 by f0dder
I tried this string -> "call ExitProcess                                        " it became -> "call ExitProcess "
It still has a space at the end..

Posted on 2005-05-09 04:19:45 by audioman
Unless you want to write a dedicated algo that does both, if your source string has multiple leading and trailing spaces you need to perform that operation separately from removing duplicate characters.

If you are using MASM32, use the "szTrim" procedure to clean up both ends of a zero-terminated string.
Posted on 2005-05-09 04:58:07 by hutch--
ops :P I delete my one hehe

A input


#@          oler    no    se
            oler    no    se    #@ ;<- leading and trailing spaces
oler#@ ; first output (no spaces before)
no#@ ; second output
se#### ;third output (no spaces after)


You only need to join them... I don't know if there is an error in there...


; DOS .COM program (NASM syntax): code is loaded at offset 0x100.
        org     0x100
section .text

; Entry point: read one line, echo it between "#@" markers, then print its
; words via NoRepeat, followed by the "####" end marker.
start:
        mov     bx, buffer              ; bx = where the typed line is stored
        call    GetLine
        call    PrintNL
        call    Print                   ; echo the line exactly as typed
        call    PrintNL
        call    NoRepeat                ; print the words without the extra spaces

        mov     bx, END_HERE
        call    Print                   ; closing marker

        xor     ah, ah                  ; int 0x21 with ah = 0
        int     0x21                    ; execution is not expected to continue
                                        ; past this point: the data below is
                                        ; never run as code

END_HERE:
        db      "####", 0

; GetLine - read characters with int 0x21 / ah = 1 until Enter (0x0D) is
; pressed and store them, zero-terminated, in the buffer pointed to by BX.
;
; In:    bx = destination buffer, at least GETLINE_MAX + 1 bytes long
; Out:   buffer holds the line without the 0x0D, followed by a 0 byte
;        ax = 0
; Saves: di
;
; At most GETLINE_MAX characters are stored. Anything typed beyond that is
; still read but discarded, so a long line cannot overrun the buffer.
GETLINE_MAX     equ     63              ; "buffer" is resb 64: 63 chars + terminator

GetLine:
        mov     ah, 0x1
        push    di
        xor     di, di                  ; di = number of characters stored
.getNextChar:
        int     0x21                    ; al = character read
        cmp     al, 0xD
        jz      .fin
        cmp     di, GETLINE_MAX
        jae     .getNextChar            ; buffer full: drop the character
        mov     [bx+di], al
        inc     di
        jmp     .getNextChar
.fin:
        xor     ax, ax
        mov     [bx+di], al             ; zero terminator
        pop     di
        ret


; Print - write the zero-terminated string pointed to by BX, one character
; at a time, with int 0x21 / ah = 2.
;
; In:    bx = address of the zero-terminated string
; Saves: si, dx (bx is not modified)
; Note:  ax is not preserved (ah is set to 2)
Print:
        push    si
        push    dx
        xor     si, si                  ; si = index into the string
        mov     ah, 0x2
.printNextChar:
        mov     dl, [bx+si]             ; dl = next character to output
        test    dl, dl
        jz      .end                    ; reached the terminator
        int     0x21
        inc     si
        jmp     .printNextChar
.end:
        pop     dx
        pop     si
        ret

; PrintNL - print the "#@" end-of-line marker followed by CR/LF.
; BX is saved and restored, so the caller's string pointer survives the call.
PrintNL:
        push    bx
        lea     bx, [NL]                ; bx = marker string for Print
        call    Print
        pop     bx
        ret

NL:     db      "#@", 13, 10, 0

; PrintSpace - print a single space character.
; BX is saved and restored, so the caller's string pointer survives the call.
PrintSpace:
        push    bx
        lea     bx, [SPACE]             ; bx = one-space string for Print
        call    Print
        pop     bx
        ret

SPACE:  db      " ", 0

; NoRepeat - print the words of the zero-terminated string at BX, skipping
; every space between them. Nothing is deleted from the buffer: the words are
; only printed. Before every word except the first, PrintNL and PrintSpace
; are called, so each word starts on a new line after a "#@" marker.
;
; In:    bx = address of the zero-terminated string
; Saves: si, dx (like Print); bx is not modified
; Note:  ax is not preserved
;
; Register roles:
;   si = index into the string
;   dl = current character
;   dh = 0 until the first word has been started, 1 afterwards
NoRepeat:
        push    si
        push    dx
        xor     si, si
        mov     ah, 0x2                 ; int 0x21 / ah = 2 prints the char in dl
        xor     dx, dx                  ; dh = 0: no word printed yet
.nextSpace:
        mov     dl, [bx+si]
        test    dl, dl
        jz      .end                    ; end of string
        cmp     dl, " "
        jnz     .print_ln               ; first character of a word
.nextSpace1:
        inc     si                      ; skip this space
        jmp     .nextSpace
.print_ln:
        test    dh, dh
        jz      .chars0                 ; first word: no separator in front of it
        call    PrintNL
        call    PrintSpace
.chars0:
        mov     dh, 1
.chars:
        int     0x21                    ; print the character in dl
        inc     si
        mov     dl, [bx+si]
        cmp     dl, " "
        jz      .nextSpace1             ; word ended: go skip the spaces
        test    dl, dl
        jnz     .chars                  ; still inside the word
.end:
        pop     dx
        pop     si
        ret


section .bss
; 64 bytes of uninitialised storage for the input line; "start" passes its
; address in BX to GetLine, Print and NoRepeat.
buffer resb 64


In fact there is a little trick here :) ... have you found it?
Posted on 2005-05-09 12:05:58 by rea