Here's another little binary workout :)

The UTF-8 text format encodes both "American ASCII" (7 bit characters), AND unicode characters.
That means its character encoding is variable in length.
Can you detect a legal series of characters, and return the length?

Notes about UNICODE (ENCODED IN UTF-8) from unix:
Non-UNICODE bytes have bit 7 clear.
The first byte in an encoded UNICODE character has a series of high bits set, followed by one bit cleared.
The number of set bits indicates the number of bytes in this encoded character.
All 'subsequent' bytes have bit 7 set, and bit 6 clear.
The remaining bytes contain the binary-encoded 'U-Code'.

;U-00000000 U-0000007F: 0xxxxxxx     ;single byte, american ascii character
;U-00000080 U-000007FF: 110xxxxx 10xxxxxx
;U-00000800 U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
;U-00010000 U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
;U-00200000 U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
;U-04000000 U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

Given a series of Bytes in EDX:EAX (AL contains the low order byte), write a function to quickly determine whether the byte sequence is a UTF-8 encoded UNICODE character according to the above masks, or whether the byte series is illegal.
It should return the number of bytes in the character, or null.

If you feel like it, try to decode the U-Code for unicode characters as well :)

Posted on 2010-01-18 04:45:22 by Homer
This looks like a job for a finite state machine.

***EDIT*** I'm at work, but this is my last week (moving on to a better job) so I'm just playing desktop paperweight today :D Thank you for keeping me busy Homer

Just the function...

align 8
ValidateUnicodeState:
        REPEAT 128 ;;7 0xxx F xxxx
              dd ValidateUnicodeSuccess
        END REPEAT
        REPEAT 64 ;;B 10xx F xxxx
              dd ValidateUnicodeFail
        END REPEAT
        REPEAT 32 ;;D 110x F xxxx
              dd ValidateUnicodeNext1
        END REPEAT
        REPEAT 16 ;;E 1110 F xxxx
              dd ValidateUnicodeNext2
        END REPEAT
        REPEAT 8 ;;F 1111 7 0xxx
              dd ValidateUnicodeNext3
        END REPEAT
        REPEAT 4 ;;F 1111 B 10xx
              dd ValidateUnicodeNext4
        END REPEAT
        REPEAT 2 ;;F 1111 D 110x
              dd ValidateUnicodeNext5
        END REPEAT
        REPEAT 2 ;;F 1111 F 111x
              dd ValidateUnicodeFail
        END REPEAT
ValidateUnicodeNext1:
        REPEAT 128 ;;0xxx xxxx
              dd ValidateUnicodeFail
        END REPEAT
        REPEAT 64 ;;10xx xxxx
              dd ValidateUnicodeSuccess
        END REPEAT
        REPEAT 64 ;;11xx xxxx
              dd ValidateUnicodeFail
        END REPEAT
ValidateUnicodeNext2:
        REPEAT 128 ;;0xxx xxxx
              dd ValidateUnicodeFail
        END REPEAT
        REPEAT 64 ;;10xx xxxx
              dd ValidateUnicodeNext1
        END REPEAT
        REPEAT 64 ;;11xx xxxx
              dd ValidateUnicodeFail
        END REPEAT
ValidateUnicodeNext3:
        REPEAT 128 ;;0xxx xxxx
              dd ValidateUnicodeFail
        END REPEAT
        REPEAT 64 ;;10xx xxxx
              dd ValidateUnicodeNext2
        END REPEAT
        REPEAT 64 ;;11xx xxxx
              dd ValidateUnicodeFail
        END REPEAT
ValidateUnicodeNext4:
        REPEAT 128 ;;0xxx xxxx
              dd ValidateUnicodeFail
        END REPEAT
        REPEAT 64 ;;10xx xxxx
              dd ValidateUnicodeNext3
        END REPEAT
        REPEAT 64 ;;11xx xxxx
              dd ValidateUnicodeFail
        END REPEAT
ValidateUnicodeNext5:
        REPEAT 128 ;;0xxx xxxx
              dd ValidateUnicodeFail
        END REPEAT
        REPEAT 64 ;;10xx xxxx
              dd ValidateUnicodeNext4
        END REPEAT
        REPEAT 64 ;;11xx xxxx
              dd ValidateUnicodeFail
        END REPEAT

;; IN: EDX:EAX unicode char
;; OUT: Length or NULL (illegal character)
align 16
ValidateUnicode:
        PUSH    ebx
        PUSH    esi
        XOR    ebx, ebx ;; =0
        MOV    esi, ValidateUnicodeState ;; start state
.Again:
        MOVZX  ecx, al ;; load byte
        ADD    ebx, 1 ;; increment length
        SHRD    eax, edx, 8 ;; prepare next byte
        MOV    esi, ;; load next state
        SHR    edx, 8
        CMP    esi, ValidateUnicodeFail ;;next state?
        JB      .Again
        JMP    esi ;; fail or success
ValidateUnicodeFail:
        XOR    ebx, ebx
ValidateUnicodeSuccess:
        MOV    eax, ebx
        POP    esi
        POP    ebx
        RET 0



FASM compilable source

format PE console
include 'win32a.inc'
entry start

section ".data" data readable writeable
szFormat  db 'eax is %X',13,10,0
_pause    db 'pause',0

section ".code" code readable executable
start:
;;
        mov    eax, 0
        call    ValidateUnicode
        push            eax
        push            szFormat
        call            ;; 1
;;
        mov    eax, 80h
        call    ValidateUnicode
        push            eax
        push            szFormat
        call            ;; 0
;;
        mov    eax, 80C0h
        call    ValidateUnicode
        push            eax
        push            szFormat
        call            ;; 2
;;
        mov    eax, 0C0h
        call    ValidateUnicode
        push            eax
        push            szFormat
        call            ;; 0
;;
        mov    eax, 808080FDh
        mov    edx, 8080h
        call    ValidateUnicode
        push            eax
        push            szFormat
        call            ;; 6
;;
        mov    eax, 808080FDh
        mov    edx, 7080h
        call    ValidateUnicode
        push            eax
        push            szFormat
        call            ;; 0
;;
        push            _pause
        call           
        push            0
        call           

align 8
ValidateUnicodeState:
        REPEAT 128 ;;7 0xxx F xxxx
              dd ValidateUnicodeSuccess
        END REPEAT
        REPEAT 64 ;;B 10xx F xxxx
              dd ValidateUnicodeFail
        END REPEAT
        REPEAT 32 ;;D 110x F xxxx
              dd ValidateUnicodeNext1
        END REPEAT
        REPEAT 16 ;;E 1110 F xxxx
              dd ValidateUnicodeNext2
        END REPEAT
        REPEAT 8 ;;F 1111 7 0xxx
              dd ValidateUnicodeNext3
        END REPEAT
        REPEAT 4 ;;F 1111 B 10xx
              dd ValidateUnicodeNext4
        END REPEAT
        REPEAT 2 ;;F 1111 D 110x
              dd ValidateUnicodeNext5
        END REPEAT
        REPEAT 2 ;;F 1111 F 111x
              dd ValidateUnicodeFail
        END REPEAT
ValidateUnicodeNext1:
        REPEAT 128 ;;0xxx xxxx
              dd ValidateUnicodeFail
        END REPEAT
        REPEAT 64 ;;10xx xxxx
              dd ValidateUnicodeSuccess
        END REPEAT
        REPEAT 64 ;;11xx xxxx
              dd ValidateUnicodeFail
        END REPEAT
ValidateUnicodeNext2:
        REPEAT 128 ;;0xxx xxxx
              dd ValidateUnicodeFail
        END REPEAT
        REPEAT 64 ;;10xx xxxx
              dd ValidateUnicodeNext1
        END REPEAT
        REPEAT 64 ;;11xx xxxx
              dd ValidateUnicodeFail
        END REPEAT
ValidateUnicodeNext3:
        REPEAT 128 ;;0xxx xxxx
              dd ValidateUnicodeFail
        END REPEAT
        REPEAT 64 ;;10xx xxxx
              dd ValidateUnicodeNext2
        END REPEAT
        REPEAT 64 ;;11xx xxxx
              dd ValidateUnicodeFail
        END REPEAT
ValidateUnicodeNext4:
        REPEAT 128 ;;0xxx xxxx
              dd ValidateUnicodeFail
        END REPEAT
        REPEAT 64 ;;10xx xxxx
              dd ValidateUnicodeNext3
        END REPEAT
        REPEAT 64 ;;11xx xxxx
              dd ValidateUnicodeFail
        END REPEAT
ValidateUnicodeNext5:
        REPEAT 128 ;;0xxx xxxx
              dd ValidateUnicodeFail
        END REPEAT
        REPEAT 64 ;;10xx xxxx
              dd ValidateUnicodeNext4
        END REPEAT
        REPEAT 64 ;;11xx xxxx
              dd ValidateUnicodeFail
        END REPEAT

;; IN: EDX:EAX unicode char
;; OUT: Length or NULL (illegal character)
align 16
ValidateUnicode:
        PUSH    ebx
        PUSH    esi
        XOR    ebx, ebx ;; =0
        MOV    esi, ValidateUnicodeState ;; start state
.Again:
        MOVZX  ecx, al ;; load byte
        ADD    ebx, 1 ;; increment length
        SHRD    eax, edx, 8 ;; prepare next byte
        MOV    esi, ;; load next state
        SHR    edx, 8
        CMP    esi, ValidateUnicodeFail ;;next state?
        JB      .Again
        JMP    esi ;; fail or success
ValidateUnicodeFail:
        XOR    ebx, ebx
ValidateUnicodeSuccess:
        MOV    eax, ebx
        POP    esi
        POP    ebx
        RET 0

section '.idata' import data readable writeable
                library kernel,'KERNEL32.DLL',\
                        msvcrt,'msvcrt.dll',\
                        user,'USER32.DLL'
                import kernel,\
                      GetCurrentProcess, 'GetCurrentProcess',\
                      SetPriorityClass, 'SetPriorityClass',\
                      GetCurrentThread, 'GetCurrentThread',\
                      SetThreadPriority, 'SetThreadPriority',\
                      GetTickCount,'GetTickCount',\
                      ExitProcess,'ExitProcess'
                import msvcrt,\
                      printf,'printf',\
                      system,'system'
                import user,\
                      wsprintf,'wsprintfA',\
                      MessageBox,'MessageBoxA'
Posted on 2010-01-18 07:53:43 by r22
Homer,

Good exercise.
Not exactly UTF-8 encoding form (as Unicode standard defines it), but nevertheless (FASM too)

        format  PE console
        include "Win32A.Inc"

; char * __cdecl decode_utf_8(char *utf_8, int *utf_32);
_decode_utf_8:
        push    esi
        mov    esi,
        movzx  eax, byte
        inc    esi
        mov    ecx, 7
clear:  btr    eax, ecx
        dec    ecx
        js      error          ; all bits are non-zero
        jc      clear
; ecx==6, 5, 4, 3, 2, 1, 0 -- one less than zero bit index
        sub    ecx, 5
        jg      done            ; 0xxxxxxx, done
        je      error          ; 10xxxxxx, error
; ecx==-1, -2, -3, -4, -5 -- looks like good loop counter
        sub    esi, ecx        ; adjust pointer
next:  movzx  edx, byte
        test    dl, 0C0h
        jns    error          ; 0xxxxxxx, error
        jpe    error          ; 11xxxxxx, error
        and    edx, 3Fh
        shl    eax, 6
        or      eax, edx
        inc    ecx
        jnz    next
done:  mov    ecx,
        mov    , eax
        mov    eax, esi
        pop    esi
        ret
error:  or      eax, -1
        jmp    done

prompt  db    "Enter UTF-8 code units (hex, finish with any non-hex)", 10, 0
input_fmt db  "%2x", 0
banner  db    "Decoded as:", 10, 0
output_fmt db  "%#x", 10, 0

entry $
        cinvoke _printf, prompt
        mov    edi, utf_8
read:  cinvoke _scanf, input_fmt, edi
        add    edi, eax
        cmp    edi, utf_8_end
        jae    decode
        test    eax, eax
        jnz    read

decode: mov    esi, utf_8
        cinvoke _printf, banner
more:  ccall  _decode_utf_8, esi, utf_32
        mov    esi, eax
        cinvoke _printf, output_fmt,
        cmp    esi, edi
        jb      more
        cinvoke _getch
        ret

        data import
        library MSVCRT, "MSVCRT"
        import  MSVCRT,\
                _printf, "printf",\
                _scanf, "scanf",\
                _getch, "_getch"
        end data

        align  4
utf_32  rd      1
utf_8  rb      100
utf_8_end = $


Unicode defines only 010FFFF code points range, thus (well-formed) UTF-8 code unit sequence is no longer than four bytes. Detecting ill-formed byte sequence is another story.
Posted on 2010-01-18 16:40:55 by baldr
What's the goal in this exercise? Fastest, smallest, most elegant...?
Posted on 2010-01-19 01:54:12 by Scali
The goal is speed, unless otherwise mentioned.
Most of the time, if we're writing asm code, we're interested in making something faster.
People generally care little about size these days (the old size matters arguments are becoming meaningless anecdotes).
They don't care if the app is 5MB or 50 MB, but they'd like to run the faster one, because the size of some code is usually invisible to the user, but the speed of a slow application is painfully obvious to the user.
As an example, RadASM comes to a snail pace with files above 50k lines, even on a fast machine.


Posted on 2010-01-19 02:44:29 by Homer
I think state machine will come out as a top performer. I was thinking of a more hard coded solution, but then a lot of branching becomes unavoidable, state machine has 6 CMP's worst case. Although the SHRD instruction is fairly slow I think a mitigated any partial stalls.
Posted on 2010-01-19 08:11:18 by r22
Well, it took me a while to get around to it (At first I didn't realize this was a follow up post to "Small Challenge") and I haven't really done any programming lately.

Anyway, here's my version - it's more or less a variation of r22's but I reduced the table size and tried to minimize jumps.
Thanks for the base code to get me going - it made it a lot easier to figure out what I was supposed to be doing - hopefully I've got it right ;-).

I still haven't worked on any timing code yet, so I don't really know how well it performs in comparison.  I also haven't  checked too much invalid data, but I don't forsee any problems.

format PE console
include 'win32a.inc'
entry start

section ".code" code readable executable
start:
    mov eax,0
    call ValidateUnicode
    push eax
    push szFormat
    call ;; 1

    mov eax,80C0h
    call ValidateUnicode
    push eax
    push szFormat
    call ;; 2

    mov eax,8080E0h
    call ValidateUnicode
    push eax
    push szFormat
    call ;; 3

    mov eax,808080F0h
    call ValidateUnicode
    push eax
    push szFormat
    call ;; 4

    mov eax,808080F8h
    mov edx,80h
    call ValidateUnicode
    push eax
    push szFormat
    call ;; 5

    mov eax,808080FCh
    mov edx,8080h
    call ValidateUnicode
    push eax
    push szFormat
    call ;; 6

    mov eax,0FFh
    mov edx,8080h
    call ValidateUnicode
    push eax
    push szFormat
    call ;; 0

    mov eax,80h
    call ValidateUnicode
    push eax
    push szFormat
    call ;; 0

    mov eax,0C0h
    call ValidateUnicode
    push eax
    push szFormat
    call ;; 0

    mov eax,808080FDh
    mov edx,7080h
    call ValidateUnicode
    push eax
    push szFormat
    call ;; 0

    push 0
    call
align 8

;; IN: EDX:EAX unicode char
;; OUT: Length or NULL (illegal character)
align 16
ValidateUnicode:
    movzx ecx,al
    xor ecx,11111111b
    or  ecx,1
    bsr ecx,ecx
    and eax,
    xor eax,
    jnz .long
    mov eax,
    retn
  .long:
    and edx,
    xor edx,eax
    jnz .error
    mov eax,
    retn
  .error:
    xor eax,eax
    retn
align 8
.andtable  dd 0,0,0
            dd 11000000110000001100000011111000b
            dd 00000000110000001100000011110000b
            dd 00000000000000001100000011100000b
            dd 0
            dd 00000000000000000000000010000000b
.xortable  dd -1
            dd 00000000000000001000000010000000b
            dd 00000000000000000000000010000000b
            dd 10000000100000001000000011110000b
            dd 00000000100000001000000011100000b
            dd 00000000000000001000000011000000b
            dd -1
            dd 00000000000000000000000000000000b
.andtable2 dd 0
            dd 00000000000000001100000011000000b
            dd 00000000000000000000000011000000b
            dd 0,0,0,0
.lentable  dd 0,6,5,4,3,2,0,1

section ".data" data readable writeable
szFormat  db 'eax is %X',13,10,0

section '.idata' import data readable writeable
library kernel,'KERNEL32.DLL',\
        msvcrt,'msvcrt.dll'
import kernel,\
        ExitProcess,'ExitProcess'
import msvcrt,\
        printf,'printf'
Posted on 2010-02-11 23:20:11 by sysfce2