;=============================================================================================
; Debug helpers, enumerations and table structures for the GOLD parser engine.
; Syntax: MASM (32-bit, Intel operand order) with ObjAsm32-style macros.
;=============================================================================================

; wsprintf never writes more than 1024 bytes, so a buffer of that size cannot overflow.
DBGBUF_SIZE equ 1024

.data
dbgbuf db DBGBUF_SIZE dup (?)           ; scratch buffer shared by every DbgPrint expansion

.code

; DbgPrint format, window, args...
; Formats 'args' according to 'format' into dbgbuf and sends the result to DbgStr.
; All general purpose registers are preserved around the wsprintf call.
DbgPrint macro format, window, args:VARARG
    pushad
    @invoke wsprintf,addr dbgbuf,format, args
    popad
    DbgStr dbgbuf,,window
endm

; DbgToken token, window
; Prints a one-line description of the Token at 'token'.
; Requires a byte buffer named 'buf' in the expanding scope (it receives the ASCII symbol name).
; All general purpose registers are preserved.
DbgToken macro token, window
    pushad
    mov eax,token
    mov edx,[eax].Token.ParentSymbol
    .if [edx].Symbol.Kind==STERMINAL
        pushad
        invoke GoAscii,addr buf,[edx].Symbol.sName
        popad
        mov edx,eax                     ; edx = token address for the %lX field
        ; TokenData of a terminal is a wide string, so it needs %ls rather than %s
        DbgPrint "Token %lX: Type=%s Terminal Data=%ls",window ,edx,addr buf, [eax].Token.TokenData
    .elseif [eax].Token.TokenData!=0
        pushad
        invoke GoAscii,addr buf,[edx].Symbol.sName
        popad
        mov edx,eax
        DbgPrint "Token %lX: Type=%s NonTerminal Reduction[%lX]",window ,edx,addr buf, [eax].Token.TokenData
    .else
        pushad
        invoke GoAscii,addr buf,[edx].Symbol.sName
        popad
        mov edx,eax                     ; print the token address here too, like the other branches
        DbgPrint "Token %lX: Type=%s NonTerminal",window ,edx,addr buf
    .endif
    popad
endm

;Encoding of input plaintext file
TEXT_ANSI           equ 0
TEXT_UTF16_LE       equ 1
TEXT_UTF16_BE       equ 2

;Return values for Parse method
MessageTokenRead        equ 1
MessageReduction        equ 2
MessageAccept           equ 3
MessageNotLoaded        equ 4
MessageLexicalError     equ 5
MessageSyntaxError      equ 6
MessageCommentError     equ 7
MessageInternalError    equ 8
MessageShift            equ 9

;Symbol Types:
;Note that only the first two appear in the LALR stage...
;The rest are filtered / removed in the DFA stage.
SNONTERMINAL    equ 0   ;SymbolTypeNonterminal   Normal Nonterminal
STERMINAL       equ 1   ;SymbolTypeTerminal      Normal Terminal
SWHITESPACE     equ 2   ;SymbolTypeWhitespace    Whitespace Terminal
SENDOFFILE      equ 3   ;SymbolTypeEnd           End Character - End of File. This symbol is used to
                        ;                        represent the end of the file or the end of the source input.
SCOMMENTSTART   equ 4   ;SymbolTypeCommentStart  Start of a block quote
SCOMMENTEND     equ 5   ;SymbolTypeCommentEnd    End of a block quote
SCOMMENTLINE    equ 6   ;SymbolTypeCommentLine   Line Comment Terminal
SERROR          equ 7   ;SymbolTypeError         Error Terminal. If the parser encounters an error reading
                        ;                        a token, this kind of symbol can be used to differentiate
                        ;                        it from other terminal types.

;;STRUCTURES:::::::::

Symbol struct
    Kind    WORD    ?   ;value from above enumeration
    sName   LPWSTR  ?
Symbol ends

Rule struct
    Nonterminal         WORD    ?   ; Each rule derives a single nonterminal symbol. This field contains
                                    ; the index of the symbol in the Symbol Table.
    SymbolIndices       Pointer ?   ; Address of an array which holds the rule's indices.
    SymbolIndexCount    WORD    ?   ; Total number of indices held by the above array.
                                    ; NOT IN SPECIFICATION OF GOLD.
Rule ends

; The DFA stage is where 'characters' are input to the parser.
; It is a state engine, the current state is described by the struct below.
; We load an array of these structs from the CGT file.
; Each DFAState contains an array of Edges, and may or may not accept a terminal symbol.
DFAState struct
    Edges           Pointer ?   ; Flat array of Edge structs
    wEdgeCount      WORD    ?   ; Total number of elements in the above array.
    wAcceptIndex    WORD    ?   ; If the state accepts a terminal symbol, this field will contain the
                                ; symbol's index in the Symbol Table. Otherwise, the value in this
                                ; field should be ignored.
    bAcceptState    BYTE    ?   ; Boolean: Each DFA state can accept one of the grammar's terminal
                                ; symbols. If the state accepts a terminal symbol, the value will be
                                ; set to True and the Accept Index parameter will contain the symbol's index.
DFAState ends

; An Edge is a struct which associates a CharacterSet with a DFAState, by indices.
Edge struct
    wCharSetIndex   Word ?      ; Index of character set
    wTargetIndex    Word ?      ; Next DFA State
Edge ends

; The LALR stage is where 'tokens' are input to the parser.
; It is a state engine, the current state is described by the struct below.
; We load an array of these structs from the CGT file.
; Each LALR state contains an array of Actions.
LALR struct
    Actions         Pointer ?   ; Flat array of Action structs
    wActionCount    WORD    ?   ; Total number of elements in the above array.
LALR ends

;An Action tells the LALR stage what to do, given a specific Symbol.
;It could perform a SHIFT, GOTO, REDUCE or ACCEPT.
Action struct
    wSymbolIndex    Word ?      ; Index in the Symbol Table.
    wActionType     Word ?      ; Represents the action for LALR stage to take based on the symbol.
    wTarget         Word ?      ; Index of next LALR State, or ReductionRule Symbol.
Action ends

;A token is an 'instance of a Symbol'.
;Terminals store their string in the TokenData field.
;NonTerminals normally store NULL in this field,
;the exception being those tokens created to represent a Reduction..
;these NonTerminals will store a ptr to a Reduction in this field.
Token struct
    ParentSymbol    dd ?        ; -> Symbol
    TokenData       dd ?        ; LPWSTR if Symbol is terminal, otherwise -> Reduction
    State           dd ?        ; is a LALR state.
Token ends

;A Reduction is a collection of tokens taken from the input stream
;and replaced with a single nonterminal representing the replacement.
;NOTE(review): the original comment said "14 bytes"; two Pointers plus one word
;is 10 bytes if Pointer is 4 bytes wide - confirm against the Pointer typedef.
Reduction Struct
    ParentRule  Pointer ?       ; is a Rule address
    Tokens      Pointer ?       ; Pointer to the beginning of the reduction's tokens.
    TokenCount  dw      ?       ; integer
Reduction endS

;LALR Actions:
ACTIONSHIFT     equ 1   ; This action indicates the symbol is to be shifted.
                        ; The Target field will contain the index of the LALR State that the
                        ; parsing engine will advance to.
ACTIONREDUCE    equ 2   ; This action denotes that the parser can reduce a rule.
                        ; The Target field will contain the index of the rule in the Rule Table.
ACTIONGOTO      equ 3   ; This action is used when a rule is reduced and the parser jumps to the
                        ; state that represents the shifted nonterminal.
                        ; The Target field will contain the index of the LALR State that the
                        ; parsing engine will jump to after a reduction is completed.
ACTIONACCEPT    equ 4   ; When the parser encounters the Accept action for a given symbol, the
                        ; source text is accepted as correct and complete.
                        ; In this case, the Target field is not needed and should be ignored.
;parsetoken consts:
PARSEACCEPT         equ 1
PARSESHIFT          equ 2
PARSEREDUCENORMAL   equ 3
PARSEREDUCETRIMMED  equ 4
PARSESYNTAXERROR    equ -1
PARSEINTERNALERROR  equ -2

;Get address of Array Element by (byte or word sized) index
;ElementName = datatype / structname (anything we can apply sizeof)
;ArrayBase   = ptr to base of array memory
;wIndex      = 8 or 16-bit index (variable or register)
;Returns EAX = ArrayBase + wIndex * sizeof ElementName. Clobbers EDX.
Get_Element macro ElementName, ArrayBase,wIndex
    movzx edx, wIndex
    push ArrayBase
    mov eax, sizeof ElementName
    mul edx
    pop edx
    add eax, edx
endm

;Function-style form of Get_Element; the expansion evaluates to EAX.
;The <eax> operand of exitm had been lost from the source text; without it the
;macro yields nothing and every "mov x,$Get_Element(...)" fails to assemble.
$Get_Element macro ElementName, ArrayBase,wIndex
    Get_Element ElementName, ArrayBase,wIndex
    exitm <eax>
endm

;===================================
;Helpers to read data from CGT file
;===================================

;GET SHORT
;pb = variable holding the read pointer, ps = 16-bit destination.
;Skips the one-byte entry id, stores the following word in ps, advances pb by 3.
;Clobbers EAX.
getvsh macro pb,ps
    inc pb                      ;skip entry id
    mov eax,pb
    mov ax,word ptr[eax]
    mov ps,ax
    add pb,2
endm

;GET BYTE
;pb = variable holding the read pointer, pv = 8-bit destination.
;Skips the one-byte entry id, stores the following byte in pv, advances pb by 2.
;AL still holds the byte afterwards (LoadGrammarFile relies on this).
getvb macro pb, pv
    inc pb                      ;skip entry id
    mov eax,pb
    mov al,byte ptr[eax]
    mov byte ptr pv,al
    inc pb
endm

;===================================
;Helper to allocate flat arrays
;what = datatype (can be a struct name)
;many = #elements
;Evaluates to EAX = pointer returned by MemAlloc (zero-initialised request).
$AllocArray macro what, many
    mov eax,sizeof what
    mul many
    MemAlloc eax,MEM_INIT_ZERO
    exitm <eax>                 ;operand restored, see $Get_Element
endm

;Convert string: ascii to wide
;Returns EAX = newly allocated wide string (each input byte zero-extended to a word).
;The caller releases it with MemFree.
ToWide proc uses ecx stringin
    LOCAL stringout
    invoke lstrlen,stringin
    inc eax                     ;room for the terminator
    shl eax,1                   ;two bytes per character
    mov stringout,$MemAlloc(eax)
    mov edx,stringin
    xor ecx,ecx
    .while byte ptr[edx]!=0
        mov cl,byte ptr[edx]
        mov word ptr [eax],cx   ;eax still holds the write pointer returned by MemAlloc
        inc edx
        add eax,2
    .endw
    mov word ptr [eax],0
    mov eax,stringout
    ret
GoWideDone_unused equ 0
ToWide endp

;Convert string: wide to ascii (not recommended)
;Copies the low byte of every UTF-16 code unit of stringin to stringout and
;terminates it with a zero byte. stringout must be large enough.
;The terminator test looks at the whole 16-bit code unit: testing only the low
;byte would truncate the string at any character whose low byte is zero.
GoAscii proc uses ecx stringout, stringin
    mov eax,stringin
    mov edx,stringout
    .while word ptr[eax]!=0
        mov cx,word ptr[eax]
        mov byte ptr [edx],cl
        inc eax
        inc eax
        inc edx
    .endw
    mov byte ptr [edx],0
    ret
GoAscii endp

;Compare widestrings
;Returns EAX = 0 (match) or eax==-1
WideCompare proc uses ecx p1, p2
    mov eax,p1
    mov edx,p2
    DbgUStr p1
    DbgUStr p2
    mov cx,-1
    .while word ptr[eax]!=0 && word ptr[edx]!=0
        mov cx,word ptr[eax]
        .if cx!=word ptr[edx]
            .break
        .endif
        add eax,2
        add edx,2
    .endw
    ;Equal only when both strings ended at the same position
    .if word ptr[eax]==0 && word ptr[edx]==0
        xor eax,eax
    .else
        mov eax,-1
    .endif
    ret
WideCompare endp

;Length of WideString
;Returns EAX = Length of WideString in bytes, including terminator
WideLen proc p1
    push edx
    mov edx,p1
    xor eax,eax
    .while word ptr[edx+eax*2]!=0
        inc eax
    .endw
    inc eax                     ;count the terminator
    shl eax,1                   ;characters -> bytes
    pop edx
    ret
WideLen endp

;Search the given widestring for given Char (UTF16-LE CodePoint)
;Returns EAX = Character (non zero) if found, else zero. Clobbers ECX and EDX.
LookChar2 Proc Character:Word, AddrUStr:Dword
    movzx eax, Character
    mov edx, AddrUStr
@@:
    mov cx, [edx]
    add edx, 2
    cmp ax, cx
    je EndWithSuccess
    or cx, cx                   ; is zero?
    jnz @B
    xor eax, eax                ; indicate failure.
EndWithSuccess:
    ret
LookChar2 endp

;Duplicate a WideString
;Returns EAX = pNewString
;        EBX = pNextData (pointer to just PAST the input string)
;        EDX = #Bytes in string
;EBX is deliberately NOT preserved: callers must save it if they need it.
CloneWideString Proc addrStart:LPWSTR
    LOCAL dLen
    mov ebx,addrStart
    mov dLen,$invoke(WideLen,ebx)
    add ebx,eax
    push $MemAlloc(eax)
    invoke RtlMoveMemory,eax,addrStart,dLen
    pop eax
    mov edx,dLen
    ret
CloneWideString EndP

;=============================================================================================
Var equ DefineVariable

Object Parser,2353453,Primer
    RedefineMethod  Init,               Pointer
    RedefineMethod  Done
    StaticMethod    DestroyParseTree,   Pointer
    StaticMethod    LoadGrammarFile,    LPSTR
    StaticMethod    LoadProgramFile,    LPSTR
    StaticMethod    Parse
    StaticMethod    GetSymbolByName,    LPWSTR

    Private
    StaticMethod    Initialize
    StaticMethod    CreateToken,        dword
    StaticMethod    DiscardRestOfLine
    StaticMethod    GetTopToken,        dword
    StaticMethod    ParseToken,         Pointer
    StaticMethod    PushToken,          dword,Pointer
    StaticMethod    PopToken,           dword
    StaticMethod    RetrieveToken
    StaticMethod    Reset
    StaticMethod    ResetStacks
    PrivateEnd

    ;Variables used for DFA Scanner / tokenizer phase
    Var TextEncodingType
    Var StreamCursor
    Var StreamStart
    Var StreamEnd
    Var CurrentLine
    Var CurrentCol
    Var SysLine
    Var SysCol
    Var CommentLevel
    Var CurrentDFAState, Pointer
    Var CurrentLALRState, Pointer
    Var LastSuccessfulTokenString

    ;Major Arrays
    Var CharSets,Pointer
    Var Symbols, Pointer
    Var Rules, Pointer
    Var DFAStates, Pointer
    Var LALRs, Pointer

    ;Major Array Counts
    Var CharacterSetTableCount, WORD
    Var SymbolTableCount, WORD
    Var RuleTableCount, WORD
    Var DFATableCount, WORD
    Var LALRTableCount, WORD

    TOKENSTACKELEMENTCOUNT  equ 128
    TOKENSTACKSIZE          equ TOKENSTACKELEMENTCOUNT * sizeof Token

    Var ErrExpStrList, LPSTR, 128 dup ()
    Var CurrentReduction, Reduction, {<>}
    Var ErrExpectedCount
    Var ReduceRule

    ;Informational strings from CGT file
    Var GrammarName,LPWSTR
    Var GrammarVersion,LPWSTR
    Var GrammarAuthor,LPWSTR
    Var GrammarAbout,LPWSTR

    ;Initial state
    Var wStartSymbol, WORD
    Var InitDFA, WORD
    Var InitLALR, WORD

    ;Booleans
    Var bTablesLoaded, BYTE, FALSE      ;byte-length booleans
    Var bCaseSensitive, BYTE, FALSE
    Var bStreamIsFile, BYTE, FALSE

    ;Output
    Var ParseTree,Pointer, NULL         ;-> root Reduction

    Embed LALR_TokenStack,DataCollection
    Embed Input_TokenStack,DataCollection
    Embed Reductions,DataCollection
ObjectEnd

;=============================================================================================
;Initialise the ancestor and the three embedded collections.
Method Parser.Init,uses esi,pOwner
    SetObject esi
    ACall esi.Init,pOwner
    OCall [esi].LALR_TokenStack::DataCollection.Init,pOwner,16,256,-1
    OCall [esi].Input_TokenStack::DataCollection.Init,pOwner,16,256,-1
    OCall [esi].Reductions::DataCollection.Init,pOwner,16,256,-1
MethodEnd

.data
DefaultToken Token <>           ;static scratch token handed out by CreateToken
.code

FORINPUT equ 0
FORSTACK equ 1

;=========================================================================================
;Note: called at end of 'LoadGrammarFile' - do not call!
Method Parser.Initialize,uses esi
    ;Create a new token:
    ;Set the State property to the Initial-LALR-State.
    ;Set the Parent-Symbol property to the Start-Symbol.
    ;Push onto the Token-Stack.
    SetObject esi
    OCall esi.Reset
    push $Get_Element (Symbol,[esi].Symbols, [esi].wStartSymbol)
    OCall esi.CreateToken, FORSTACK
    pop [eax].Token.ParentSymbol
    push eax
    mov [esi].CurrentLALRState, $Get_Element(LALR,[esi].LALRs, [esi].InitLALR)
    pop edx
    mov [edx].Token.State, eax
    OCall esi.PushToken, FORSTACK, edx

    ;Part 2: Set initial values.
    ;Other code related to setting line counters, etc... can also be added to this procedure.
    ;Set the Comment-Level to 0.
    mov [esi].CommentLevel, 0
MethodEnd

;Reset the three Major Stacks
;Every token is removed and freed; terminal tokens also own their string.
Method Parser.ResetStacks,uses esi edi ebx
    SetObject esi

    DbgDec [esi].LALR_TokenStack.dCount,"to be released"
    .while [esi].LALR_TokenStack.dCount!=0
        OCall [esi].LALR_TokenStack::DataCollection.DeleteAt,0
        push eax
        mov edx,[eax].Token.ParentSymbol
        .if [edx].Symbol.Kind==STERMINAL && [eax].Token.TokenData!=0
            MemFree [eax].Token.TokenData
        .endif
        pop eax
        MemFree eax
    .endw
    DbgText "LALR_TokenStack freed"

    DbgDec [esi].Input_TokenStack.dCount,"to be released"
    .while [esi].Input_TokenStack.dCount!=0
        OCall [esi].Input_TokenStack::DataCollection.DeleteAt,0
        push eax
        mov edx,[eax].Token.ParentSymbol
        .if [edx].Symbol.Kind==STERMINAL && [eax].Token.TokenData!=0
            MemFree [eax].Token.TokenData
        .endif
        pop eax
        MemFree eax
    .endw
    DbgText "Input_TokenStack freed"

    DbgDec [esi].Reductions.dCount,"to be released"
    .while [esi].Reductions.dCount!=0
        OCall [esi].Reductions::DataCollection.DeleteAt,0
        DbgHex eax
        push eax
        mov edi,eax
        DbgDec [edi].Reduction.TokenCount,"to be released"
        xor ebx,ebx
        ;For each token in reduction
        .while bx<[edi].Reduction.TokenCount
            Get_Element Token, [edi].Reduction.Tokens, bx
            mov edx,[eax].Token.ParentSymbol
            .if [edx].Symbol.Kind==STERMINAL && [eax].Token.TokenData!=0
                push eax
                DbgUStr [eax].Token.TokenData,"releasing terminal string"
                MemFree [eax].Token.TokenData       ;Free terminal token string
                pop eax
            .endif
            inc ebx
        .endw
        pop eax
        .if [eax].Reduction.Tokens!=0
            push eax
            MemFree [eax].Reduction.Tokens          ;Free reduction tokens array
            pop eax
        .endif
        MemFree eax                                 ;Free reduction
    .endw
    DbgText "Reductions freed"
MethodEnd

;Reset the Parser
;Clears counters, empties the three stacks and releases a file-backed input stream.
Method Parser.Reset,uses esi
    ;DbgText "reset"
    SetObject esi
    mov [esi].CurrentReduction.ParentRule, 0
    mov [esi].CurrentReduction.TokenCount, 0
    ;Other fields aren't used.
    mov [esi].ErrExpectedCount, 0
    m2m [esi].StreamCursor, [esi].StreamStart, edx
    mov [esi].CurrentLine, 0
    mov [esi].CurrentCol, 0
    mov [esi].SysLine, 0
    mov [esi].SysCol, 0
    OCall esi.ResetStacks
    .if [esi].bStreamIsFile==TRUE && [esi].StreamStart!=0
        MemFree [esi].StreamStart
        ;The buffer is gone: drop every reference to it so nothing scans freed memory
        mov [esi].StreamStart,0
        mov [esi].StreamCursor,0
        mov [esi].StreamEnd,0
        mov [esi].bStreamIsFile,FALSE
    .endif
    ;NOTE(review): the tree pointer is only forgotten here, not released - confirm who owns it.
    .if [esi].ParseTree!=0
        mov [esi].ParseTree,0
    .endif
MethodEnd

;Load grammar tables from .CGT File
;Returns EAX = TRUE when the whole file was consumed, FALSE otherwise.
;On success the parser is initialised (Parser.Initialize) and bTablesLoaded is set.
Method Parser.LoadGrammarFile, uses esi edi ebx, GrammarFile:LPSTR
    Local FileMem:dword
    Local FileLen:Dword
    Local tempFileMem:DWord
    Local MemEnd:Dword
    LOCAL File:Pointer
    LOCAL nEntries:word
    LOCAL nIndex:word
    LOCAL RecType:byte

    SetObject esi
    .IF [esi].bTablesLoaded!=FALSE
        ;No support for INCLUDE yet
        DbgWarning "Can't load another grammar file within the session, sorry."
        return FALSE
    .ENDIF

    ;***********load the file*************
    invoke GetFileAttributes,GrammarFile
    .if eax==-1
        DbgWarning "Grammar File does not exist"
        return FALSE
    .else
        mov File,$New(DiskStream,Init,esi,GrammarFile)
        mov FileLen,$OCall(File::DiskStream.GetSize)
        mov FileMem,$MemAlloc(eax,MEM_INIT_ZERO)
        OCall File::DiskStream.BinRead, FileMem, FileLen
        .if eax==0
            DbgWarning "Failed to read grammar file."
            MemFree FileMem             ;do not leak the buffer and the stream on failure
            Destroy File
            xor eax,eax
            ExitMethod
        .endif
    .endif

    ;***********check header*******************
    push $invoke (ToWide, $OfsCStr("GOLD Parser Tables/v1.0"))  ;returns a HeapAllocated string
    invoke WideCompare, eax, FileMem
    pop edx
    push eax
    MemFree edx                         ;<--- release string
    pop eax
    .if eax!=0
        DbgWarning "Wrong grammar file header."
        MemFree FileMem                 ;do not leak the buffer and the stream on failure
        Destroy File
        return FALSE
    .endif

    ;********************************************
    mov eax, FileMem
    add eax, FileLen
    mov MemEnd, eax
    mov eax, FileMem
    add eax, 48                         ; 48th reads 'M'
    mov tempFileMem, eax

    .WHILE 1
        mov eax, tempFileMem
        .if eax>=MemEnd
            .IF eax>MemEnd
                DbgWarning "Stupid parsing error occured while loading the grammar file. Sorry."
                xor eax, eax
            .ELSE
                mov eax, TRUE
            .ENDIF
            .break
        .elseif byte ptr[eax]!="M"
            ;Each Record begins with this identifier (='MultiType')
            DbgWarning "Unexpected File Format"
            DbgMem tempFileMem,256
            mov eax, FALSE
            .break
        .endif

        ; read number of entries in record
        getvsh tempFileMem, nEntries
        getvb tempFileMem,RecType       ;leaves the record type in AL

        .IF al=='P'
            ;Parameter information.
            mov ebx, tempFileMem
            ;Fetch informational strings
            inc ebx                     ; skip 'S' entry identifier (=String)
            mov [esi].GrammarName, $invoke(CloneWideString, ebx)
            inc ebx
            mov [esi].GrammarVersion, $invoke(CloneWideString, ebx)
            inc ebx
            mov [esi].GrammarAuthor, $invoke(CloneWideString, ebx)
            inc ebx
            mov [esi].GrammarAbout, $invoke(CloneWideString, ebx)
            mov tempFileMem, ebx
            getvb tempFileMem,[esi].bCaseSensitive
            getvsh tempFileMem,[esi].wStartSymbol
            DbgUStr [esi].GrammarName
            DbgUStr [esi].GrammarVersion
            DbgUStr [esi].GrammarAuthor
            DbgUStr [esi].GrammarAbout
            DbgLine

        .ELSEIF al=='T'
            ;TableCount entry
            ;Allocate 'Major' Arrays:
            ;- Symbols
            getvsh tempFileMem, [esi].SymbolTableCount
            mov [esi].Symbols, $AllocArray (Symbol, [esi].SymbolTableCount)
            ;- CharSets
            getvsh tempFileMem, [esi].CharacterSetTableCount
            mov [esi].CharSets, $AllocArray (LPWSTR, [esi].CharacterSetTableCount)
            ;-Rules
            getvsh tempFileMem, [esi].RuleTableCount
            mov [esi].Rules, $AllocArray (Rule, [esi].RuleTableCount)
            ;-DFA States
            getvsh tempFileMem, [esi].DFATableCount
            mov [esi].DFAStates, $AllocArray (DFAState, [esi].DFATableCount)
            ;-LALR States
            getvsh tempFileMem, [esi].LALRTableCount
            mov [esi].LALRs, $AllocArray (LALR, [esi].LALRTableCount)

        .ELSEIF al=='I'
            ; Initial State indices
            getvsh tempFileMem, [esi].InitDFA
            getvsh tempFileMem, [esi].InitLALR

        .ELSEIF al=='C'
            ; CharSet entry
            getvsh tempFileMem, nIndex
            inc tempFileMem
            invoke CloneWideString, tempFileMem     ;duplicate the character set string
            mov tempFileMem,ebx                     ;fixup for CloneWideString
            ;Push the clone exactly once: the old code pushed it twice and popped it
            ;once, leaving a stray dword on the stack for every CharSet record.
            push eax
            Get_Element LPWSTR,[esi].CharSets,nIndex
            pop dword ptr [eax]
            ;DbgUStr [eax]

        .ELSEIF al=='S'
            ; Symbol entry
            getvsh tempFileMem, nIndex
            inc tempFileMem
            mov ebx,tempFileMem
            invoke CloneWideString, tempFileMem     ;duplicate symbol namestring
            mov tempFileMem, ebx                    ;fixup for CloneWideString
            push eax
            Get_Element Symbol,[esi].Symbols,nIndex ;look up array element (Symbol)
            pop [eax].Symbol.sName                  ;store duplicated namestring
            ;DbgUStr [eax].Symbol.sName
            mov edx,eax
            getvsh tempFileMem, [edx].Symbol.Kind   ;store kind of Symbol

        .ELSEIF al=='R'
            ;Rule entry
            ;Rules may contain 0 or more 'Symbol Indices'
            getvsh tempFileMem, nIndex
            Get_Element Rule, [esi].Rules, nIndex
            mov edi,eax
            assume edi:ptr Rule
            getvsh tempFileMem, [edi].Nonterminal
            mov ax,nEntries
            sub ax,4                                ;#symbol indices in this Rule
            mov [edi].SymbolIndexCount,ax
            mov [edi].SymbolIndices,$AllocArray(WORD,[edi].SymbolIndexCount)
            xor ebx,ebx
            inc tempFileMem                         ;skip 'E' record identifier
            .while bx<[edi].SymbolIndexCount
                Get_Element WORD,[edi].SymbolIndices,bx
                mov edx,eax
                getvsh tempFileMem,[edx]
                inc bx
            .endw
            Assume edi: Nothing

        .ELSEIF al=='D'
            ; DFA State entry
            getvsh tempFileMem, nIndex
            Get_Element DFAState,[esi].DFAStates,nIndex
            mov edi,eax
            getvb tempFileMem, [edi].DFAState.bAcceptState
            getvsh tempFileMem, [edi].DFAState.wAcceptIndex
            inc tempFileMem                         ; reserved
            ;Calculate #Edges = (Entries-5)/3
            movzx eax,nEntries
            sub eax,5
            mov ebx,3
            xor edx,edx
            div ebx
            mov [edi].DFAState.wEdgeCount,ax
            .if eax>0
                ;Allocate Edges array
                mov [edi].DFAState.Edges,$AllocArray(Edge,[edi].DFAState.wEdgeCount)
                ;Read in the Edges
                xor ebx,ebx
                .while bx<[edi].DFAState.wEdgeCount
                    Get_Element Edge,[edi].DFAState.Edges, bx
                    mov edx,eax
                    getvsh tempFileMem, [edx].Edge.wCharSetIndex
                    getvsh tempFileMem, [edx].Edge.wTargetIndex
                    inc tempFileMem                 ;reserved
                    inc bx
                .endw
            .else
                mov [edi].DFAState.Edges,NULL
            .endif

        .ELSEIF al=='L'
            ; LALR State entry
            getvsh tempFileMem, nIndex
            inc tempFileMem                         ; reserved
            Get_Element LALR,[esi].LALRs,nIndex
            mov edi,eax
            ;Calculate #Actions = (nEntries-3)/4
            movzx edx,nEntries
            sub edx,3
            shr edx,2
            mov [edi].LALR.wActionCount,dx
            ;Allocate array of Actions for this LALR_State
            mov [edi].LALR.Actions,$AllocArray(Action, [edi].LALR.wActionCount)
            xor ebx,ebx
            .while bx<[edi].LALR.wActionCount
                Get_Element Action,[edi].LALR.Actions,bx
                mov edx,eax
                getvsh tempFileMem, [edx].Action.wSymbolIndex
                getvsh tempFileMem, [edx].Action.wActionType
                getvsh tempFileMem, [edx].Action.wTarget
                inc tempFileMem                     ;reserved
                inc ebx
            .endw

        .ELSE
            DbgWarning "Unidentified multitype byte is encountered. Probably the grammar file is corrupt."
            ;Dump the file around the offending record (EAX holds the record type
            ;character here, not an address, so take the read pointer instead)
            mov eax,tempFileMem
            sub eax,4
            DbgMem eax,64
            mov eax, FALSE
            .break
        .ENDIF
    .ENDW

Finito:
    push eax
    MemFree FileMem
    Destroy File
    pop eax                             ;return value
    .if eax!=FALSE
        ;Only start the engine when the tables were really loaded; Initialize
        ;dereferences the Symbol and LALR tables.
        push eax
        OCall esi.Initialize
        pop eax
    .endif
    mov [esi].bTablesLoaded, al
MethodEnd

;Load the Program File into memory (all of it, this could be bufferstreamed ???)
;Returns EAX = TRUE on success, 0 when the file is missing, unreadable or uses an
;unsupported encoding (UTF-8 / UTF-32 byte order marks).
;NOTE(review): the BOM test reads up to 4 bytes from the buffer even when the file
;is shorter than that.
Method Parser.LoadProgramFile,uses esi, ProgramFile:LPSTR
    LOCAL File
    SetObject esi
    invoke GetFileAttributes,ProgramFile
    .if eax==-1
        DbgWarning "Program File does not exist"
        return FALSE
    .endif

    ;Release a previously loaded program so that it is not leaked
    .if [esi].bStreamIsFile==TRUE && [esi].StreamStart!=0
        MemFree [esi].StreamStart
        mov [esi].StreamStart,0
    .endif

    mov File,$New(DiskStream,Init,esi,ProgramFile)
    mov [esi].StreamEnd,$OCall(File::DiskStream.GetSize)
    mov [esi].StreamStart,$MemAlloc(eax,MEM_INIT_ZERO)
    OCall File::DiskStream.BinRead, eax, [esi].StreamEnd
    .IF eax==0
        DbgWarning "Failed to open program file."
        MemFree [esi].StreamStart       ;the buffer is useless now, give it back
        xor eax,eax
        mov [esi].StreamStart, eax
        mov [esi].StreamEnd, eax
        mov [esi].StreamCursor, eax
        Destroy File
        xor eax,eax
        ExitMethod
    .ENDIF
    Destroy File

    mov eax, [esi].StreamStart
    add [esi].StreamEnd, eax            ;size -> end address
    mov [esi].StreamCursor, eax
    mov [esi].bStreamIsFile, TRUE

    ;Try to determine the type of plaintext encoding
    ;Check for BOM (Byte Order Mark)
    ;If present, check for UTF8, UTF16 and UTF32 variants
    mov edx,[esi].StreamCursor
    .if dword ptr[edx]== 0000FEFFh
        DbgWarning "UTF32 LE is unsupported"
        xor eax,eax
        ExitMethod
    .elseif dword ptr[edx]== 0FFFE0000h
        DbgWarning "UTF32 BE is unsupported"
        xor eax,eax
        ExitMethod
    .elseif word ptr[edx]==0FEFFh
        mov [esi].TextEncodingType,TEXT_UTF16_LE
        add [esi].StreamCursor,2
    .elseif word ptr[edx]==0FFFEh
        mov [esi].TextEncodingType,TEXT_UTF16_BE
        add [esi].StreamCursor,2
    .elseif byte ptr[edx]== 0EFh && word ptr [edx+1]== 0BFBBh
        DbgWarning "UTF8 is unsupported"
        xor eax,eax
        ExitMethod
    .else
        ;Assume its plain ascii
        mov [esi].TextEncodingType,TEXT_ANSI
    .endif
    mov eax,TRUE
MethodEnd

;Release the stacks, the input stream and the embedded collections.
;TODO: the grammar tables are not released here:
;   CharSets, Symbols, Rules, DFAStates, LALRs
Method Parser.Done, uses esi
    SetObject esi
    OCall esi.Reset                     ;free token stacks and input filestream
    OCall [esi].LALR_TokenStack::DataCollection.Done
    OCall [esi].Input_TokenStack::DataCollection.Done
    OCall [esi].Reductions::DataCollection.Done
    DbgWarning "DONE"
MethodEnd

;=============================================================================================
;This method scans characters from the input datastream
;and returns them as 'Tokens'
Method Parser.RetrieveToken,uses ebx esi edi
    Local Length_:Dword
    Local AcceptLength:Dword
    Local AcceptState:Dword             ;Symbol
    ;DbgText "RetrieveToken"
    SetObject esi
    Mov AcceptState, 0

    ;if Source has no more characters,
    ;Create a new Token.
    ;Set the Type property of Token = End-Of-File.
    ;**Set the parentsymbol property of token to the EOF symbol, which has the table index of 0.
    mov ecx, [esi].StreamCursor
    .if ecx>=[esi].StreamEnd
        DbgWarning "DFA reached EOF"
        mov ax,NULL
        push $Get_Element (Symbol,[esi].Symbols,ax)     ;get ptr to 'EOF' symbol
        OCall esi.CreateToken, FORINPUT
        pop [eax].Token.ParentSymbol
        ExitMethod
    .else
        Get_Element DFAState,[esi].DFAStates,[esi].InitDFA
        mov [esi].CurrentDFAState, eax
        mov Length_, 0

        ;*************
        GoForTheToken:
        ;*************
        ;Part 1:
        ;If the current state accepts a terminal, keep track of the cursor position and the state index.
        ;This will be used later.
        ;if the State accepts a terminal
        ;Set Accept-State = State.
        ;Set Accept-Length = Length.
        mov eax, [esi].CurrentDFAState
        .IF [eax].DFAState.bAcceptState==TRUE
            mov AcceptState, $Get_Element (Symbol, [esi].Symbols, [eax].DFAState.wAcceptIndex)
            m2m AcceptLength, Length_, edx
        .ENDIF

        ;Part 2:
        ;Look through the edges of the current state and advance if an edge is found that
        ;contains the character at Lookahead(Position).
        ;When no edge is found, a token is created and the loop is exited.
        ;If no Accept State was set then an error token is created.
        ;A single character is read from the Source.
        ;This allows error recovery by discarding the character that caused the error.
        ;If an Edge exists (in current DFA state) that contains the character Lookahead(Length) in Source
        mov eax, [esi].CurrentDFAState
        mov edi, [eax].DFAState.Edges
        movzx ecx, [eax].DFAState.wEdgeCount
        xor ebx, ebx
        ;NOTE(review): SOURCE TEXT LOST HERE. Everything between the '<' of this loop
        ;condition and a later '>' is missing from the file as received: the rest of
        ;RetrieveToken, its MethodEnd and the beginning of Parser.Parse. The damaged
        ;line is kept verbatim; restore this span from a pristine copy.
        .WHILE ebx0
        ;DbgWarning "Handling tokens while COMMENTED"
        OCall esi.PopToken, FORINPUT
        mov edx,[eax].Token.ParentSymbol
        mov dx, [edx].Symbol.Kind
        .IF dx==SCOMMENTSTART
            inc [esi].CommentLevel
        .ELSEIF dx==SCOMMENTEND
            dec [esi].CommentLevel
        .ELSEIF dx==SENDOFFILE
            mov eax, MessageCommentError
            ExitMethod
        .ENDIF
    .endif

    ;System in normal mode.
    OCall esi.GetTopToken, FORINPUT
    mov edx,[eax].Token.ParentSymbol
    ;DbgUStr [edx].Symbol.sName,"PROCESSING"
    mov dx, [edx].Symbol.Kind
    .IF dx==SWHITESPACE
        ;DbgWarning "Popping WhiteSpace token"
        OCall esi.PopToken, FORINPUT
        ExitMethod
    .ELSEIF dx==SCOMMENTSTART
        ;DbgWarning "Popping commented token"
        mov [esi].CommentLevel, 1
        OCall esi.PopToken, FORINPUT
        ExitMethod
    .ELSEIF dx==SCOMMENTLINE
        ;DbgWarning "Popping commented token, discarding commented line"
        OCall esi.DiscardRestOfLine
        OCall esi.PopToken, FORINPUT
        ExitMethod
    .ELSEIF dx==SERROR
        mov eax, MessageLexicalError
        ExitMethod
    .endif

    ; we can finally parse the token :)
    OCall esi.ParseToken, eax
    .IF eax==PARSESHIFT
        OCall esi.PopToken, FORINPUT
        mov eax, MessageShift
        ExitMethod
    .ELSEIF eax==PARSEREDUCENORMAL
        mov eax, MessageReduction
        ExitMethod
    .ELSEIF eax==PARSEACCEPT
        ;Generate ParseTree
        DbgWarning "Creating Parse Tree from Reductions stack"
        DbgDec [esi].LALR_TokenStack.dCount
        mov eax,[esi].Reductions.dCount
        dec eax
        mov [esi].ParseTree,$OCall([esi].Reductions::DataCollection.DeleteAt,eax)
        ; OCall esi.CreateTree,eax
        ; .if [esi].Reductions.dCount!=0
        ;     DbgWarning "Error - CreateTree failed to empty the Reductions stack"
        ;     int 3
        ; .endif
        mov eax, MessageAccept
        ExitMethod
    .ELSEIF eax==PARSESYNTAXERROR
        mov eax, MessageSyntaxError
        ExitMethod
    .ELSEIF eax==PARSEINTERNALERROR
        mov eax, MessageInternalError
        ExitMethod
    .ELSE
        ;What is this? Is it possible to end up here?
        DbgWarning "Unexpected return value in parsetoken"
        int 3
    .ENDIF
    .until 0

    ;can't end up here but just in case...
    mov eax, MessageInternalError
MethodEnd

;Feed one token to the LALR engine.
;Returns EAX = one of the PARSE* constants.
Method Parser.ParseToken, uses esi edi ebx, Tkn
    LOCAL Acnt:WORD
    LOCAL ErrExpCount
    LOCAL reductiontokens,reduction,ReductionSymbol
    LOCAL buf [256]:BYTE
    ;DbgText "ParseToken"
    SetObject esi
    mov ErrExpCount,0
    ;mov eax,Tkn
    ;DbgHex [eax].Token.ParentSymbol
    ;mov eax,[eax].Token.ParentSymbol
    ;DbgUStr [eax].Symbol.sName

    ;if an Action for Token exists in the Current-LALR-State
    mov edx, [esi].CurrentLALRState
    mov ax,[edx].LALR.wActionCount
    mov Acnt, ax
    .if ax!=0
        mov edi, [edx].LALR.Actions     ; edi is an action
        xor ebx, ebx
        ;NOTE(review): SOURCE TEXT LOST HERE. The action search loop and the start of
        ;the "expected symbols" error report are missing from the file as received.
        ;The damaged line is kept verbatim; restore this span from a pristine copy.
        .while bx")
        pop eax
        push edx
        mov eax, [eax].Symbol.sName
        lea edx,[esi].ErrExpStrList
        mov [edx+edi*4], eax
        invoke GoAscii,addr buf,eax
        pop edx
        DbgPrint edx,,addr buf
        inc ErrExpCount
        .ENDIF
        ;.ENDIF
        inc edi
        add ebx, sizeof Action
        pop edx                         ; restore action count
    .ENDW
    m2m [esi].ErrExpectedCount,ErrExpCount,edx
    mov eax, PARSESYNTAXERROR
    ExitMethod

    ;********************************
    ActionFound:
    mov cx, [edi].Action.wActionType
    .IF cx==ACTIONSHIFT
        ;DbgWarning "SHIFTING LALR STATE"
        ;Set Current-LALR-State = target state of Action.
        mov [esi].CurrentLALRState, $Get_Element(LALR,[esi].LALRs, [edi].Action.wTarget)
        ;Later we will wish to remember which state we were Shifting to
        ;at the moment that a given Token was moved onto the LALR stack.
        ;Set the State property of Token to the Current-LALR-State (tag it)
        mov edx,Tkn
        mov [edx].Token.State,eax
        ;Push Token onto the LALR-Token-Stack.
        OCall esi.PushToken, FORSTACK, Tkn
        ;Set Result = Shift
        mov eax, PARSESHIFT
        ExitMethod

    .ELSEIF cx==ACTIONREDUCE
        ;DbgWarning "REDUCING"
        mov [esi].ReduceRule, $Get_Element(Rule,[esi].Rules, [edi].Action.wTarget)
        movzx edx,[eax].Rule.SymbolIndexCount
        mov edi,eax

        ;PART 1: REDUCE THE RULE
        mov [esi].CurrentReduction.ParentRule, eax
        mov [esi].CurrentReduction.TokenCount, dx

        ;look up reduction rule's nonterminal symbol
        mov edx, [esi].ReduceRule
        mov di,[edx].Rule.Nonterminal
        mov ReductionSymbol, $Get_Element(Symbol,[esi].Symbols, di) ; Nonterminal Symbol on the stack. (real cpu stack :)

        ;Print rule name and #symbols
        invoke GoAscii,addr buf,[eax].Symbol.sName
        movzx edx,[esi].CurrentReduction.TokenCount
        DbgPrint "Reduction Rule %s has %lu symbols", ,addr buf,edx

        ;If the Rule's right-hand side contains any symbols,
        ;we will create a Reduction container, and move that many tokens
        ;from the LALR stack into the new container, and collect the container.
        ;I will be moving elements from a DataCollection into a flat array.

        ;Allocate a new Reduction
        DbgLine "Reduction List"
        mov reduction,$OCall ([esi].Reductions::DataCollection.Insert,$MemAlloc(sizeof Reduction,MEM_INIT_ZERO))
        movzx edx,[esi].CurrentReduction.TokenCount
        DbgPrint "Reduction %lX : %s = %lu tokens","Reduction List",reduction,addr buf,edx
        mov dx,[esi].CurrentReduction.TokenCount
        mov [eax].Reduction.TokenCount,dx
        m2m [eax].Reduction.ParentRule,[esi].CurrentReduction.ParentRule,edx

        ;If the Reduction has some Tokens:
        ; .if [edx].Rule.SymbolIndexCount!=0
        ;Allocate array for Tokens in reduction
        push eax
        mov eax,sizeof Token
        mul [esi].CurrentReduction.TokenCount
        mov reductiontokens,$MemAlloc (eax,MEM_INIT_ZERO)
        pop edx
        mov [edx].Reduction.Tokens,eax

        ;Move the top N tokens from LALR stack into the Reduction's token array
        xor ebx,ebx
        .while bx<[esi].CurrentReduction.TokenCount
            ;Steal a token from the top of the LALR token stack
            mov eax,[esi].LALR_TokenStack.dCount
            dec eax
            OCall [esi].LALR_TokenStack::DataCollection.DeleteAt,eax
            ;Debug its data
            DbgToken eax,"Reduction List"
            ;Copy its data into the nth reduction-array element
            push eax
            mov ax,[esi].CurrentReduction.TokenCount
            dec ax                      ;token is at [tokencount-1-bx], ie, top n tokens
            sub ax,bx
            mov edx,$Get_Element(Token, reductiontokens, ax)   ;edx -> target token
            pop eax
            m2m [edx].Token.ParentSymbol, [eax].Token.ParentSymbol
            m2m [edx].Token.TokenData, [eax].Token.TokenData
            m2m [edx].Token.State, [eax].Token.State
            ;Deallocate the token object, but not its data
            MemFree eax
            inc bx
        .endw
        ;.endif

        ;At this point, the token on top of the LALR stack
        ;contains the LALR state which was active when this token
        ;was moved across from the input tokenstack to the lalr stack.
        ;We should 'revert' the current LALR state to this state.
        MOV [esi].CurrentLALRState,$OCall (esi.GetTopToken, FORSTACK)

        ;Search LALR State for Action matching our Rule's NonTerminal Symbol
        mov eax, [eax].Token.State      ;historical tag from when it was Shifted
        movzx ecx, [eax].LALR.wActionCount
        mov eax, [eax].LALR.Actions     ; eax holds an action
        xor ebx, ebx
        ;NOTE(review): SOURCE TEXT LOST HERE. The GOTO search, the remaining action
        ;types, the end of ParseToken, the whole Parser.PopToken method and the header
        ;comment of Parser.PushToken are missing from the file as received.
        ;The damaged line is kept verbatim; restore this span from a pristine copy.
        .WHILE ebx Token

;Allocate a copy of the token at Tkn and append it to the input stack
;(ForWhat==FORINPUT) or to the LALR stack (any other value).
;The copy's State is set to the current LALR state; the string of a terminal is
;duplicated, the TokenData of a nonterminal is copied as is.
;Returns EAX -> the new token.
;EBX is saved because CloneWideString overwrites it.
Method Parser.PushToken,uses esi ecx ebx, ForWhat:Dword, Tkn:Pointer
    SetObject esi
    MemAlloc sizeof Token,MEM_INIT_ZERO
    .if ForWhat==FORINPUT
        OCall [esi].Input_TokenStack::DataCollection.Insert,eax
        ;DbgDec [esi].Input_TokenStack.dCount,"TOKENS PUSHED on INPUT stack"
    .else
        OCall [esi].LALR_TokenStack::DataCollection.Insert,eax
        ;DbgDec [esi].LALR_TokenStack.dCount,"TOKENS PUSHED on LALR-Token stack"
    .endif
    m2m [eax].Token.State, [esi].CurrentLALRState,edx
    mov edx,Tkn
    m2m [eax].Token.ParentSymbol, [edx].Token.ParentSymbol,ecx

    ;For tokens whose Symbol is Terminal, the TokenData field = LPWSTR
    ;Duplicate string via MemAlloc
    .if [ecx].Symbol.Kind==STERMINAL
        .if [edx].Token.TokenData!=0
            push eax
            DbgUStr [edx].Token.TokenData
            invoke CloneWideString,[edx].Token.TokenData
            ; DbgUStr eax
            pop edx
            mov [edx].Token.TokenData,eax
            mov eax,edx
        .endif
    ;For tokens whose Symbol is NonTerminal,
    ;TokenData = NULL (normal nonterminal token), or
    ;TokenData -> Reduction (reduction nonterminal token)
    ;Either way, just copy it across
    .else
        push [edx].Token.TokenData
        pop [eax].Token.TokenData
    .endif
MethodEnd

;Returns EAX -> DefaultToken with all three fields cleared.
;This is a single static scratch token, not an allocation; ForWhat is not used.
;Clobbers ECX.
Method Parser.CreateToken,uses esi, ForWhat:dword
    SetObject esi
    lea eax, DefaultToken
    xor ecx, ecx
    mov [eax].Token.ParentSymbol, ecx
    mov [eax].Token.TokenData, ecx
    mov [eax].Token.State, ecx
MethodEnd

;Returns EAX = result of ItemAt(dCount-1) on the input stack (ForWhat==FORINPUT)
;or on the LALR stack (any other value). The stack is not modified.
Method Parser.GetTopToken,uses esi, ForWhat:Dword
    SetObject esi
    .IF ForWhat==FORINPUT
        ;DbgText "getting top INPUT token"
        mov eax,[esi].Input_TokenStack.dCount
        dec eax
        OCall [esi].Input_TokenStack::DataCollection.ItemAt,eax
    .ELSE
        ;DbgText "getting top LALR token"
        mov eax,[esi].LALR_TokenStack.dCount
        dec eax
        OCall [esi].LALR_TokenStack::DataCollection.ItemAt,eax
    .ENDIF
MethodEnd

;Advance StreamCursor to the next CR byte (13) or to StreamEnd, whichever comes first.
;Clobbers EDX.
Method Parser.DiscardRestOfLine,uses esi
    SetObject esi
    mov eax, [esi].StreamCursor
LookForCR:
    cmp eax, [esi].StreamEnd
    jae DoneLooking                     ;addresses are unsigned: a signed jge breaks above 2 GB
    mov dl, [eax]
    inc eax
    cmp dl, 13
    jne LookForCR
    dec eax                             ;very important. We should leave the CR in the stream
                                        ;untouched so that it is treated as it is.
DoneLooking:
    mov [esi].StreamCursor, eax
MethodEnd

;Search for Symbol by Name
;Returns EAX -> Symbol, EDX = SymbolID
;     OR EAX = NULL,    EDX = -1
Method Parser.GetSymbolByName,uses esi ebx, pszSymbolName:LPWSTR
    SetObject esi
    xor ebx,ebx
    .while bx<[esi].SymbolTableCount
        DbgDec ebx,"index so far"
        Get_Element Symbol, [esi].Symbols, bx
        push eax
        invoke WideCompare,[eax].Symbol.sName,pszSymbolName
        pop edx
        .if eax==0
            DbgUStr [edx].Symbol.sName,"FOUND IT"
            mov eax,edx
            mov edx,ebx
            ExitMethod
        .endif
        inc ebx
    .endw
    xor eax,eax
    mov edx,-1
MethodEnd

;Release the Reduction at pReductionNode and everything below it:
;the strings of its terminal tokens, recursively the Reductions referenced by its
;nonterminal tokens, then the token array and finally the node itself.
;A NULL node and NULL TokenData are ignored.
;EDI and EBX hold the node and the loop counter, so they must survive the
;recursive call; both are therefore listed in 'uses'.
Method Parser.DestroyParseTree,uses esi edi ebx, pReductionNode
    SetObject esi
    mov edi,pReductionNode
    .if edi!=NULL
        xor ebx,ebx
        .while bx<[edi].Reduction.TokenCount
            Get_Element Token, [edi].Reduction.Tokens,bx
            mov edx,[eax].Token.ParentSymbol
            .if [eax].Token.TokenData!=0
                .if [edx].Symbol.Kind==STERMINAL
                    MemFree [eax].Token.TokenData               ;terminal: free its string
                .else
                    OCall esi.DestroyParseTree,[eax].Token.TokenData  ;nonterminal: free the sub tree
                .endif
            .endif
            inc bx
        .endw
        .if [edi].Reduction.Tokens!=0
            MemFree [edi].Reduction.Tokens
        .endif
        MemFree edi
    .endif
MethodEnd