Code archives/Algorithms/Lexical scanner
This code has been declared by its author to be Public Domain code.
Use the Scanner class to tokenize strings and, optionally, process the token information to nest the tokens according to constructs such as parentheses and braces. A Python port is available here: https://github.com/pineapplemachine/lexscan
' --+-----------------------------------------------------------------------------------------+--
' | This code was originally written by Sophie Kirschner (meapineapple@gmail.com) and it is |
' | released as public domain. Please do not interpret that as liberty to claim credit that |
' | is not yours, or to sell this code when it could otherwise be obtained for free because |
' | that would be a really shitty thing of you to do. |
' --+-----------------------------------------------------------------------------------------+--
' Updated 13 Feb 2015:
' ScannerToken now has a match attribute which contains the TRegExMatch object from the match
' which caused the token to be created.
SuperStrict
Import brl.stream
Import brl.retro
Import brl.linkedlist
Import bah.regex ' Available here: https://code.google.com/p/maxmods/wiki/RegExModule
' Example code
Rem
Import brl.standardio
' Make a new scanner object, that's the thing that does all the work.
Local s:Scanner=New Scanner
' Add some token categories for the Tokenize() method to use the regular expressions to extract tokens from text.
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExBMaxCommentLine),False))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExBMaxCommentBlock),False))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExStringLiteral)))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExIdentifier)))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExNumberUnsigned)))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExParenthese)))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExDelimiter)))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExOperator)))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExWhitespace),False))
' Add some branches, they define open and close tokens for the Parse() method to use.
' I'm excluding if/endif and repeat/forever/until because they're context-sensitive and it would be a lot of work
' getting them friendly with the Scanner's very generic parsing algorithm.
s.AddBranch(ScannerBranch.Create(["("],[")"]))
s.AddBranch(ScannerBranch.Create(["["],["]"]))
s.AddBranch(ScannerBranch.Create(["{"],["}"]))
s.AddBranch(ScannerBranch.Create(["while"],["wend","endwhile"]))
s.AddBranch(ScannerBranch.Create(["for"],["next"]))
s.AddBranch(ScannerBranch.Create(["type"],["endtype"]))
s.AddBranch(ScannerBranch.Create(["method"],["endmethod"]))
s.AddBranch(ScannerBranch.Create(["function"],["endfunction"]))
' The tokenizer tokenizes itself!
Local tokenizerpath$="scanner.bmx"
Local stream:TStream=ReadFile(tokenizerpath)
' Iterate through the list of tokens returned by the ProcessStream() method and print them and their children to the console.
For Local token:ScannerToken=EachIn s.ProcessStream(stream)
StandardIOStream.WriteString token.ToBranchedString()
Next
' All done!
CloseFile stream
EndRem
' Categories help the Scanner decide how to tokenize the input text. See its Tokenize() method for more info.
Type ScannerCategory
' Some common expressions (don't kill me if I got a ton of these wrong)
Const RegExNumberSigned$="[+-]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?" ' e.g. "25", "3.14", "-.66", "50.32e+10"
Const RegExNumberUnsigned$="[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?" ' same as above but doesn't grab preceding +/- characters
Const RegExStringLiteral$="~q.*?~q" ' e.g. "This is a string literal whoo"
Const RegExMultilineStringLiteral$="(?s)~q.*?~q" ' e.g. "This is a ~r~n multiline string literal ~r~n whoo"
Const RegExCommentLine$="//[^~r~n]*" ' e.g. "// this is a comment which ends at a newline ~r~n"
Const RegExCommentBlock$="(?s)/\*.*?(\*/)" ' e.g. /* this is a comment block thing */
Const RegExBMaxCommentLine$="'[^~r~n]*" ' BlitzMax line comment, e.g. ' this is a comment
Const RegExBMaxCommentBlock$="(?s)rem.*?(endrem)" ' BlitzMax block comment (Rem/EndRem)
Const RegExIdentifier$="[_a-zA-Z]+[\w]*" ' e.g. "abcde", "b4t_m4n", but not "3UPERMAN"
Const RegExIdentifierIntl$="[_\pL]+[\w\pL]*" ' same as above but also allows non-English letters
Const RegExParenthese$="[\[\]\(\)\{\}]" ' e.g. "(", "]", "{"
Const RegExDelimiter$="[;,]" ' i.e. ";", ","
Const RegExOperator$="[!@#$%^&*-+=:'.<>/?|`\\~~]+" ' e.g. "%", ">>>", "+="
Const RegExWhitespace$="\s+" ' matches any whitespace characters
' The regular expression for what constitutes a token of this category
Field expression:SRegEx
' Tokens of this category are only added to the list that Scanner.Tokenize() returns if they're significant
Field significant%=True
' Function to create a new ScannerCategory object
Function Create:ScannerCategory(expression:SRegEx,significant:Int=True)
Local cat:ScannerCategory=New ScannerCategory
cat.expression=expression
cat.significant=significant
Return cat
EndFunction
EndType
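' Categories aren't limited to the predefined patterns above; any PCRE string
' works. A minimal sketch (the hexadecimal pattern here is hypothetical, not
' one of the archive's constants):
Rem
' significant category: hexadecimal literals such as "$FF" or "$c0ffee"
Local hexcat:ScannerCategory=ScannerCategory.Create(SRegEx.Create("\$[0-9a-fA-F]+"))
' insignificant category: whitespace is matched but excluded from the token list
Local wscat:ScannerCategory=ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExWhitespace),False)
EndRem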
' Branches help the Scanner decide how to organize tokens into a tree. See its Parse() method for more info.
Type ScannerBranch
' An array of strings that open this kind of branch or close it, respectively. For example: ["("] and [")"], ["while"] and ["wend","endwhile"]
Field open$[],close$[]
' Are the tokens case-sensitive?
Field casesensitive%=False
' Create a new ScannerBranch object
Function Create:ScannerBranch(open$[],close$[],casesensitive%=False)
Local n:ScannerBranch=New ScannerBranch
n.open=open;n.close=close;n.casesensitive=casesensitive
Return n
EndFunction
' These are used by the Scanner's Parse() method
Method CheckOpen%(str$)
Return CheckArray(open,str)
EndMethod
Method CheckClose%(str$)
Return CheckArray(close,str)
EndMethod
Method CheckArray%(array$[],str$)
If casesensitive
For Local i%=0 Until array.length
If array[i]=str Return i
Next
Else
Local lstr$=Lower(str)
For Local i%=0 Until array.length
If Lower(array[i])=lstr Return i
Next
EndIf
Return -1
EndMethod
EndType
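' A quick sketch of the case-insensitive matching these methods provide;
' the return value is the index into the open/close array, or -1:
Rem
Local b:ScannerBranch=ScannerBranch.Create(["while"],["wend","endwhile"])
Print b.CheckOpen("WHILE") ' 0, matches open[0] despite the casing
Print b.CheckClose("EndWhile") ' 1, matches close[1]
Print b.CheckClose("loop") ' -1, no match
EndRem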
' The bread and butter of what a Scanner returns.
Type ScannerToken
' This is the string of the token, pretty simple
Field text$
' This is the regex match that this token was grabbed from
Field match:TRegExMatch
' Also simple, pointer to the category that this token belongs to
Field category:ScannerCategory
' Byte and line position in the string/stream/whatever this token was found in
Field pos_byte%,pos_line%
' Parent and children ScannerTokens for keeping track of the tree structure built by a Scanner's Parse() method
Field parent:ScannerToken,children:TList
' Following token
Field succ:ScannerToken
' Creates a new ScannerToken
Function Create:ScannerToken(text$,cat:ScannerCategory=Null,match:TRegExMatch=Null,pos_byte%=-1,pos_line%=-1)
Local n:ScannerToken=New ScannerToken
n.text=text;n.category=cat;n.match=match
n.pos_byte=pos_byte;n.pos_line=pos_line
Return n
EndFunction
' Various methods for representing the token as a string
Method ToString$()
Return "token ~q"+text+"~q on "+PositionToString()
EndMethod
Method PositionToString$()
Local str$
If pos_line>=0
str:+"line "+pos_line
If pos_byte>=0 str:+" "
EndIf
If pos_byte>=0 str:+"byte offset "+Hex(pos_byte)
Return str
EndMethod
Method ToBranchedString$(linepadding%=3,prefix$="") ' This one is recursive, neat
Local str$=padbefore(pos_line,"0",linepadding)+" "+prefix+text+"~n"
Local subprefix$=prefix+" "
If children
For Local token:ScannerToken=EachIn children
str:+token.ToBranchedString(linepadding,subprefix)
Next
EndIf
Return str
EndMethod
Function padbefore$(s$,char$,length%)
While Len(s)<length
s=char+s
Wend
Return s
EndFunction
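' e.g. padbefore("7","0",3) returns "007"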
EndType
' Lexical scanner object, good for tokenizing string data and also organizing those tokens into a tree
Type Scanner
' Contains ScannerCategory objects, which tell Tokenize() how to divide text into tokens
Field Categories:TList=CreateList()
' Contains ScannerBranch objects, which tell Parse() how to organize tokens into tree structures
Field Branches:TList=CreateList()
Rem
/ Summary /
The Tokenize() method uses ScannerCategory objects to separate tokens; you
can add them using this method. See said method's preceding comments for a
more thorough explanation.
/ Arguments /
cat: The ScannerCategory to add
EndRem
Method AddCategory:ScannerCategory(cat:ScannerCategory)
Categories.addlast cat
Return cat
EndMethod
Rem
/ Summary /
The Parse() method uses ScannerBranch objects to determine the tree
structure; you can add them using this method. See said method's preceding
comments for a more thorough explanation.
/ Arguments /
branch: The ScannerBranch to add
EndRem
Method AddBranch:ScannerBranch(branch:ScannerBranch)
Branches.addlast branch
Return branch
EndMethod
Rem
/ Summary /
Runs the input string through both the Tokenize() and Parse() methods and
returns the result.
/ Arguments /
data: The input string
start: The initial position
EndRem
Method Process:TList(data$,start%=0)
Return Parse(Tokenize(data,start))
EndMethod
Rem
/ Summary /
Same as Process() but accepts a stream as input.
/ Arguments /
data: The input stream
EndRem
Method ProcessStream:TList(data:TStream)
Return Parse(TokenizeStream(data))
EndMethod
Rem
/ Summary /
This method tokenizes an input string using the regular expressions added
to the Scanner object using the AddCategory() method. It's done like this:
Starting at the beginning of the string, the tokenizer checks every
category for a match starting at the current position in the string. The
token is considered to be a member of the category finding the lengthiest
match, or the oldest matching category in case of ties. (i.e. the category
added first beats out the category added next.) In a case where no token
can be recognized, tokens are added to the list one character at a time
with null assigned to their category field.
The list returned is a linear sequence of ScannerToken objects recording
the text, category, and position in file by byte and by line for one token
each.
/ Arguments /
data: The input string
start: The initial position
EndRem
Method Tokenize:TList(data$,start%=0)
Local tokens:TList=CreateList()
Local pos%=start,line%=1
If start>=data.length Return tokens ' guard: empty input would otherwise index out of bounds below
Repeat
Local bestcat:ScannerCategory=Null,bestmatch:TRegExMatch,beststr$,bestlength%=0
For Local cat:ScannerCategory=EachIn Categories
If cat.expression
Local match:TRegExMatch=cat.expression.find(data,pos)
If match And match.SubStart()=pos
Local str$=match.SubExp()
If str.length>bestlength
bestcat=cat
bestmatch=match
beststr=str
bestlength=str.length
EndIf
EndIf
EndIf
Next
Local token:ScannerToken=Null
If bestcat And bestlength ' token has been neatly categorized
If bestcat.significant token=ScannerToken.Create(beststr,bestcat,bestmatch,pos,line)
pos:+bestlength
line:+CountNewlines(beststr)
Else ' doesn't fit any category? make it its own token.
Local c$=Chr(data[pos]) ' consume a single character
token=ScannerToken.Create(c,Null,Null,pos,line)
line:+CountNewlines(c)
pos:+1
EndIf
If token
Local lasttoken:ScannerToken=ScannerToken(tokens.last())
If lasttoken Then lasttoken.succ=token
tokens.addlast token
EndIf
If pos>=data.length Exit
Forever
Return tokens
EndMethod
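' A sketch of the matching rules described above (categories and input are
' illustrative). Given "' comment~nx+1", the line-comment category claims the
' whole first line even though "'" alone would also satisfy RegExOperator,
' because the longer match wins; on a tie, the category added first wins.
Rem
Local s:Scanner=New Scanner
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExBMaxCommentLine),False))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExIdentifier)))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExNumberUnsigned)))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExOperator)))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExWhitespace),False))
For Local t:ScannerToken=EachIn s.Tokenize("' comment~nx+1")
Print t.ToString() ' "x" on line 2, then "+", then "1"
Next
EndRem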
Rem
/ Summary /
Same as Tokenize() but accepts a stream as input.
/ Arguments /
data: The input stream
EndRem
Method TokenizeStream:TList(data:TStream)
Return Tokenize(data.ReadString(data.size()))
EndMethod
Rem
/ Summary /
This method takes a sequence of tokens as input, such as the list returned
by the Tokenize() method, and reorganizes them into a tree structure
wherein each ScannerToken can have another token as its parent and any
number of tokens as children.
The Scanner's list of ScannerBranch objects, added using AddBranch(),
decide the structure of the output. A branch's open tokens cause another
level to be added to the tree while close tokens cause the parser to back
up a level. For example, the sequence of tokens "abc" "(" "123" ")" "xyz",
when "(" opens a branch and ")" closes it, will become "abc" "(" "xyz" -
the tokens following "(" and ending with ")" will become children of the
"(" token. This is a recursive process and nesting is theoretically
unbounded.
Generally, you would use this method before calling a more implementation-
specific parser so that it wouldn't need to organize the tokens into a tree
for itself.
/ Arguments /
tokens: A list of sequential ScannerToken objects, such as is returned by
the Tokenize() method
EndRem
Method Parse:TList(tokens:TList)
If Not tokens Return Null
Local list:TList=CreateList()
Local entry:ScannerParseStackEntry=ScannerParseStackEntry.Create(Null,list,Null,Null)
For Local token:ScannerToken=EachIn tokens
entry.list.addlast token
token.parent=entry.root
Local branchopen:ScannerBranch=CheckBranchOpen(token)
Local branchclose:ScannerBranch=CheckBranchClose(token)
If branchopen And branchclose ' edge case of you being dumb enough to make the same token both an open and close branch
' if the parser ends up in here HOPEFULLY it's just because there's a branch that can start and end with the same token.
' because otherwise the parser just makes some assumptions and runs with them.
' in short, if branchopen!=branchclose here then you've probably done something stupid defining the way the code branches
If branchclose=entry.branchtype ' just ran into one of these; assume it's being closed
entry=entry.parent
Else ' haven't run into one of these in the same tree level; assume it's being opened
If Not token.children token.children=CreateList()
entry=ScannerParseStackEntry.Create(token,token.children,branchopen,entry)
EndIf
ElseIf branchopen ' a branch-open symbol was encountered
If Not token.children token.children=CreateList()
entry=ScannerParseStackEntry.Create(token,token.children,branchopen,entry)
ElseIf branchclose ' a branch-close symbol was encountered
If branchclose=entry.branchtype ' proper closing token, just the kind that's expected
entry=entry.parent
Else ' ran into some shit like { ) }
'Print entry.root.tobranchedstring()
Throw ScannerException.Create(ScannerException.ExceptionBadBranchTermination,"Unexpected branch termination",token)
EndIf
EndIf
Next
If entry.parent Throw ScannerException.Create(ScannerException.ExceptionEOFNoBranchTermination,"EOF without expected branch termination",entry.root)
Return list
EndMethod
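' A sketch of the tree Parse() builds (categories, branch, and input are
' illustrative): the top level holds "abc" "(" "z", while "x" "y" ")" end
' up as children of the "(" token.
Rem
Local s:Scanner=New Scanner
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExIdentifier)))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExParenthese)))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExWhitespace),False))
s.AddBranch(ScannerBranch.Create(["("],[")"]))
For Local t:ScannerToken=EachIn s.Process("abc (x y) z")
Print t.ToBranchedString()
Next
EndRem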
' This is used by the Tokenize() method to keep track of line numbers
Function CountNewlines%(str$)
Local i%=0,lines%=0
While i<str.length
If str[i]=Asc("~n")
lines:+1
ElseIf str[i]=Asc("~r")
If i+1<str.length And str[i+1]=Asc("~n") i:+1
lines:+1
EndIf
i:+1
Wend
Return lines
EndFunction
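' e.g. CountNewlines("a~r~nb~nc") returns 2 - a ~r~n pair counts as one line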
' These are used by the Parse() method to cleanly determine whether a given token opens/closes a branch
Method CheckBranchOpen:ScannerBranch(token:ScannerToken)
For Local branch:ScannerBranch=EachIn Branches
If branch.CheckOpen(token.text)>=0 Return branch
Next
Return Null
EndMethod
Method CheckBranchClose:ScannerBranch(token:ScannerToken)
For Local branch:ScannerBranch=EachIn Branches
If branch.CheckClose(token.text)>=0 Return branch
Next
Return Null
EndMethod
EndType
' This class is used in a Scanner's Parse() method
Type ScannerParseStackEntry
Field root:ScannerToken,list:TList,parent:ScannerParseStackEntry
Field branchtype:ScannerBranch
Function Create:ScannerParseStackEntry(root:ScannerToken,list:TList,branchtype:ScannerBranch,parent:ScannerParseStackEntry)
Local n:ScannerParseStackEntry=New ScannerParseStackEntry
n.list=list;n.root=root;n.branchtype=branchtype;n.parent=parent
Return n
EndFunction
EndType
' Exception object to be thrown by the parser. The parser uses exceptions as a way to allow implementation-agnostic error handling.
Type ScannerException Extends TBlitzException
' Constants identify the different possible sorts of exceptions
Const ExceptionBadBranchTermination%=1
Const ExceptionEOFNoBranchTermination%=2
' Exception data: a message, an error number, and the token it happened at
Field info$,num%,token:ScannerToken
' Create a new exception
Function Create:ScannerException(num%,info$,token:ScannerToken)
Local n:ScannerException=New ScannerException
n.num=num;n.info=info;n.token=token
Return n
EndFunction
' Make it a string
Method ToString$()
Local str$="Error "+num+": "+info
If token str:+"; encountered at "+token.ToString()
Return str
EndMethod
EndType
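' A sketch of implementation-agnostic error handling with these exceptions
' (categories, branches, and input are illustrative): here the "[" branch is
' terminated by ")", which triggers ExceptionBadBranchTermination.
Rem
Local s:Scanner=New Scanner
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExParenthese)))
s.AddCategory(ScannerCategory.Create(SRegEx.Create(ScannerCategory.RegExWhitespace),False))
s.AddBranch(ScannerBranch.Create(["("],[")"]))
s.AddBranch(ScannerBranch.Create(["["],["]"]))
Try
s.Process("( [ )") ' the open "[" branch is closed by ")" - an error
Catch e:ScannerException
Print e.ToString() ' "Error 1: Unexpected branch termination; encountered at ..."
EndTry
EndRem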
' Extension of TRegEx that avoids needlessly repeating identical find operations
Type SRegEx Extends TRegEx
' If the arguments of a find call are identical to those of the previous call, find returns the cached result
Field lastoption:TRegExOptions=Null,lastpattern$,lasttarget$,lastpos%=-1,lastmatch:TRegExMatch
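' The cache pays off inside Scanner.Tokenize(): that loop probes every category
' at every token boundary, so a category whose next match lies far ahead would
' otherwise be rescanned at each intermediate position.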
' Make a new SRegEx object
Function Create:SRegEx(searchPattern$,options:TRegExOptions=Null)
Local this:SRegEx=New SRegEx
this.searchPattern=searchPattern
If options this.options=options
Return this
EndFunction
' Do the finding
Method find:TRegExMatch(target$=Null,start%=-1)
If compiled And options=lastoption And lastpos>=0 And start<=lastpos And ..
searchpattern=lastpattern And target=lasttarget
Return lastmatch
Else
lastoption=options
lastpattern=searchpattern
lasttarget=target
lastmatch=Super.find(target,start)
If lastmatch lastpos=lastmatch.SubStart() Else lastpos=target.length
Return lastmatch
EndIf
EndMethod
EndType