You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

290 lines
10 KiB

import re
# Count the space characters that lead a line
def spaces(ln):
    """Return how many ' ' characters `ln` starts with.

    Only the space character is counted; a leading tab ends the count.
    """
    count = 0
    for ch in ln:
        if ch != ' ':
            break
        count += 1
    return count
# Entry point: parse a complete document
def parse(document):
    """Split `document` on newline characters and parse the resulting lines.

    Returns whatever `parse_lines` returns for that list of lines.
    """
    lines = document.split('\n')
    return parse_lines(lines)
# True for lines that carry no statement: empty, whitespace-only or
# comment-only lines
def _is_blank_line(ln):
    stripped = ln.strip()
    return len(stripped) == 0 or stripped.startswith('//')

# Walk an if / else-if chain and return the innermost 'if' node that has no
# else branch yet, or None if there is no such node
def _open_if_node(node):
    while isinstance(node, list) and len(node) == 4 and node[0] == 'if':
        node = node[3]
    if isinstance(node, list) and len(node) == 3 and node[0] == 'if':
        return node
    return None

# Parse the statement-level structure, including if and while statements
def parse_lines(lns):
    """Parse a list of source lines into a statement-level syntax tree.

    Every statement must start at column 0; the indented lines that follow
    it form its child block, which is dedented and parsed recursively.
    Blank and comment-only lines are ignored wherever they appear, so they
    neither end a child block nor influence its indentation.

    Returns the single parsed statement if there is exactly one, otherwise
    ['seq', stmt1, stmt2, ...].

    Raises Exception when a statement is indented unexpectedly, when an
    if/else/while lacks a child block, when another statement has one, or
    when an else / else if does not follow an if that can still take it.
    Line numbers in messages are 0-based indexes into `lns`.
    """
    o = []
    i = 0
    while i < len(lns):
        main = lns[i]
        # Skip empty and comment-only lines
        if _is_blank_line(main):
            i += 1
            continue
        if spaces(main) > 0:
            raise Exception("Line "+str(i)+" indented too much!")
        # Index of the statement itself, used in error messages
        line_no = i
        # Grab the child block of an if statement. The block ends after the
        # last indented line that precedes the next statement at column 0;
        # blank lines in between do not terminate it.
        start_child_block = i + 1
        end_child_block = start_child_block
        indent = None
        i += 1
        while i < len(lns):
            if not _is_blank_line(lns[i]):
                sp = spaces(lns[i])
                if sp == 0:
                    break
                indent = sp if indent is None else min(sp, indent)
                end_child_block = i + 1
            i += 1
        # A real list (not a lazy map object) so that len() works on both
        # Python 2 and Python 3. Blank lines are normalised to '' so that a
        # short whitespace/comment line cannot look over-indented later.
        child_block = ['' if _is_blank_line(x) else x[indent:]
                       for x in lns[start_child_block:end_child_block]]
        # Calls parse_line to parse the individual line
        out = parse_line(main)
        # Include the child block into the parsed expression
        if out[0] in ['if', 'else', 'while', 'else if']:
            if len(child_block) == 0:
                raise Exception("If/else/while statement must have sub-clause! (%d)" % line_no)
            else:
                out.append(parse_lines(child_block))
        else:
            if len(child_block) > 0:
                raise Exception("Not an if/else/while statement, can't have sub-clause! (%d)" % line_no)
        # This is somewhat complicated. Essentially, it converts something like
        # "if c1 then s1 elif c2 then s2 elif c3 then s3 else s4" (with appropriate
        # indenting) to [ if c1 s1 [ if c2 s2 [ if c3 s3 s4 ] ] ]
        if out[0] == 'else if':
            if len(o) == 0:
                raise Exception("Cannot start with else if! (%d)" % line_no)
            u = _open_if_node(o[-1])
            if u is None:
                raise Exception("Else if without matching if! (%d)" % line_no)
            u.append(['if'] + out[1:])
        elif out[0] == 'else':
            if len(o) == 0:
                raise Exception("Cannot start with else! (%d)" % line_no)
            u = _open_if_node(o[-1])
            if u is None:
                raise Exception("Else without matching if! (%d)" % line_no)
            u.append(out[1])
        else:
            # Normal case: just add the parsed line to the output
            o.append(out)
    return o[0] if len(o) == 1 else ['seq'] + o
# Tokens contain one or more chars of the same type, with a few exceptions
def chartype(c):
    """Classify a character for the tokenizer.

    Returns 'alphanum' for letters, digits and '.', 'space' for a tab or
    space, 'brack' for any of ( ) [ ], 'dquote' / 'squote' for the two
    quote characters, and 'symb' for everything else.
    """
    classes = (
        ('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.', 'alphanum'),
        ('\t ', 'space'),
        ('()[]', 'brack'),
        ('"', 'dquote'),
        ("'", 'squote'),
    )
    for members, label in classes:
        if c in members:
            return label
    return 'symb'
# Converts something like "b[4] = x+2 > y*-3" to
# [ 'b', '[', '4', ']', '=', 'x', '+', '2', '>', 'y', '*', '-', '3' ]
def tokenize(ln):
    """Split one source line into a list of token strings.

    Runs of characters with the same chartype() form one token; brackets
    are always single-character tokens and a symbol run ending in '-' is
    split so the trailing minus becomes its own token. String literals
    (single or double quoted) are kept as one token including their quotes,
    with backslash escapes (hex byte, newline, literal next character)
    already resolved. A '//' outside a string literal starts a comment that
    runs to the end of the line. A trailing ':' token is dropped.

    Returns [] for an empty or comment-only line.
    Raises Exception for an unclosed string or a malformed hex escape.
    """
    tp = 'space'
    i = 0
    o = []
    # Characters of the token being built. A local list (mutated in place by
    # the closure) replaces module-level global state and works on both
    # Python 2 and Python 3.
    buf = []
    # Finish a token and start a new one
    def nxt():
        cur = ''.join(buf)
        del buf[:]
        if len(cur) >= 2 and cur[-1] == '-':
            o.extend([cur[:-1], '-'])
        elif len(cur.strip()) >= 1:
            o.append(cur)
    # Main loop
    while i < len(ln):
        c = chartype(ln[i])
        # Inside a string
        if tp == 'squote' or tp == 'dquote':
            if c == tp:
                buf.append(ln[i])
                nxt()
                i += 1
                tp = 'space'
            elif ln[i:i+2] == '\\x':
                hexdigits = ln[i+2:i+4]
                if len(hexdigits) != 2 or any(h not in '0123456789abcdefABCDEF' for h in hexdigits):
                    raise Exception("Invalid hex escape: "+ln)
                # chr(int(.., 16)) replaces the Python-2-only 'hex' codec
                buf.append(chr(int(hexdigits, 16)))
                i += 4
            elif ln[i:i+2] == '\\n':
                buf.append('\x0a')
                i += 2
            elif ln[i] == '\\':
                # A backslash as the very last character cannot escape anything
                if i + 1 >= len(ln):
                    raise Exception("Unclosed string: "+ln)
                buf.append(ln[i+1])
                i += 2
            else:
                buf.append(ln[i])
                i += 1
        # Not inside a string
        else:
            # Comments: only recognised here, so '//' inside a string
            # literal is ordinary text
            if ln[i:i+2] == '//':
                break
            if c == 'brack' or tp == 'brack': nxt()
            elif c == 'space': nxt()
            elif c != 'space' and tp == 'space': nxt()
            elif c == 'symb' and tp != 'symb': nxt()
            elif c == 'alphanum' and tp == 'symb': nxt()
            elif c == 'squote' or c == 'dquote': nxt()
            buf.append(ln[i])
            tp = c
            i += 1
    nxt()
    if tp in ['squote', 'dquote']:
        raise Exception("Unclosed string: "+ln)
    # Guard against an empty token list before looking at the last token
    if o and o[-1] in [':', ':\n', '\n']:
        o.pop()
    return o
# This is the part where we turn a token list into an abstract syntax tree
#
# Operator precedence: a LOWER number binds MORE tightly. All multiplicative
# operators share one level so that, together with left-to-right reduction
# in shunting_yard, "a * b / c" and "a - b / c" group conventionally.
precedence = {
    '^': 1,
    '*': 2,
    '/': 2,
    '%': 2,
    '#/': 2,
    '#%': 2,
    '+': 3,
    '-': 3,
    '<': 4,
    '<=': 4,
    '>': 4,
    '>=': 4,
    '==': 5,
    'and': 6,
    '&&': 6,
    'or': 7,
    '||': 7,
    '!': 0
}

# Binary operators that group right-to-left ("a ^ b ^ c" is "a ^ (b ^ c)")
_RIGHT_ASSOCIATIVE = ('^',)

# Stack marker for a prefix minus. It is reduced to ['-', '0', x] like a
# binary minus, but binds as tightly as '^' so that "y * -3" negates only
# the 3. Compared by identity, so it can never clash with a real token.
_NEGATE = ('-',)

def toktype(token):
    """Classify a token (or an already-built subtree) for shunting_yard.

    Returns None for None, 'left_paren' / 'right_paren' for brackets,
    'comma', 'colon', 'unary_operation' for '!', 'compound' for anything
    that is not a str, 'binary_operation' for keys of `precedence`, and
    'alphanum' for names, numbers and quoted string literals.
    Raises Exception for any other string.
    """
    if token is None: return None
    elif token in ['(','[']: return 'left_paren'
    elif token in [')',']']: return 'right_paren'
    elif token == ',': return 'comma'
    elif token == ':': return 'colon'
    elif token in ['!']: return 'unary_operation'
    elif not isinstance(token,str): return 'compound'
    elif token in precedence: return 'binary_operation'
    # Upper-case letters are accepted because the tokenizer's chartype()
    # also classifies them as alphanumeric
    elif re.match(r'^[0-9a-zA-Z\-\.]*$',token): return 'alphanum'
    elif token[0] in ['"',"'"] and token[0] == token[-1]: return 'alphanum'
    else: raise Exception("Invalid token: "+token)

# Precedence of an operator sitting on the stack, or None for a bracket
def _stack_precedence(entry):
    if entry is _NEGATE:
        return precedence['^']
    typ = toktype(entry)
    if typ == 'binary_operation' or typ == 'unary_operation':
        return precedence[entry]
    return None

# https://en.wikipedia.org/wiki/Shunting-yard_algorithm
#
# The algorithm works by maintaining three stacks: iq, stack, oq. Initially,
# the tokens are placed in order on the iq. Then, one by one, the tokens are
# processed. Values are moved immediately to the output queue. Operators are
# pushed onto the stack, but before a binary operator is pushed, every
# operator on the stack that binds at least as tightly is applied first
# (strictly more tightly for right-associative operators).
# For example:
# iq = 2 + 3 * 5 + 7, stack = \, oq = \
# iq = + 3 * 5 + 7, stack = \, oq = 2
# iq = 3 * 5 + 7, stack = +, oq = 2
# iq = * 5 + 7, stack = +, oq = 2 3
# iq = 5 + 7, stack = + *, oq = 2 3 (* binds tighter than +, so + waits)
# iq = + 7, stack = + *, oq = 2 3 5
# iq = 7, stack = +, oq = [+ 2 [* 3 5]] (the new + applies * and the old +)
# iq = \, stack = +, oq = [+ 2 [* 3 5]] 7
# iq = \, stack = \, oq = [+ [+ 2 [* 3 5]] 7]
#
# Functions, where function arguments begin with a left bracket preceded by
# the function name, are separated by commas, and end with a right bracket,
# are also included in this algorithm, though in a different way
def shunting_yard(tokens):
    """Convert a flat token list into a nested-list syntax tree.

    Binary operations become [op, left, right], '!' becomes ['!', x], a
    prefix minus becomes ['-', '0', x], calls become [name, arg, ...],
    indexing becomes ['access', target, ...] and bracket literals become
    ['array_lit', ...]. `tokens` is not modified.

    Raises Exception on invalid tokens, missing operands, unmatched or
    empty brackets, or when the tokens do not reduce to one expression.
    """
    oq = []
    stack = []
    prev,tok = None,None
    # The normal Shunting-Yard algorithm simply converts expressions into
    # reverse polish notation. Here, we try to be slightly more ambitious
    # and build up the AST directly on the output queue
    # eg. say oq = [ 2, 5, 3 ] and we add "+" then "*"
    # we get first [ 2, [ +, 5, 3 ] ] then [ [ *, 2, [ +, 5, 3 ] ] ]
    def popstack(stack,oq):
        tok = stack.pop()
        if tok is _NEGATE:
            op, typ = '-', 'binary_operation'
        else:
            op, typ = tok, toktype(tok)
        if typ == 'binary_operation':
            if len(oq) < 2:
                raise Exception("Missing operand for "+op)
            a,b = oq.pop(), oq.pop()
            oq.append([ op, b, a])
        elif typ == 'unary_operation':
            if len(oq) < 1:
                raise Exception("Missing operand for "+op)
            a = oq.pop()
            oq.append([ op, a ])
        elif typ == 'right_paren':
            # Collect everything back to the matching bracket marker on oq
            args = []
            while len(oq) and toktype(oq[-1]) != 'left_paren':
                args.append(oq.pop())
            if len(oq) == 0 or len(args) == 0:
                raise Exception("Unmatched closing bracket: "+tok)
            args.reverse()
            oq.pop()
            if tok == ']' and args[0] != 'id':
                oq.append(['access'] + args)
            elif tok == ']':
                oq.append(['array_lit'] + args[1:])
            elif tok == ')' and args[0] != 'id':
                oq.append(args)
            elif len(args) < 2:
                raise Exception("Empty parentheses")
            else:
                # Plain grouping parentheses: keep the enclosed expression
                oq.append(args[1])
    # The main loop
    for nexttok in tokens:
        prev = tok
        tok = nexttok
        typ = toktype(tok)
        if typ == 'alphanum':
            oq.append(tok)
        elif typ == 'left_paren':
            # Handle cases like 3 * (2 + 5) by using 'id' as a default function
            # name
            if toktype(prev) != 'alphanum' and toktype(prev) != 'right_paren':
                oq.append('id')
            # Say the statement is "... f(45...". At the start, we would have f
            # as the last item on the oq. We take it off, put the leftparen on
            # the oq, and put f back, so we have ( f as the last two items on
            # the oq. We also put the leftparen on the stack so we have a
            # separator on both the stack and the oq
            name = oq.pop()
            oq.append(tok)
            oq.append(name)
            stack.append(tok)
        elif typ == 'right_paren':
            # eg. f(27, 3 * 5 + 4). First, we finish evaluating all the
            # arithmetic inside the last argument. Then, we run popstack
            # to coalesce all of the function arguments sitting on the
            # oq into a single list
            while len(stack) and toktype(stack[-1]) != 'left_paren':
                popstack(stack,oq)
            if len(stack):
                stack.pop()
            stack.append(tok)
            popstack(stack,oq)
        elif typ == 'unary_operation' or typ == 'binary_operation':
            if tok == '-' and toktype(prev) not in ['alphanum', 'right_paren']:
                # -5 -> 0 - 5. A prefix operator has no left operand, so
                # nothing already on the stack may be reduced here.
                oq.append('0')
                stack.append(_NEGATE)
            elif typ == 'unary_operation':
                stack.append(tok)
            else:
                # Handle BEDMAS operator precedence
                prec = precedence[tok]
                while len(stack):
                    top = _stack_precedence(stack[-1])
                    if top is None:
                        break
                    if top > prec or (top == prec and tok in _RIGHT_ASSOCIATIVE):
                        break
                    popstack(stack,oq)
                stack.append(tok)
        elif typ == 'comma':
            # Finish evaluating all arithmetic before the comma
            while len(stack) and toktype(stack[-1]) != 'left_paren':
                popstack(stack,oq)
        elif typ == 'colon':
            # Colon is like a comma except it stays in the argument list
            while len(stack) and toktype(stack[-1]) != 'left_paren':
                popstack(stack,oq)
            oq.append(tok)
    while len(stack):
        popstack(stack,oq)
    if len(oq) == 1:
        return oq[0]
    else:
        raise Exception("Wrong number of items left on stack: "+str(oq))
# Parse a single statement line (without its child block)
def parse_line(ln):
    """Tokenize one line and turn it into a statement node.

    Returns
      ['if', cond] / ['while', cond]   for if and while headers,
      ['else if', cond]                for "else if" and "elif" headers,
      ['else']                         for a bare else,
      ['set', target, value]           when the line contains an '=' token,
      the parsed expression            otherwise.
    The caller is expected to append the child block to the header forms.

    Raises Exception if the line contains no tokens at all.
    """
    tokens = tokenize(ln.strip())
    # Nothing to dispatch on: report it instead of failing on tokens[0]
    if len(tokens) == 0:
        raise Exception("Empty statement: "+ln)
    head = tokens[0]
    if head == 'if' or head == 'while':
        return [ head, shunting_yard(tokens[1:]) ]
    elif len(tokens) >= 2 and head == 'else' and tokens[1] == 'if':
        return [ 'else if', shunting_yard(tokens[2:]) ]
    elif head == 'elif':
        return [ 'else if', shunting_yard(tokens[1:]) ]
    elif len(tokens) == 1 and head == 'else':
        return [ 'else' ]
    elif '=' in tokens:
        # Split at the first assignment sign; '==' is a different token
        eqplace = tokens.index('=')
        return [ 'set', shunting_yard(tokens[:eqplace]), shunting_yard(tokens[eqplace+1:]) ]
    else:
        return shunting_yard(tokens)