You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
290 lines
10 KiB
290 lines
10 KiB
import re
|
|
|
|
# Number of spaces at the beginning of a line
|
|
def spaces(ln):
    """Return the number of space characters at the beginning of ``ln``.

    Only the plain space character (' ') is counted; a leading tab or any
    other whitespace character ends the count.
    """
    # Stripping only ' ' from the left and comparing lengths gives the width
    # of the leading run of spaces without a manual index loop (and without
    # a local variable that shadows this function's own name).
    return len(ln) - len(ln.lstrip(' '))
|
|
|
|
# Main parse function
|
|
def parse(document):
    """Parse a whole program given as a single string.

    The text is cut into lines at every '\\n' character and the resulting
    list is handed to ``parse_lines``, whose result is returned unchanged.
    """
    source_lines = document.split('\n')
    return parse_lines(source_lines)
|
|
|
|
# Parse the statement-level structure, including if and while statements
|
|
def parse_lines(lns):
    """Parse a list of source lines into a statement-level syntax tree.

    Every unindented, non-blank line is a statement. The indented lines that
    follow it (up to the next unindented statement) form its child block,
    which is de-indented and parsed recursively. Only 'if', 'else',
    'else if' and 'while' statements may (and must) have a child block.

    'else if' / 'else' clauses are folded into the preceding 'if' node, so
    "if c1 s1 / else if c2 s2 / else s3" becomes
    ['if', c1, s1, ['if', c2, s2, s3]].

    Blank lines and lines holding only a '//' comment are ignored wherever
    they appear, including inside an indented block.

    Returns the single parsed statement if there is exactly one, otherwise
    ['seq', stmt1, stmt2, ...].

    Raises Exception for an indented line with no parent statement, for a
    missing or unexpected child block, and for an 'else' / 'else if' that
    has no preceding statement. Line numbers in messages are 0-based indices
    into ``lns``.
    """
    def is_blank(line):
        # Whitespace-only and comment-only lines produce no tokens at all.
        stripped = line.strip()
        return not stripped or stripped.startswith('//')

    o = []
    i = 0
    while i < len(lns):
        main = lns[i]
        # Skip empty lines
        if is_blank(main):
            i += 1
            continue
        if spaces(main) > 0:
            raise Exception("Line "+str(i)+" indented too much!")
        # Remember where the statement itself is, for error messages
        main_index = i
        # Grab the child block: every following line up to the next
        # unindented statement. Blank lines do not end the block and do not
        # influence the indentation width.
        start_child_block = i + 1
        indent = 99999999
        i += 1
        while i < len(lns):
            if not is_blank(lns[i]):
                sp = spaces(lns[i])
                if sp == 0:
                    break
                indent = min(sp, indent)
            i += 1
        # A real list (not a lazy map object) so that len() works on both
        # Python 2 and Python 3; blank lines are dropped so they never count
        # as a sub-clause.
        child_block = [x[indent:] for x in lns[start_child_block:i]
                       if not is_blank(x)]
        # Calls parse_line to parse the individual line
        out = parse_line(main)
        # Include the child block into the parsed expression
        if out[0] in ['if', 'else', 'while', 'else if']:
            if len(child_block) == 0:
                raise Exception("If/else/while statement must have sub-clause! (%d)" % main_index)
            else:
                out.append(parse_lines(child_block))
        else:
            if len(child_block) > 0:
                raise Exception("Not an if/else/while statement, can't have sub-clause! (%d)" % main_index)
        # This is somewhat complicated. Essentially, it converts something like
        # "if c1 then s1 elif c2 then s2 elif c3 then s3 else s4" (with
        # appropriate indenting) to [ if c1 s1 [ if c2 s2 [ if c3 s3 s4 ] ] ]
        if out[0] == 'else if':
            if len(o) == 0:
                raise Exception("Cannot start with else if! (%d)" % main_index)
            # Walk down the chain of nodes that already have an else-branch
            # (four elements) to reach the innermost open one.
            u = o[-1]
            while len(u) == 4:
                u = u[-1]
            u.append(['if'] + out[1:])
        elif out[0] == 'else':
            if len(o) == 0:
                raise Exception("Cannot start with else! (%d)" % main_index)
            u = o[-1]
            while len(u) == 4:
                u = u[-1]
            u.append(out[1])
        else:
            # Normal case: just add the parsed line to the output
            o.append(out)
    return o[0] if len(o) == 1 else ['seq'] + o
|
|
|
|
# Tokens contain one or more chars of the same type, with a few exceptions
|
|
def chartype(c):
    """Classify a character for the tokenizer.

    Returns one of 'alphanum' (ASCII letters, digits and '.'), 'space'
    (tab or space), 'brack' (round or square bracket), 'dquote', 'squote',
    or 'symb' for anything else.
    """
    letters_digits_dot = ('abcdefghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                          '0123456789.')
    # Ordered (members, class) pairs; the first group containing c wins
    groups = (
        (letters_digits_dot, 'alphanum'),
        ('\t ', 'space'),
        ('()[]', 'brack'),
    )
    for members, kind in groups:
        if c in members:
            return kind
    if c == '"':
        return 'dquote'
    if c == "'":
        return 'squote'
    return 'symb'
|
|
|
|
# Converts something like "b[4] = x+2 > y*-3" to
|
|
# [ 'b', '[', '4', ']', '=', 'x', '+', '2', '>', 'y', '*', '-', '3' ]
|
|
def tokenize(ln):
    """Split one source line into a list of token strings.

    A token is a run of characters of the same ``chartype`` class, except
    that brackets are always single-character tokens, a symbol run ending in
    '-' is split so that the '-' is its own token (so "*-" gives '*', '-'),
    and a quoted string (single or double quotes) is one token that keeps
    its surrounding quotes. Inside a string the escapes \\xHH (two hex
    digits), \\n (newline) and backslash + any character (that character
    literally) are decoded.

    Everything from a '//' that is outside a string literal to the end of
    the line is a comment and is ignored. A trailing ':' token is dropped.
    A line with no tokens (empty or comment only) gives an empty list.

    Raises Exception if a string literal is not closed or a \\x escape is
    not followed by two hex digits.
    """
    o = []
    # Characters of the token being built. Kept in a list owned by this call
    # (rather than a module-level global) so the nested helper can reset it
    # on both Python 2 and Python 3 without shared state.
    cur = []

    # Finish a token and start a new one
    def nxt():
        tok = ''.join(cur)
        del cur[:]
        if len(tok) >= 2 and tok[-1] == '-':
            # e.g. "*-" : the '-' belongs to the following operand
            o.extend([tok[:-1], '-'])
        elif tok.strip():
            o.append(tok)

    hex_digits = '0123456789abcdefABCDEF'
    tp = 'space'
    i = 0
    # Main loop
    while i < len(ln):
        c = chartype(ln[i])
        # Inside a string
        if tp == 'squote' or tp == 'dquote':
            if c == tp:
                # Closing quote: the string token is complete
                cur.append(ln[i])
                nxt()
                i += 1
                tp = 'space'
            elif ln[i] == '\\':
                if i + 1 >= len(ln):
                    # A backslash with nothing after it cannot be followed by
                    # a closing quote either
                    raise Exception("Unclosed string: "+ln)
                esc = ln[i+1]
                if esc == 'x':
                    digits = ln[i+2:i+4]
                    if len(digits) != 2 or any(d not in hex_digits for d in digits):
                        raise Exception("Invalid \\x escape in string: "+ln)
                    # chr(int(.., 16)) works on Python 2 and 3 alike, unlike
                    # str.decode('hex')
                    cur.append(chr(int(digits, 16)))
                    i += 4
                elif esc == 'n':
                    cur.append('\x0a')
                    i += 2
                else:
                    cur.append(esc)
                    i += 2
            else:
                cur.append(ln[i])
                i += 1
        # Not inside a string
        else:
            # Comments: only a '//' outside of a string starts one
            if ln[i:i+2] == '//':
                break
            # A token continues only while an alphanumeric run or a symbol
            # run is being extended; every other transition (brackets,
            # spaces, quotes, alphanum<->symbol) starts a new token.
            if not (c == tp and c in ('alphanum', 'symb')):
                nxt()
            cur.append(ln[i])
            tp = c
            i += 1
    nxt()
    if tp in ['squote','dquote']: raise Exception("Unclosed string: "+ln)
    # Guard against an empty token list (blank or comment-only line)
    if o and o[-1] in [':',':\n','\n']: o.pop()
    return o
|
|
|
|
# This is the part where we turn a token list into an abstract syntax tree
|
|
# Binding strength of every operator: a LOWER number binds MORE tightly.
# Division and modulo ('/', '%') sit on the same level as '*' and their
# signed variants ('#/', '#%') so that "a / b + c" groups as "(a / b) + c".
precedence = {
    '^': 1,
    '*': 2,
    '/': 2,
    '%': 2,
    '#/': 2,
    '#%': 2,
    '+': 3,
    '-': 3,
    '<': 4,
    '<=': 4,
    '>': 4,
    '>=': 4,
    '==': 5,
    'and': 6,
    '&&': 6,
    'or': 7,
    '||': 7,
    '!': 0
}


def toktype(token):
    """Classify a token (or an already-built syntax-tree node).

    Returns None for None, 'left_paren' for '(' or '[', 'right_paren' for
    ')' or ']', 'comma', 'colon', 'unary_operation' for '!', 'compound' for
    anything that is not a string (e.g. a nested list node),
    'binary_operation' for any other key of ``precedence``, and 'alphanum'
    for identifiers / numbers (ASCII letters, digits, '-', '.') and for
    quoted string literals.

    Raises Exception("Invalid token: ...") for any other string.
    """
    if token is None: return None
    elif token in ['(','[']: return 'left_paren'
    elif token in [')',']']: return 'right_paren'
    elif token == ',': return 'comma'
    elif token == ':': return 'colon'
    elif token in ['!']: return 'unary_operation'
    elif not isinstance(token,str): return 'compound'
    elif token in precedence: return 'binary_operation'
    # Raw string so the escapes reach the regex engine untouched; upper-case
    # letters are accepted because chartype() tokenizes them as alphanumeric
    elif re.match(r'^[0-9a-zA-Z\-\.]*$',token): return 'alphanum'
    elif token[0] in ['"',"'"] and token[0] == token[-1]: return 'alphanum'
    else: raise Exception("Invalid token: "+token)


# https://en.wikipedia.org/wiki/Shunting-yard_algorithm
#
# The algorithm works by maintaining three sequences: iq (the input tokens),
# stack and oq. The tokens are processed one by one. Values are moved
# immediately to the output queue. Operators are pushed onto the stack, but
# before an operator is pushed, every operator on the stack that binds at
# least as tightly is applied first (so equal-precedence operators group
# from the left; only '^' groups from the right).
# For example:
# iq = 2 + 3 * 5 + 7, stack = \, oq = \
# iq = + 3 * 5 + 7, stack = \, oq = 2
# iq = 3 * 5 + 7, stack = +, oq = 2
# iq = * 5 + 7, stack = +, oq = 2 3
# iq = 5 + 7, stack = + *, oq = 2 3 (since * binds tighter than +)
# iq = + 7, stack = + *, oq = 2 3 5
# iq = 7, stack = +, oq = [+ 2 [* 3 5]] (the new + first applies * and +)
# iq = \, stack = +, oq = [+ 2 [* 3 5]] 7
# iq = \, stack = \, oq = [+ [+ 2 [* 3 5]] 7]
#
# Functions, where function arguments begin with a left bracket preceded by
# the function name, are separated by commas, and end with a right bracket,
# are also included in this algorithm, though in a different way
def shunting_yard(tokens):
    """Turn a flat list of expression tokens into a nested-list syntax tree.

    Nodes have the form [operator_or_function, arg1, arg2, ...]; leaves are
    the token strings themselves. Square brackets after a value give
    ['access', value, index...], a free-standing '[...]' gives
    ['array_lit', item...], a unary minus gives ['-', '0', operand] and '!'
    gives ['!', operand]. ``tokens`` is not modified.

    Raises Exception if a token is invalid, an operator lacks an operand,
    brackets do not match, or the tokens do not reduce to exactly one
    expression.
    """
    oq = []
    stack = []
    # Stack marker for a unary minus. It must be told apart from the binary
    # '-' because it binds much more tightly: tighter than '*' but looser
    # than '^', so "-a ^ b" is "-(a ^ b)" while "a * -b" is "a * (-b)".
    NEG = object()
    NEG_PRECEDENCE = 1.5
    RIGHT_ASSOCIATIVE = ('^',)

    def is_operator(entry):
        return entry is NEG or \
            toktype(entry) in ('binary_operation', 'unary_operation')

    def binding(entry):
        return NEG_PRECEDENCE if entry is NEG else precedence[entry]

    def pop_operand(op):
        # Operands never reach across the '(' / '[' marker of the
        # enclosing bracket
        if not oq or toktype(oq[-1]) == 'left_paren':
            raise Exception("Missing operand for operator: " +
                            ('-' if op is NEG else op))
        return oq.pop()

    # The normal Shunting-Yard algorithm simply converts expressions into
    # reverse polish notation. Here, we try to be slightly more ambitious
    # and build up the AST directly on the output queue
    # eg. say oq = [ 2, 5, 3 ] and we add "+" then "*"
    # we get first [ 2, [ +, 5, 3 ] ] then [ [ *, 2, [ +, 5, 3 ] ] ]
    def popstack():
        tok = stack.pop()
        if tok is NEG:
            # -x  ->  0 - x
            oq.append(['-', '0', pop_operand(tok)])
            return
        typ = toktype(tok)
        if typ == 'binary_operation':
            right = pop_operand(tok)
            left = pop_operand(tok)
            oq.append([tok, left, right])
        elif typ == 'unary_operation':
            oq.append([tok, pop_operand(tok)])
        # Anything else (a left bracket that was never closed) is simply
        # dropped; its marker stays on the oq and is reported at the end.

    def apply_until_left_paren():
        while stack and toktype(stack[-1]) != 'left_paren':
            popstack()

    # Coalesce everything on the oq after the matching left-bracket marker
    # into a single node
    def close_bracket(tok):
        pos = len(oq) - 1
        while pos >= 0 and toktype(oq[pos]) != 'left_paren':
            pos -= 1
        if pos < 0:
            raise Exception("Unmatched closing bracket: " + tok)
        args = oq[pos+1:]
        del oq[pos:]
        if not args:
            raise Exception("Empty bracket expression before: " + tok)
        if tok == ']' and args[0] != 'id':
            oq.append(['access'] + args)
        elif tok == ']':
            oq.append(['array_lit'] + args[1:])
        elif tok == ')' and args[0] != 'id':
            # Function call: [name, arg1, arg2, ...]
            oq.append(args)
        else:
            # Plain grouping parentheses: keep just the inner expression
            if len(args) < 2:
                raise Exception("Empty parentheses")
            oq.append(args[1])

    # The main loop
    prev_type = None
    for tok in tokens:
        typ = toktype(tok)
        if typ == 'alphanum':
            oq.append(tok)
        elif typ == 'left_paren':
            # Handle cases like 3 * (2 + 5) by using 'id' as a default
            # function name. A bracket directly after a value or after a
            # closing bracket applies to that value instead.
            if prev_type not in ('alphanum', 'right_paren'):
                oq.append('id')
            # Say the statement is "... f(45...". At the start, we would
            # have f as the last item on the oq. We slide the left bracket
            # in underneath it, so we have ( f as the last two items on the
            # oq. We also put the left bracket on the stack so we have a
            # separator on both the stack and the oq
            oq.insert(len(oq) - 1, tok)
            stack.append(tok)
        elif typ == 'right_paren':
            # eg. f(27, 3 * 5 + 4). First, we finish evaluating all the
            # arithmetic inside the last argument. Then, we coalesce all of
            # the function arguments sitting on the oq into a single list
            apply_until_left_paren()
            if stack:
                stack.pop()
            close_bracket(tok)
        elif typ == 'unary_operation' or typ == 'binary_operation':
            if tok == '-' and prev_type not in ('alphanum', 'right_paren'):
                # Unary minus: a prefix operator never applies anything that
                # is already waiting on the stack
                stack.append(NEG)
            elif typ == 'unary_operation':
                stack.append(tok)
            else:
                # Handle BEDMAS operator precedence: apply every waiting
                # operator that binds tighter, or equally tight when the new
                # operator groups from the left
                prec = precedence[tok]
                while stack and is_operator(stack[-1]):
                    top = binding(stack[-1])
                    if top < prec or \
                            (top == prec and tok not in RIGHT_ASSOCIATIVE):
                        popstack()
                    else:
                        break
                stack.append(tok)
        elif typ == 'comma':
            # Finish evaluating all arithmetic before the comma
            apply_until_left_paren()
        elif typ == 'colon':
            # Colon is like a comma except it stays in the argument list
            apply_until_left_paren()
            oq.append(tok)
        prev_type = typ
    while len(stack):
        popstack()
    if len(oq) == 1:
        return oq[0]
    else:
        raise Exception("Wrong number of items left on stack: "+str(oq))
|
|
|
|
def parse_line(ln):
    """Parse a single statement line (without its child block).

    The line is stripped and tokenized, then dispatched on its first tokens:

    * 'if' / 'while'      -> [keyword, condition]
    * 'else if' / 'elif'  -> ['else if', condition]
    * a lone 'else'       -> ['else']
    * contains an '='     -> ['set', target, value], split at the first '='
    * anything else       -> the expression tree itself

    Expressions are built by ``shunting_yard``.
    """
    tokens = tokenize(ln.strip())
    head = tokens[0]

    if head in ('if', 'while'):
        return [head, shunting_yard(tokens[1:])]

    # "else if" written as two words
    if tokens[:2] == ['else', 'if']:
        return ['else if', shunting_yard(tokens[2:])]

    # ... or as the single keyword "elif"
    if head == 'elif':
        return ['else if', shunting_yard(tokens[1:])]

    if tokens == ['else']:
        return ['else']

    if '=' in tokens:
        # Assignment: everything left of the first '=' is the target
        split_at = tokens.index('=')
        target = shunting_yard(tokens[:split_at])
        value = shunting_yard(tokens[split_at+1:])
        return ['set', target, value]

    return shunting_yard(tokens)
|
|
|