""" recdesent.py An implementation of a simple recursive descent parser implementing "grammar 2" at cs.marlboro.college/courses/ fall2016/formal_languages/notes/programming_language_parsers See for example https://en.wikipedia.org/wiki/Recursive_descent_parser I've modified the grammar to remove the left recursion in grammar 2, in a way that's similar to the discussion at https://en.wikipedia.org/wiki/Left_recursion . The problem is that with left recursion, the recursive definition expr=expr+term will expand into expr(expr(expr(....) infinitely. The right recursive modified version expr=term+expr avoids this problem by expending into expr(term(expr(term(...)))) which needs to find the "+" for each continued level of recursion, and which will therefore stop at the appropriate depth. This modification does change the structure of the resulting parse tree. However, the language is still the same, and the arithmetic calculation is also still the same ... so maybe that's good enough. :) The difference can be seen by looking at the parse tree for "1+2+3". The original grammar 2 would give the tree (expr (expr 1 + 2) + 3), while the modified grammar (avoiding left recursion) Using a lisp notation to represent the tree, the original grammar turns "1+2+3" into the tree while the modified grammar will give (+ 1 (+ 2 3)). Anyway, here's the modified grammar that is implemented below. -- lexer handles these -- addop ::= "+" | "-" mulop ::= "*" | "/" number ::= digit+ identifier ::= letter+ digit ::= "0" | "1" | ... | "8" | "9" letter ::= "a" | "b" | ... | "y" | "z" -- parser handles these -- kids lisp_template ---- -------------- expr ::= term 1 singlet kid term term addop expr 3 ( op left right ) term ::= factor 1 singlet kid factor factor mulop term 3 ( op left right ) factor ::= id 0 its value number 0 its value "-" factor 2 (- fact) "(" expr ")" 3 kid expr The processing is input_string --> lexer ------> parser -----> compiler --> output_string symbols tokens tree lisp I'm storing the tokens as a sequence of Token objects, then reusing them as nodes in a parse tree. The lexer does regular expression matching. The parser uses a "recursive descent" method, with a function call (e.g. expression()) for the left hand side of each production rule, which tries to find the right hand side recursively. The lisp compilation step is just a few string templates, for example "term addop expr" turns into "({op} {term} {expr})". So there you go. Running it : $ python --version Python 3.7.3 $ python recdescent.py > recdescent_output.txt Jim Mahoney | cs.marlboro.college | Sep 2019 | MIT License """ import re DEBUG = False import pprint pp = pprint.PrettyPrinter(indent=4) def print_debug(name, value): if DEBUG: print(f" *DEBUG* {name} is '{value}'") class Token: """ A Token is both (a) what the lexer finds in the input and (b) a node in the parse tree. """ def __init__(self, kind='', value='', children=[]): self.kind = kind self.value = value self.children = children def __str__(self): return "Token(kind='" + self.kind + "'," + \ "value='" + self.value + "'," + \ "children=[" + ','.join(map(str, self.children)) + "])" def as_tree(self): return [self.kind, self.value, list(map(lambda x: x.as_tree(), self.children))] def as_lisp(self): if len(self.children) == 0: # identifier or number return str(self.value) elif len(self.children) == 1: # singlet or return self.children[0].as_lisp() elif len(self.children) == 2: # (- stuff) return '(- ' + self.children[1].as_lisp() + ')' elif self.children[0].kind == 'LEFTPAREN': # just output interior return self.children[1].as_lisp() elif len(self.children) == 3: # (op thing thing) return '(' + self.children[1].value \ + ' ' + self.children[0].as_lisp() \ + ' ' + self.children[2].as_lisp() + ')' else: return f'OOPS:{str(self)}' lexer_kind = { r'([0-9]+)([^0-9]|$)' : 'NUMBER', r'([a-z]+)([^a-z]|$)' : 'IDENTIFIER', r'(\+|\-)' : 'ADDOP', r'(\*|\/)' : 'MULOP', r'(\()' : 'LEFTPAREN', r'(\))' : 'RIGHTPAREN' } def next_token(symbols): """ Search the start of the symbols for the next token. Return (token, remaining_symbols). If a legal token cannot be found, throw an exception. """ print_debug('start next_token symbols', symbols) symbols = symbols.lstrip() for regex in lexer_kind: search = re.match(regex, symbols) if search: found = search.groups()[0] token = Token(kind=lexer_kind[regex], value=found) print_debug('in next_token token is ', token) return (token, symbols[len(found):]) raise Exception('Token not found.') def lexer(symbols): """ Convert a string of symbols to a list of (childless) tokens """ tokens = [] while len(symbols) > 0: (token, symbols) = next_token(symbols) tokens.append(token) return tokens def parse(tokens): """ Convert a sequence of tokens to a parse tree, that is, a single root token and its ancestors. """ (tree, leftover_tokens) = expression(tokens) if leftover_tokens: raise Exception("OOPS - extra symbols in input") return tree def expression(tokens): """ Search for an within a sequence of tokens. Return (tree, remaining_tokens). """ print_debug(f' from {len(tokens)} tokens ', (str(tokens[0]), str(tokens[-1]))) (left_side, tokens) = term(tokens) if tokens and tokens[0].kind == 'ADDOP': addop = tokens.pop(0) (right_side, tokens) = expression(tokens) its_children = [left_side, addop, right_side] else: its_children = [left_side] return (Token(kind = 'EXPRESSION', value = '', children = its_children), tokens) def term(tokens): """ Search for a within a sequence of tokens. Return (tree, remaining_tokens). """ print_debug(f' from {len(tokens)} tokens ', (str(tokens[0]), str(tokens[-1]))) (left_side, tokens) = factor(tokens) if tokens and tokens[0].kind == 'MULOP': mulop = tokens.pop(0) (right_side, tokens) = term(tokens) its_children = [left_side, mulop, right_side] else: its_children = [left_side] return (Token(kind = 'TERM', value = '', children = its_children), tokens) def factor(tokens): """ Search for a with a sequence of tokens. Return (tree, remaining_tokens) """ print_debug(f' from {len(tokens)} tokens ', (str(tokens[0]), str(tokens[-1]))) if tokens[0].kind in ('IDENTIFIER', 'NUMBER'): singlet = tokens.pop(0) return (Token(kind='FACTOR', value='', children=[singlet]), tokens) elif tokens[0].kind == 'ADDOP' and tokens[0].value == '-': negate = tokens.pop(0) (fact, tokens) = factor(tokens) return (Token(kind='FACTOR', value='', children=[Token(kind='NEGATE'), fact]), tokens) elif tokens[0].kind == 'LEFTPAREN': leftparen = tokens.pop(0) (expr, tokens) = expression(tokens) rightparen = tokens.pop(0) if rightparen.kind != 'RIGHTPAREN': raise Exception('OOPS - mismatched parens') return (Token(kind='FACTOR', value='', children=[leftparen, expr, rightparen]), tokens) else: raise Exception('OOPS - failed to find factor') def test(): tests = ['123', 'a*b', 'a + 2 * b', '1*c*2*d', 'a*b+c*d', '123 + abc + z * ( 3 + x )' ] for symbols in tests: print('='*40) print('-- input : ', symbols) print('-- lexer :') tokens = lexer(symbols) for token in tokens: print(token) print('-- parser : ') tree = parse(tokens) pp.pprint(tree.as_tree()) print('-- output : ', tree.as_lisp()) if __name__ == '__main__': test()