"""
 recdescent.py

 An implementation of a simple recursive descent parser
 implementing "grammar 2" at cs.marlboro.college/courses/
 fall2016/formal_languages/notes/programming_language_parsers

 See for example https://en.wikipedia.org/wiki/Recursive_descent_parser

 I've modified the grammar to remove the left recursion
 in grammar 2, in a way that's similar to the discussion at
 https://en.wikipedia.org/wiki/Left_recursion . The problem
 is that with left recursion, the recursive definition 
 expr=expr+term will expand into expr(expr(expr(....) infinitely.

 The right recursive modified version expr=term+expr avoids this
 problem by expanding into expr(term(expr(term(...)))) which
 needs to find the "+" for each continued level of recursion,
 and which will therefore stop at the appropriate depth.

 This modification does change the structure of the resulting parse tree.
 However, the language is still the same, and the arithmetic calculation
 is also still the same ... so maybe that's good enough. :)

 The difference can be seen by looking at the parse tree for "1+2+3". 
 The original grammar 2 would give the tree (expr (expr 1 + 2) + 3),
 while the modified grammar (avoiding left recursion) gives
 (expr 1 + (expr 2 + 3)).

 Using a lisp notation to represent the tree, the original grammar 
 turns "1+2+3" into the tree (+ (+ 1 2) 3), while the modified
 grammar will give (+ 1 (+ 2 3)). 

 Anyway, here's the modified grammar that is implemented below.

    -- lexer handles these --
    addop       ::=   "+" | "-"
    mulop       ::=   "*" | "/"
    number      ::=   digit+
    identifier  ::=   letter+
    digit       ::=   "0" | "1" | ... | "8" | "9"
    letter      ::=   "a" | "b" | ... | "y" | "z"

    -- parser handles these  --              kids    lisp_template
                                             ----    --------------
    expr        ::=   term                   1       singlet kid term
                      term addop expr        3       ( op left right )
    term        ::=   factor                 1       singlet kid factor
                      factor mulop term      3       ( op left right )
    factor      ::=   id                     0       its value
                      number                 0       its value
                      "-" factor             2       (- fact)
                      "(" expr ")"           3       kid expr


 The processing is 

     input_string --> lexer ------> parser -----> compiler --> output_string
     symbols                tokens         tree                lisp

 I'm storing the tokens as a sequence of Token objects,
 then reusing them as nodes in a parse tree.

 The lexer does regular expression matching.

 The parser uses a "recursive descent" method,
 with a function call (e.g. expression()) for 
 the left hand side of each production rule, 
 which tries to find the right hand side recursively.

 The lisp compilation step is just a few string templates,
 for example "term addop expr" turns into "({op} {term} {expr})".

 So there you go.

 Running it :

   $ python --version
   Python 3.7.3
   $ python recdescent.py > recdescent_output.txt

 Jim Mahoney | cs.marlboro.college | Sep 2019 | MIT License
"""

import re
DEBUG = False   # set True to trace the lexer & parser via print_debug()

import pprint
pp = pprint.PrettyPrinter(indent=4)   # shared pretty-printer for parse trees

def print_debug(name, value):
    """ Print a labeled debugging line, but only when DEBUG is turned on. """
    if not DEBUG:
        return
    print(f" *DEBUG* {name} is '{value}'")

class Token:
    """ A Token is both 
           (a) what the lexer finds in the input and
           (b) a node in the parse tree.
    """

    def __init__(self, kind='', value='', children=None):
        # NOTE: the previous default children=[] was a mutable default
        # argument, silently shared by every Token created without an
        # explicit children list; use a None sentinel instead.
        self.kind = kind           # e.g. 'NUMBER', 'ADDOP', 'EXPRESSION', ...
        self.value = value         # matched text, e.g. '123' or '+'
        self.children = children if children is not None else []

    def __str__(self):
        return "Token(kind='"  + self.kind  + "'," + \
                     "value='" + self.value + "'," + \
                     "children=[" + ','.join(map(str, self.children)) + "])"

    def as_tree(self):
        """ Return this node and its descendants as nested
            [kind, value, [child_trees...]] lists. """
        return [self.kind, self.value,
                list(map(lambda x: x.as_tree(), self.children))]
    
    def as_lisp(self):
        """ Compile this parse (sub)tree into a lisp-style string. """
        if len(self.children) == 0:                  # identifier or number
            return str(self.value)
        elif len(self.children) == 1:                # singlet <term> or <expr>
            return self.children[0].as_lisp()
        elif len(self.children) == 2:                # (- stuff)
            return '(- ' + self.children[1].as_lisp() + ')'
        elif self.children[0].kind == 'LEFTPAREN':   # just output interior
            return self.children[1].as_lisp()
        elif len(self.children) == 3:                # (op thing thing)
            return '(' + self.children[1].value \
                       + ' ' + self.children[0].as_lisp() \
                       + ' ' + self.children[2].as_lisp() + ')'
        else:
            return f'OOPS:{str(self)}'
        
# Map each token regex to its token kind.  Group 1 of each regex is the
# token's text; the NUMBER and IDENTIFIER patterns also match a second group
# (one following symbol or end-of-string) which next_token() does NOT
# consume, since it advances only past group 1.  Patterns are tried in
# insertion order (guaranteed for dicts in Python 3.7+).
lexer_kind = { r'([0-9]+)([^0-9]|$)' : 'NUMBER',
               r'([a-z]+)([^a-z]|$)' : 'IDENTIFIER',
               r'(\+|\-)'            : 'ADDOP',
               r'(\*|\/)'            : 'MULOP',
               r'(\()'               : 'LEFTPAREN',
               r'(\))'               : 'RIGHTPAREN' }

def next_token(symbols):
    """ Search the start of the symbols for the next token.
        Return (token, remaining_symbols).
        If a legal token cannot be found, throw an exception. """
    print_debug('start next_token symbols', symbols)
    symbols = symbols.lstrip()
    for (pattern, kind) in lexer_kind.items():
        match = re.match(pattern, symbols)
        if not match:
            continue
        text = match.group(1)
        token = Token(kind=kind, value=text)
        print_debug('in next_token token is ', token)
        return (token, symbols[len(text):])
    raise Exception('Token not found.')

def lexer(symbols):
    """ Convert a string of symbols to a list of (childless) tokens """
    result = []
    remaining = symbols
    while remaining:
        (token, remaining) = next_token(remaining)
        result.append(token)
    return result

def parse(tokens):
    """ Convert a sequence of tokens to a parse tree, that is,
        a single root token together with its descendants. """
    (tree, remaining) = expression(tokens)
    if remaining:
        raise Exception("OOPS - extra symbols in input")
    return tree

def expression(tokens):
    """ Search for an <expression> within a sequence of tokens.
        Return (tree, remaining_tokens).
        Raise an exception if the input ends unexpectedly. """
    # Guard first: the debug f-string below evaluates tokens[0] even when
    # DEBUG is off, so an empty list (e.g. from input "(") used to raise
    # a bare IndexError instead of a useful message.
    if not tokens:
        raise Exception('OOPS - unexpected end of input in <expression>')
    print_debug(f'<expression> from {len(tokens)} tokens ',
                (str(tokens[0]), str(tokens[-1])))
    # expr ::= term | term addop expr   (right recursive; see file header)
    (left_side, tokens) = term(tokens)
    if tokens and tokens[0].kind == 'ADDOP':
        addop = tokens.pop(0)
        (right_side, tokens) = expression(tokens)
        its_children = [left_side, addop, right_side]
    else:
        its_children = [left_side]
    return (Token(kind = 'EXPRESSION',
                  value = '',
                  children = its_children),
            tokens)

def term(tokens):
    """ Search for a <term> within a sequence of tokens.
        Return (tree, remaining_tokens).
        Raise an exception if the input ends unexpectedly. """
    # Guard first: the debug f-string below evaluates tokens[0] even when
    # DEBUG is off, so an empty token list used to raise a bare IndexError.
    if not tokens:
        raise Exception('OOPS - unexpected end of input in <term>')
    print_debug(f'<term> from {len(tokens)} tokens ',
                (str(tokens[0]), str(tokens[-1])))
    # term ::= factor | factor mulop term   (right recursive)
    (left_side, tokens) = factor(tokens)
    if tokens and tokens[0].kind == 'MULOP':
        mulop = tokens.pop(0)
        (right_side, tokens) = term(tokens)
        its_children = [left_side, mulop, right_side]
    else:
        its_children = [left_side]
    return (Token(kind = 'TERM',
                  value = '',
                  children = its_children),
            tokens)

def factor(tokens):
    """ Search for a <factor> within a sequence of tokens.
        Return (tree, remaining_tokens).
        Raise an exception if the input ends unexpectedly
        or no factor can be found. """
    # Guard first: the debug f-string below evaluates tokens[0] even when
    # DEBUG is off, so an empty token list used to raise a bare IndexError.
    if not tokens:
        raise Exception('OOPS - unexpected end of input in <factor>')
    print_debug(f'<factor> from {len(tokens)} tokens ',
                (str(tokens[0]), str(tokens[-1])))
    if tokens[0].kind in ('IDENTIFIER', 'NUMBER'):
        singlet = tokens.pop(0)
        return (Token(kind='FACTOR',
                      value='',
                      children=[singlet]),
                tokens)
    elif tokens[0].kind == 'ADDOP' and tokens[0].value == '-':
        tokens.pop(0)                 # consume the unary '-'
        (fact, tokens) = factor(tokens)
        return (Token(kind='FACTOR',
                      value='',
                      children=[Token(kind='NEGATE'), fact]),
                tokens)
    elif tokens[0].kind == 'LEFTPAREN':
        leftparen = tokens.pop(0)
        (expr, tokens) = expression(tokens)
        # Check before popping: input like "(1" ends before the closing
        # paren, and pop(0) on an empty list would raise IndexError
        # rather than the intended error below.
        if not tokens or tokens[0].kind != 'RIGHTPAREN':
            raise Exception('OOPS - mismatched parens')
        rightparen = tokens.pop(0)
        return (Token(kind='FACTOR',
                      value='',
                      children=[leftparen, expr, rightparen]),
                tokens)
    else:
        raise Exception('OOPS - failed to find factor')

def test():
    """ Run the lexer -> parser -> lisp pipeline on several sample
        input strings, printing the result of each stage. """
    samples = ['123',
               'a*b',
               'a + 2 * b',
               '1*c*2*d',
               'a*b+c*d',
               '123 + abc + z * ( 3 + x )']
    for source in samples:
        print('='*40)
        print('-- input : ', source)
        print('-- lexer :')
        tokens = lexer(source)
        for token in tokens:
            print(token)
        print('-- parser : ')
        tree = parse(tokens)
        pp.pprint(tree.as_tree())
        print('-- output : ', tree.as_lisp())

if __name__ == '__main__':
    # Run the demo suite when executed as a script (not when imported).
    test()