"""
basictweetexlexer.py

This lexer is written to recognize basic tweetex syntax elements.
Here's a grammar of what it's meant to recognize.

Note: in EBNF, { anything in curly braces repeats,} so I've put parentheses
around curly braces that I want to actually see in the source code.
This grammar was revised with great help from Jim.

<story> :== <preamble> <passage>+
<preamble> :== <id> <macro>*
<id> := "\ifid{" [\w]+ "}"
<macro> :== "\" <command> [ "{" <argument> "}" ]*
<command> :== "link" | "start" | "author" | "title"
<passage> :== "\passage" { "{" <argument> "}" } <text>
<char> :== [^\\\{\}]
<argument> :== <char>+ | <macro>
<text> :== <char>* {<macro>} {<text>}


    -- lexer --
    COMMAND     (including backslash)
    LEFTCURLY
    RIGHTCURLY
    CHARACTERS

    --- parser ---
    ... recursive stuff ...


And here's an example of a correctly written tweetex document.

\title{My Story}
\author{Nick Creel}
\ifid{0CA8C7C5-F219-4B1B-A3A8-45710F389818}
\start{Starting Passage}

\passage{Starting Passage}
This is some text in the first passage
\link{Second Passage}{This link goes to the second passage}

\passage{Second Passage}
This is some text in the second passage.

"""
###imports
import argparse
import codecs
import re
###end imports
###class definitions
class Token:
    """ a token takes a re match object and a string containing the type of the
        regular expression as input. the value of the token is extracted from
        the match object.

        matchObj -- normally the re match object produced by the lexer; the
                    full matched text (group 0) becomes the token's value.
                    anything without a callable .group attribute (for
                    example an already-extracted string) is stored unchanged.
        type     -- name of the token pattern that matched, e.g. "COMMAND".

        children starts out as an empty list; nothing in this class fills it. """
    def __init__(self, matchObj, type):
        # Duck-type the match object instead of naming its class, so plain
        # text can still be passed in directly.
        extract = getattr(matchObj, "group", None)
        self.value = extract(0) if callable(extract) else matchObj
        self.type = type
        self.children = []
    def __repr__(self):
        return f"Token: VALUE = '{self.value}', TYPE = {self.type}"
    def __str__(self):
        # same text as __repr__
        return repr(self)
    def getValue(self):
        """ return the token's value. """
        return self.value
    def getType(self):
        """ return the token's type name. """
        return self.type

class Lexer:
    """ turns a list of source lines into a flat list of Token objects.

        lines    -- the strings to scan, one per source line.
        commands -- the command names listed in the grammar; lex() does not
                    consult this list, so any lowercase word after a backslash
                    is reported as a COMMAND.
        tokens   -- maps each token type name to the regular expression that
                    recognises it. the four patterns start on different
                    characters ({, }, backslash, anything else), so at most
                    one of them can match at a given position. """
    def __init__(self, lines):
        self.lines = lines
        self.commands = ["link", "start", "author", "title", "ifid"]
        self.tokens = {"LEFTCURLY":   r'(\{)' ,
                       "RIGHTCURLY":  r'(\})' ,
                       "COMMAND":     r'(\\([a-z]+))', #one slash and some text
                       "CHARACTERS":  r'[^\\\{\}]+'} #a run of anything that
                                                     #isn't a delimiter or a
                                                     #command (newlines too)

    def lex(self):
        """ scan every line from left to right and return the tokens found,
            in source order, as a list of Token(match, typeName) objects.

            a character that none of the patterns accepts (a backslash that
            is not followed by lowercase letters) is skipped, so the scan
            always moves forward and never raises on such input. """
        # compile once, outside the per-character loop
        compiled = [(name, re.compile(regex))
                    for name, regex in self.tokens.items()]
        result = []
        for line in self.lines:
            pos = 0
            while pos < len(line):
                for name, pattern in compiled:
                    match = pattern.match(line, pos)
                    # ignore empty matches so pos is guaranteed to advance
                    if match and match.end() > pos:
                        result.append(Token(match, name))
                        pos = match.end()
                        break
                else:
                    # nothing matched here: drop this one character
                    pos += 1
        return result
###end class definitions
###helper functions
def stringPreprocessing(fileObj):
    """ return a list holding every line of fileObj with each backslash
        doubled. fileObj is iterated once, so an open text file or any other
        iterable of strings works. """
    # NOTE(review): text read from a file already carries each backslash as
    # one literal character, so after this step every command reaches the
    # lexer with an extra backslash in front of it -- confirm that is wanted.
    return [rawLine.replace("\\", "\\\\") for rawLine in fileObj]
###end helper functions
###main routine
def main():
    """ command-line entry point: read the TweeTeX file named on the command
        line, run its lines through stringPreprocessing and the Lexer, and
        print the resulting tokens. """
    ######
    ### using argparse so that filenames can be provided as arguments in the
    ### command line.
    ### see https://docs.python.org/3/tutorial/stdlib.html#command-line-arguments
    ######
    # The description is a single-line literal: a backslash continuation
    # inside the string would pull the next line's indentation into the text.
    parser = argparse.ArgumentParser(
        description="accepts input for TweeTex compiler")
    parser.add_argument('input', metavar="f", type=str, nargs=1,
                        help="the location of the TweeTeX file to compile")
    args = parser.parse_args()
    print("Beginning lexical analysis...")
    # nargs=1 makes args.input a one-element list, hence the [0].
    # The encoding is stated explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(args.input[0], "r", encoding="utf-8") as file:
        lines = stringPreprocessing(file)
    lexer = Lexer(lines)

    tokens = lexer.lex()
    if tokens is not None:
        print(f'Tokens: {tokens}')
# run the lexer only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()