""" basictweetexlexer.py This lexer is written to recognize basic tweetex syntax elements. Here's a grammar of what it's meant to recognize. Note: in EBNF, { anything in curly braces repeats,} so I've put parenthesis around curly braces that I want to actually see in the source code. This grammar was revised with great help from Jim. :== + :== * := "\ifid{" [\w]+ "}" :== "\" [ "{" "}" ]* :== "link" | "start" | "author" | "title" :== "\passage" { "{" "}" } :== [^\\\{\}] :== + | :== * {} {} -- lexer -- COMMAND (including backslash) LEFTCURLY RIGHTCURLY CHARACTERS --- parser --- ... recursive stuff ... And here's an example of a correctly written tweetex document. \title{My Story} \author{Nick Creel} \ifid{0CA8C7C5-F219-4B1B-A3A8-45710F389818} \start{Starting Passage} \passage{Starting Passage} This is some text in the first passage \link{Second Passage}{This link goes to the second passage} \passage{Second Passage} This is some text in the second passage. """ ###imports import argparse import codecs import re ###end imports ###class definitions class Token: """ a token takes a re match object and a string containing the type of the regular expression as input. the value of the token is extracted from the match object. """ def __init__(self, matchObj, type): self.value = matchObj self.type = type self.children = [] def __repr__(self): return f"Token: VALUE = '{self.value}', TYPE = {self.type}" def __str__(self): return f"Token: VALUE = '{self.value}', TYPE = {self.type}" def getValue(self): return self.value def getType(self): return self.type class Lexer: def __init__(self, lines): self.lines = lines self.commands = ["link", "start", "author", "title", "ifid"] self.tokens = {"LEFTCURLY": r'(\{)' , "RIGHTCURLY": r'(\})' , "COMMAND": r'(\\([a-z]+))', #one slash and some text "CHARACTERS": r'[^\\\{\}]'} #anything that isn't a #delimiter or a command def lex(self): result = [] splitline = [] for line in self.lines: acc = str(re.split(self.tokens["COMMAND"], line)) #should I also #split lines with curlies? splitline.append(acc) print(splitline, "\n") for string in splitline: print(f'String is {string}\n') for token in self.tokens: match = re.match(self.tokens[token], string) if match: token = Token(match, token) result.append(token) return result ###end class definitions ###helper functions def stringPreprocessing(fileObj): result = [] for line in fileObj: line = line.replace("\\", "\\\\") #preserve backslash as character result.append(line) return result ###end helper functions ###main routine def main(): ###### ### using argparse so that filenames can be provided as arugments in the ### command line. ### see https://docs.python.org/3/tutorial/stdlib.html#command-line-arguments ###### parser = argparse.ArgumentParser(description="accepts input for TweeTex\ compiler") parser.add_argument('input', metavar="f", type=str, nargs=1, help="the location of the TweeTeX file to compile") args = parser.parse_args() lines=[] print("Beginning lexical analysis...") with open(args.input[0], "r") as file: lines = stringPreprocessing(file) lexer = Lexer(lines) tokens = lexer.lex() if tokens != None: print(f'Tokens: {tokens}') ###end main routine if __name__ == '__main__': main()