""" basictweetexlexer.py This lexer is written to recognize basic tweetex syntax elements. Here's a grammar of what it's meant to recognize. Note: in EBNF, { anything in curly braces repeats,} so I've put parenthesis around curly braces that I want to actually see in the source code. This grammar was revised with great help from Jim. :== + :== * := "\ifid{" [\w]+ "}" :== "\" [ "{" "}" ]* :== "link" | "start" | "author" | "title" :== "\passage" { "{" "}" } :== [^\\\{\}] :== + | :== * {} {} -- lexer -- COMMAND (including backslash) LEFTCURLY RIGHTCURLY CHARACTERS --- parser --- ... recursive stuff ... And here's an example of a correctly written tweetex document. \title{My Story} \author{Nick Creel} \ifid{0CA8C7C5-F219-4B1B-A3A8-45710F389818} \start{Starting Passage} \passage{Starting Passage} This is some text in the first passage \link{Second Passage}{This link goes to the second passage} \passage{Second Passage} This is some text in the second passage. """ import argparse import codecs import re class Token: """ A lexer token or parser node. """ def __init__(self, value, _type): self.value = value # from lexer match self._type = _type self.children = [] def __str__(self): return f"Token: VALUE = '{self.value}', TYPE = {self.type}" class Lexer: """ >>> lex = Lexer("one two \start three \link{url} four") """ def __init__(self, text): """ >>> re.match(self.regexes['LEFTCURLY'], r"{foo").match '{' >>> re.match(self.regexes['RIGHTCURLY'], r"}foo").match '}' >>> re.match(self.regexes['COMMAND'], r"\foo bar").match 'foo' >>> re.match(self.regexes['COMMAND'], r"\foo{bar}").match 'foo' """ self.text = text self.lines = [] self.commands = ["link", "start", "author", "title", "ifid"] self.regexes = {"LEFTCURLY": r'(\{)' , # { "RIGHTCURLY": r'(\})' , # } "COMMAND": r'(\\([a-z]+))', # \command "CHARACTERS": r'[^\\\{\}]+'} # .* def next_token(self): """ Look for a token at the start of the text. Create it, add it to the list of tokens, and remove matching text. """ pass if __name__ == '__main__': import doctest doctest.testmod()