Jim's
Tutorials

Fall 2016
course
navigation

Jim says

Nice. I edited it a bit with you in class:
Here's what I got when I ran it:
[mahoney@softmaple Desktop]$ racket lex.rkt indentmarker.py.mod 
Warning: lexer at # can accept the empty string.
(ID result)
(PUNC =)
(ID self)
(LIT .insert)
(ID _tokens)
(PUNC ()
(ID self)
(LIT .source)
(ID _code_lines)
(PUNC ))
(KEYWORD if)
(KEYWORD not)
(ID result)
(PUNC :)
(INDENT)
(ID print)
(PUNC ()
(LIT " ERROR ")
(PUNC ))
(DEDENT)
(DEDENT)
(DEDENT)
(ID Dedenter)
(PUNC ()
(PUNC ))
(LIT .main)
(PUNC ()
(PUNC ))
(EOF)
'()

Python Parsing

So I made an attempt to complete Matt Might's first project: a python lexer/tokenizer.
I decided to use a first pass, with a parser coded in Python, to insert INDENT and DEDENT tokens (to signify logical blocks) into the source code, as well as to mark newlines that don't occur inside string literals or argument lists.
# Here is the code:
"""
indentMarker.py
By Logan Davis

DESCRIPTION:
    A script to add INDENT, DEDENT, and NEWLINE
    tokens to a Python source file for parsing.

9/19/16 | Python 3.5 | Emacs 24
"""
import sys


class Dedenter(object):
    """
    Tears through a .py file to generate
    an indent-tokenized version for parsing.

    To use: instance the class and call its .main() method.
    The output is written next to the input as '<filename>.mod'.
    """

    def __init__(self, filename=None):
        """
        Reads the source file and opens the '.mod' output file.

        filename -- path of the Python file to process; when omitted,
                    the first command-line argument (sys.argv[1]) is used.
        """
        self.filename = sys.argv[1] if filename is None else filename
        # Read through a context manager so the input handle is released.
        with open(self.filename, "r") as source:
            raw_lines = source.read().split("\n")
        self.source_code_lines = self.clean_hash_comments(raw_lines)
        # Stays open until main() has written every line, then is closed there.
        self.mod_source_file = open(self.filename + ".mod", "w")
        self.indent_depth = self.id_indent_depth(self.source_code_lines)

    def clean_hash_comments(self, source_lines):
        """
        Removes # comments and empty lines from source_lines
        and returns the new list of strings.

        A '#' only starts a comment when it is outside a quoted string.
        The quote state is tracked per line and remembers WHICH quote
        character opened the string, so an apostrophe inside a
        double-quoted string (or the reverse) does not end it.
        Backslash escapes are not interpreted.
        """
        stripped_source = []
        for line in source_lines:
            open_quote = None  # the quote char of the string we are inside
            kept = []
            for char in line:
                if open_quote is None:
                    if char == "#":
                        break  # rest of the line is a comment
                    if char in ("'", '"'):
                        open_quote = char
                elif char == open_quote:
                    open_quote = None
                kept.append(char)
            stripped_source.append("".join(kept))
        return [line for line in stripped_source if line != ""]

    def count_pre_space(self, line):
        """
        Returns the number of spaces at the start of arg [string]line.
        An empty line, or a line made only of spaces, counts as 0.
        """
        remainder = line.lstrip(" ")
        if not remainder:
            return 0
        return len(line) - len(remainder)

    def id_indent_depth(self, source_file):
        """
        ID's the length of indentation that should be
        used to analyze the file: the leading-space count of the first
        indented line found outside a triple-quoted block.
        If nothing is indented, 0 is returned.
        """
        in_comment = False
        for line in source_file:
            triple_quotes = line.count('"""') + line.count("'''")
            if triple_quotes:
                # Only an odd number of delimiters changes state; a one-line
                # docstring opens and closes on the same line.
                if triple_quotes % 2:
                    in_comment = not in_comment
            elif in_comment or not line or line.isspace():
                continue
            elif line[0] == " ":
                return self.count_pre_space(line)
        return 0

    def insert_tokens(self, source_lines):
        """
        Writes a copy of source_lines to self.mod_source_file while
        inserting INDENT and DEDENT tokens in front of lines whose
        indentation level changes. Also appends NEWLINE to every line
        that ends outside of strings and parenthesised (multi-line) tuples.

        Empty and whitespace-only lines are dropped.
        Returns True once every line has been written.
        """
        out = self.mod_source_file
        in_string = False
        in_tuple = False
        logical_indent = 0
        for line in source_lines:
            if line == "" or line.isspace():  # empty lines
                continue

            if line[0] != " ":  # non-indented code
                indent = 0
            elif self.indent_depth:
                indent = self.count_pre_space(line) // self.indent_depth
            else:
                # No indentation unit was detected (e.g. the only indented
                # lines sit inside a docstring): keep the current level
                # instead of dividing by zero.
                indent = logical_indent

            if in_tuple or indent == logical_indent:
                out.write(line)
            elif indent > logical_indent:
                # Deeper lines open one level at a time.
                out.write("INDENT " + line)
                logical_indent += 1
            else:
                out.write("DEDENT " * (logical_indent - indent) + line)
                logical_indent = indent

            if "(" in line and ")" not in line:  # started multi-line tuple?
                in_tuple = True
            elif in_tuple and ")" in line:  # ended tuple?
                in_tuple = False
            # An odd number of quote characters means a string was opened
            # or closed on this line.
            if line.count("'") % 2 != 0 or line.count('"') % 2 != 0:
                in_string = not in_string
            if not in_tuple and not in_string:
                out.write("NEWLINE")
            out.write("\n")
        return True

    def main(self):
        """
        Wraps the parsing and token insertion methods, then closes the
        output file so everything written is flushed to disk.
        """
        try:
            result = self.insert_tokens(self.source_code_lines)
        finally:
            self.mod_source_file.close()
        if not result:
            print("ERROR")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: python indentMarker.py <source-file.py>")
    scrub = Dedenter()
    scrub.main()
The second pass is done in a Racket lexer to properly tokenize the intermediate code (called on the file-name.py.mod):
#lang racket
#|
   python_lexer.rkt
   By Logan Davis

   A simple python lexer
   TODO: FINISH SPEC REQUIREMENTS

   9/20/16
|#
(require parser-tools/lex)
(require parser-tools/lex-sre)

; Set to #t by the lexer's (eof) rule; `reader` stops once it is true.
(define end-of-file #f)

; '#' followed by everything up to (not including) the newline.
(define-lex-abbrev single-comment (: #\# (* (char-complement #\newline))))

; Text wrapped in a pair of """ or a pair of '''.
(define-lex-abbrev multi-comment
  (or (: "\"\"\"" (repetition 0 +inf.0 any-char) "\"\"\"")
      (: "'''" (repetition 0 +inf.0 any-char) "'''")))

(define-lex-abbrev punct
  (or #\+ #\- #\* #\\ #\( #\) #\: "**" "==" "<" ">" "!" "="))

; Literals: a run of digits; optional digits, a ".", then digits or
; lowercase letters; or a single-/double-quoted run of spaces, letters
; and backslash sequences.
(define-lex-abbrev literal
  (or (+ (char-range #\0 #\9))
      (: (* (char-range #\0 #\9))
         "."
         (or (* (char-range #\0 #\9)) (* (char-range #\a #\z))))
      (: #\"
         (+ (or #\space
                (: #\\ (repetition 0 1 any-char))
                (: #\\ #\newline)
                (char-range #\a #\z)
                (char-range #\A #\Z)))
         #\")
      (: #\'
         (+ (or #\space
                (: #\\ (repetition 0 1 any-char))
                (: #\\ #\newline)
                (char-range #\a #\z)
                (char-range #\A #\Z)))
         #\')))

; Identifiers: one or more underscores and ASCII letters.
(define-lex-abbrev id-word
  (+ (or "_" (char-range #\a #\z) (char-range #\A #\Z))))

(define-lex-abbrev keyword?
  (or "False" "class" "finally" "is" "return" "None" "continue" "for"
      "lambda" "try" "True" "def" "from" "nonlocal" "while" "and" "del"
      "global" "not" "with" "as" "elif" "if" "or" "yield" "assert" "else"
      "import" "pass" "break" "except" "in" "raise"))

; A lexer to tokenize python files.
; Each call consumes one lexeme from the port and prints its token.
; Rule order matters when two rules match the same lexeme: the marker
; words and keywords are listed before id-word so they win the tie.
(define calc-lexer
  (lexer
   ["DEDENT" (printf "(DEDENT)\n")]
   ["INDENT" (printf "(INDENT)\n")]
   ["NEWLINE" (printf "(NEWLINE)\n")]
   [punct (printf "(PUNC ~a)\n" lexeme)]
   [literal (printf "(LIT ~a)\n" lexeme)]
   [(or single-comment multi-comment) (void)]
   [keyword? (printf "(KEYWORD ~a)\n" lexeme)]
   [id-word (printf "(ID ~a)\n" lexeme)]
   ; Whitespace is skipped. The former "" alternative was removed: a rule
   ; that can match the empty string triggers the "lexer can accept the
   ; empty string" warning and lets the lexer succeed without consuming
   ; any input, so `reader` could recurse forever on a character that no
   ; other rule recognizes.
   [(or #\newline #\space) (void)]
   [(eof) (and (set! end-of-file #t) (printf "(EOF)\n"))]))

; Input port for the file named by the first command-line argument.
(define test
  (open-input-file (vector-ref (current-command-line-arguments) 0)))

; takes a input stream and parses until eof
(define reader
  (lambda (file)
    (if end-of-file
        '()
        ; Every lexer action returns a non-#f value, so `and` simply
        ; sequences "lex one token" and "keep reading".
        (and (calc-lexer file) (reader file)))))

(reader test)
Currently it doesn't entirely match the spec, nor does it return a cons of each token (Matt assumes this is being done in DrRacket, where it will just print returned values), but these alterations are additions of rules, not entirely new modules. I will finish it up while reading about parsing. I tested it on Matt's example on the site (since I don't have access to "vulcan," the server where their test code is actually held). I get the correct token sequence.
http://cs.marlboro.edu/ courses/ fall2016/jims_tutorials/ ldavis/ Sep_21
last modified Wednesday September 21 2016 11:56 am EDT

attachments [paper clip]

     name last modified size
   indentmarker.py Sep 21 2016 11:49 am 5.12kB    indentmarker.py.mod Sep 21 2016 11:56 am 5.89kB