Jim says

Nice. I edited it a bit with you in class:

changed "NEWLINE" to " NEWLINE" to avoid word concatenation in output
added usage docs and a few other minor changes in file info doc string
a test suite would be nice.

Here's what I got when I ran it:

[mahoney@softmaple Desktop]$ racket lex.rkt indentmarker.py.mod 
Warning: lexer at # can accept the empty string.
(ID result)
(PUNC =)
(ID self)
(LIT .insert)
(ID _tokens)
(PUNC ()
(ID self)
(LIT .source)
(ID _code_lines)
(PUNC ))
(KEYWORD if)
(KEYWORD not)
(ID result)
(PUNC :)
(INDENT)
(ID print)
(PUNC ()
(LIT " ERROR ")
(PUNC ))
(DEDENT)
(DEDENT)
(DEDENT)
(ID Dedenter)
(PUNC ()
(PUNC ))
(LIT .main)
(PUNC ()
(PUNC ))
(EOF)
'()

Python Parsing

So I made an attempt to complete Matt Might's first project: a python lexer/tokenizer.

I decided to use a first pass, with a parser coded in Python, to insert INDENT and DEDENT tokens (to signify logic blocks) into the source code, as well as to mark newlines that don't occur inside string literals or call arguments.

Here is the code:

"""
indentMarker.py
By Logan Davis

DESCRIPTION:
A script to add INDENT, DEDENT, and NEWLINE tokens to
a Python source file for parsing.

9/19/16 | Python 3.5 | Emacs 24
"""
import sys

class Dedenter(object):
    """
    Tears through a .py file
    to generate an indent-tokenized
    version for parsing.

    To use: instance the class and call its .main() method
    """

    # Delimiters of strings that may span several lines.
    _TRIPLE_QUOTES = ('"""', "'''")
    # Longest delimiters first so '"""' is never read as '"' + '""'.
    _ALL_QUOTES = _TRIPLE_QUOTES + ('"', "'")

    def __init__(self, filename=None):
        """
        Reads the source file and opens the '.mod' output file.

        filename: path of the Python file to process; when omitted
                  the first command-line argument is used.
        """
        self.filename = sys.argv[1] if filename is None else filename
        # Read through a context manager so the input handle is closed.
        with open(self.filename, "r") as source_file:
            raw_lines = source_file.read().split("\n")
        self.source_code_lines = self.clean_hash_comments(raw_lines)
        self.mod_source_file = open(self.filename + ".mod", "w")
        self.indent_depth = self.id_indent_depth(self.source_code_lines)

    def _scan_line(self, line, open_delim=None):
        """
        Walks [string]line once while tracking string literals.

        open_delim is the triple-quote delimiter of a string left open
        by an earlier line, or None.

        Returns a (text, open_delim) pair: text is the line with any
        trailing # comment removed, open_delim is the triple-quote
        delimiter still open at the end of the line (None otherwise).
        """
        i = 0
        while i < len(line):
            if open_delim is None:
                if line[i] == "#":
                    # A hash outside of every string starts a comment.
                    return line[:i], None
                for delim in self._ALL_QUOTES:
                    if line.startswith(delim, i):
                        open_delim = delim
                        i += len(delim)
                        break
                else:
                    i += 1
            elif line[i] == "\\":
                i += 2                      # skip the escaped character
            elif line.startswith(open_delim, i):
                i += len(open_delim)
                open_delim = None
            else:
                i += 1
        # Only triple-quoted strings are carried over to the next line.
        if open_delim not in self._TRIPLE_QUOTES:
            open_delim = None
        return line, open_delim

    def clean_hash_comments(self, source_lines):
        """
        Removes # comments and empty lines from
        source_lines and returns the new array of
        strings.

        A # inside a string literal (including a triple-quoted
        string that spans several lines) is kept.
        """
        stripped_source = []
        open_delim = None
        for line in source_lines:
            text, open_delim = self._scan_line(line, open_delim)
            if text != "":
                stripped_source.append(text)
        return stripped_source

    def count_pre_space(self, line):
        """
        returns the number of prepending spaces
        in arg [string]line.
        """
        return len(line) - len(line.lstrip(" "))

    def id_indent_depth(self, source_file):
        """
        ID's the length of indentation
        that should be used to analyze
        the file: the leading space count of the first
        indented line that does not start inside a
        triple-quoted string.
        If nothing is indented, 0 is returned.
        """
        open_delim = None
        for line in source_file:
            started_in_string = open_delim is not None
            open_delim = self._scan_line(line, open_delim)[1]
            if started_in_string:
                continue                    # text of a multi-line string
            if line.startswith(" ") and not line.isspace():
                return self.count_pre_space(line)
        return 0

    def insert_tokens(self, source_lines):
        """
        writes out a copy of the original file while
        inserting INDENT and DEDENT tokens to a new
        file appended with '.mod'

        Also adds NEWLINE to "\\n" that occur outside
        of strings and tuples.

        Returns True on success, False when a line is indented
        more than one level deeper than the line before it.
        """
        open_delim = None       # triple-quote delimiter of an open string
        in_tuple = False
        logical_indent = 0
        write = self.mod_source_file.write

        for line in source_lines:
            if (line == "") or line.isspace():          #empty lines
                continue

            started_in_string = open_delim is not None
            if in_tuple or started_in_string:
                # Continuation of a multi-line tuple/string: its leading
                # white-space is not block structure, copy it verbatim.
                write(line)
            else:
                if self.indent_depth:
                    indent = self.count_pre_space(line) // self.indent_depth
                else:
                    indent = 0  # nothing in the file is indented
                if indent > logical_indent + 1:
                    #moved to a higher indent than one logical level
                    print ("ERROR: unexpected indent length on line {}.".format(line))
                    return False
                if indent > logical_indent:
                    write("INDENT " + line)
                elif indent < logical_indent:
                    write("DEDENT " * (logical_indent - indent) + line)
                else:
                    write(line)
                logical_indent = indent

            if not started_in_string:
                # Parentheses inside string text do not open a tuple.
                if ("(" in line) and (")" not in line):   #started multi-line tuple?
                    in_tuple = True
                elif in_tuple and (")" in line):          #ended tuple?
                    in_tuple = False

            open_delim = self._scan_line(line, open_delim)[1]

            if not in_tuple and open_delim is None:
                # Leading space keeps NEWLINE from fusing with the last word.
                write(" NEWLINE")
            write("\n")

        if logical_indent:
            # Close every block that is still open at the end of the input.
            write(" ".join(["DEDENT"] * logical_indent) + "\n")
        return True

    def main(self):
        """
        Wraps the parsing and token insertion
        methods, then closes the '.mod' output file.
        Returns the result of insert_tokens.
        """
        try:
            result = self.insert_tokens(self.source_code_lines)
        finally:
            self.mod_source_file.close()
        if not result:
            print("ERROR")
        return result

# Run the tokenizer only when executed as a script, so that importing
# this module does not read sys.argv or create a '.mod' file.
if __name__ == "__main__":
    scrub = Dedenter()
    scrub.main()

The second pass is done in a Racket lexer to properly tokenize the intermediate code (called on the file-name.py.mod):

#lang racket
#| python_lexer.rkt
   By Logan Davis

   A simple python lexer
   TODO: FINISH SPEC REQUIREMENTS

   9/20/16
 |#
(require parser-tools/lex)
(require parser-tools/lex-sre)

; Flag flipped to #t by the (eof) rule of calc-lexer; reader checks it
; before every lexer call to know when to stop.
(define end-of-file #f)

; A hash sign followed by everything up to (not including) the newline.
(define-lex-abbrev single-comment
  (seq "#" (* (char-complement #\newline))))
; A triple-quoted block. The body is "any string that does not contain
; the closing delimiter": with a plain any-char repetition the lexer's
; longest-match rule runs on to the LAST triple quote in the input and
; swallows all the code in between.
(define-lex-abbrev multi-comment
  (or (: "\"\"\"" (complement (: any-string "\"\"\"" any-string)) "\"\"\"")
      (: "'''" (complement (: any-string "'''" any-string)) "'''")))

; Operators and delimiters. "." is listed here so that attribute access
; (self.insert) lexes as ID PUNC ID instead of a bogus literal.
; NOTE(review): #\\ is the backslash character; "/" is not in this set -
; confirm which of the two was intended.
(define-lex-abbrev punct
  (or #\+ #\- #\* #\\ #\( #\) #\: #\. "**" "==" "<" ">" "!" "="))
; Numeric and string literals.
;   numbers: 12   3.14   .5   7.
;   strings: single- or double-quoted, holding spaces, ASCII letters and
;            backslash escapes only (other characters are not accepted).
; A "." must now touch at least one digit: the old rule also accepted
; "." followed by lower-case letters and so tokenized ".insert" as LIT.
(define-lex-abbrev literal
  (or (+ (char-range #\0 #\9))
      (: (* (char-range #\0 #\9)) "." (+ (char-range #\0 #\9)))
      (: (+ (char-range #\0 #\9)) ".")
      (: #\" (+ (or #\space (: #\\ (repetition 0 1 any-char)) (: #\\ #\newline) (char-range #\a #\z) (char-range #\A #\Z))) #\")
      (: #\' (+ (or #\space (: #\\ (repetition 0 1 any-char)) (: #\\ #\newline) (char-range #\a #\z) (char-range #\A #\Z))) #\')))

; An identifier: a letter or underscore, then any mix of letters,
; underscores and digits (digits are allowed everywhere but first, so
; "x1" is one ID rather than an ID followed by a literal).
(define-lex-abbrev id-word
  (: (or "_" (char-range #\a #\z) (char-range #\A #\Z))
     (* (or "_" (char-range #\a #\z) (char-range #\A #\Z) (char-range #\0 #\9)))))
; The reserved words recognized as KEYWORD tokens, in alphabetical order
; (capitalized constants first).
(define-lex-abbrev keyword?
  (or "False" "None" "True"
      "and" "as" "assert"
      "break"
      "class" "continue"
      "def" "del"
      "elif" "else" "except"
      "finally" "for" "from"
      "global"
      "if" "import" "in" "is"
      "lambda"
      "nonlocal" "not"
      "or"
      "pass"
      "raise" "return"
      "try"
      "while" "with"
      "yield"))

; A lexer to tokenize python files.
; Each call consumes one lexeme from the input port and prints its token
; (comments and white-space print nothing). When several rules match the
; same longest lexeme the earliest rule wins, which is why the marker
; words and keyword? come before id-word.
(define calc-lexer
  (lexer
   ["DEDENT" (printf "(DEDENT)\n")]
   ["INDENT" (printf "(INDENT)\n")]
   ["NEWLINE" (printf "(NEWLINE)\n")]
   [punct (printf "(PUNC ~a)\n" lexeme)]
   [literal (printf "(LIT ~a)\n" lexeme)]
   [(or single-comment multi-comment) (void)]
   [keyword? (printf "(KEYWORD ~a)\n" lexeme)]
   [id-word (printf "(ID ~a)\n" lexeme)]
   ; No "" alternative here: a rule that accepts the empty string lets the
   ; lexer succeed without consuming input, so the driver loop would spin
   ; forever on the first unrecognized character.
   [whitespace (void)]
   ; Record end of input for the driver loop, then report it.
   [(eof) (begin (set! end-of-file #t) (printf "(EOF)\n"))]))

; Input port opened on the file named by the first command-line argument.
(define test (open-input-file (vector-ref (current-command-line-arguments) 0)))

; Feeds the input port to calc-lexer one token at a time until the
; end-of-file flag is set, then returns the empty list.
(define (reader file)
  (cond
    [end-of-file '()]
    [else (and (calc-lexer file) (reader file))]))

; Tokenize the whole input file, printing one token per line.
(reader test)

Currently it doesn't entirely match the spec, nor does it return a cons of each token (Matt assumes this is being done in DrRacket, where it will just print returned values), but these alterations are additions of rules, not entirely new modules. I will finish it up while reading about parsing. I tested it on Matt's example on the site (since I don't have access to "vulcan," the server where their test code is actually held). I get the correct token sequence.

http://cs.marlboro.edu/courses/fall2016/jims_tutorials/ldavis/Sep_21
last modified Wednesday September 21 2016 11:56 am EDT

attachments

name last modified size

Jim's
Tutorials

course

navigation

Jim says

Python Parsing

attachments

Jim's Tutorials

course

navigation

Jim says

Python Parsing

attachments

Jim's
Tutorials