# jim.py

"""
 jim.py

 See https://github.com/crista/exercises-in-programming-style

 This is my first effort, in what I'd say is my default style:
   * short testable stand-alone functions with doctests
   * somewhat functional where that feels straightforward to me
   * some use of what I think of as "standard" python libraries 
   * ... but often just coding it myself instead of total code golf
   * a "main" function to pull it all together

 The task
 --------

 Given a text file, display its 25 most frequent words and the number of times
 each occurs. Convert all words to lowercase, ignore punctuation, and skip
 common "stop words", which are listed in stop_words.txt.

 For example, if the file is named testinput.txt and contains
   White tigers live mostly in India 
   Wild lions live mostly in Africa
 then 
   $ python jim.py testinput.txt
 should produce
   live - 2 
   mostly - 2 
   africa - 1 
   india - 1 
   lions - 1 
   tigers - 1 
   white - 1 
   wild - 1

 $ python --version
 python 3.5.2 :: Anaconda custom (x86_64)

 Running it
 ----------

 $ time python jim.py pride-and-prejudice.txt
 mr - 786
 elizabeth - 635
 very - 488
 darcy - 418
 such - 395
 mrs - 343
 much - 329
 more - 327
 bennet - 323
 bingley - 306
 jane - 295
 miss - 283
 one - 275
 know - 239
 before - 229
 herself - 227
 though - 226
 well - 224
 never - 220
 sister - 218
 soon - 216
 think - 211
 now - 209
 time - 203
 good - 201

 real	0m0.208s  # on a 2014 imac retina
 user	0m0.183s
 sys	0m0.020s

Jim Mahoney | Apr 7 2018
"""
import re, sys

def get_text_as_lowercase(filename):
    """ Return text from a file, converted to lowercase.

        filename -- path of the text file to read; it is opened in text
                    mode with the platform's default encoding.

        The whole file is read into memory, and the file is closed before
        this function returns, even if reading raises an exception.

        >>> n = open('test.txt', 'w').write('Three, two three one three Two ah.')
        >>> get_text_as_lowercase('test.txt')
        'three, two three one three two ah.'
    """
    # The context manager closes the handle deterministically instead of
    # leaving that to the garbage collector.
    with open(filename, 'r') as infile:
        return infile.read().lower()

def get_words(text):
    """ Return the words found in a text string as a sorted list.
        Repeated words are kept, so the list can be counted afterwards.
        >>> get_words('three, two three one three two ah.')
        ['ah', 'one', 'three', 'three', 'three', 'two', 'two']
    """
    # A "word" is a run of consecutive lowercase letters a-z. Every other
    # character is a separator; splitting on each one leaves empty strings
    # between adjacent separators, which are dropped here.
    pieces = [piece for piece in re.split(r'[^a-z]', text) if piece]
    pieces.sort()
    return pieces

def remove_stopwords(words, stopwords_filename='stop_words.txt'):
    """ Return list of words with stopwords removed.

        words              -- iterable of lowercase word strings.
        stopwords_filename -- path of the file listing the stop words;
                              defaults to 'stop_words.txt' in the current
                              directory.

        The stop-word file may use any non-letter characters (commas,
        whitespace, newlines, ...) as separators. The single letters
        't', 's' and 'd' are always treated as stop words too, because
        they are what is left of contractions such as "don't" once the
        text has been split on punctuation. The order of the surviving
        words is preserved.

        >>> remove_stopwords(['fred', 'says', 'much', 'too', 'too', 'much'])
        ['fred', 'much', 'much']
    """
    with open(stopwords_filename) as stop_file:
        # Lowercase first: words are lowercase by construction, and an
        # uppercase letter would otherwise act as a separator in the
        # split below, breaking a capitalised entry into fragments.
        stop_text = stop_file.read().lower()
    # A set gives constant-time membership tests. The split also yields
    # '' between adjacent separators, so empty strings are filtered out.
    stop_words = set(re.split(r'[^a-z]', stop_text))
    stop_words.update(('t', 's', 'd'))  # also ignore contraction endings
    return [word for word in words if word not in stop_words]

def word_counts(words):
    """ Return a dictionary mapping each word to how many times it occurs.
        >>> d = word_counts(['one', 'three', 'three', 'three', 'two', 'two', 'ah'])
        >>> [(w, d[w]) for w in sorted(d.keys())]
        [('ah', 1), ('one', 1), ('three', 3), ('two', 2)]
    """
    tally = {}
    for entry in words:
        # A word not seen before starts from 0, so its first sighting is 1.
        tally[entry] = tally.get(entry, 0) + 1
    return tally

def words_by_count(counts):
    """ Return the words of a {word: count} dictionary as a list, ordered
        by (a) highest frequency first, then (b) alphabetically.
        >>> words_by_count( {'one':1, 'three':3, 'two':2, 'ah':1} )
        ['three', 'two', 'ah', 'one']
    """
    def rank(word):
        # Negating the count makes an ascending sort put the most frequent
        # words first; the word itself breaks ties alphabetically.
        return (- counts[word], word)

    ordered = list(counts.keys())
    ordered.sort(key=rank)
    return ordered

def get_filename(default_filename='test.txt'):
    """ Return the first command line argument if one was given,
        otherwise default_filename.
    """
    # sys.argv[0] is the script name, e.g. ['jim.py', 'foo.txt'] for
    # "python jim.py foo.txt", so the user's arguments start at index 1.
    user_args = sys.argv[1:]
    return user_args[0] if user_args else default_filename

def main():
    """ Print the 25 most frequent words of the file named on the command
        line (or of the default file), one "word - count" pair per line.
    """
    # Pipeline: file -> lowercase text -> words -> non-stop words -> counts.
    text = get_text_as_lowercase(get_filename())
    kept_words = remove_stopwords(get_words(text))
    counts = word_counts(kept_words)
    # The slice is safe even when there are fewer than 25 distinct words.
    top_words = words_by_count(counts)[:25]
    for word in top_words:
        print("{} - {}".format(word, counts[word]))

if __name__ == '__main__':
    # Run the doctests before the real work. The doctest for
    # get_text_as_lowercase writes 'test.txt', which is also the default
    # input of get_filename, so the file exists when no argument is given.
    from doctest import testmod
    testmod()
    main()