"""
 jim.py

 See https://github.com/crista/exercises-in-programming-style

 This is my first effort, in what I'd say is my default style:

   * short testable stand-alone functions with doctests
   * somewhat functional where that feels straightforward to me
   * some use of what I think of as "standard" python libraries
   * ... but often just coding it myself instead of total code golf
   * a "main" function to pull it all together

 The task
 --------

 Given a text file, display its 25 most frequent words and the number
 of times each occurs. Convert all words to lowercase, ignore
 punctuation, and don't count common "stop words", which are specified
 in stop_words.txt.

 For example, if the file is named testinput.txt and contains

   White tigers live mostly in India
   Wild lions live mostly in Africa

 then

   $ python jim.py testinput.txt

 should produce

   live - 2
   mostly - 2
   africa - 1
   india - 1
   lions - 1
   tigers - 1
   white - 1
   wild - 1

   $ python --version
   python 3.5.2 :: Anaconda custom (x86_64)

 Running it
 ----------

   $ time python jim.py pride-and-prejudice.txt
   mr - 786
   elizabeth - 635
   very - 488
   darcy - 418
   such - 395
   mrs - 343
   much - 329
   more - 327
   bennet - 323
   bingley - 306
   jane - 295
   miss - 283
   one - 275
   know - 239
   before - 229
   herself - 227
   though - 226
   well - 224
   never - 220
   sister - 218
   soon - 216
   think - 211
   now - 209
   time - 203
   good - 201

   real 0m0.208s    # on a 2014 imac retina
   user 0m0.183s
   sys  0m0.020s

 Jim Mahoney | Apr 7 2018
"""
import re
import sys


def get_text_as_lowercase(filename):
    """ Return text from a file, converted to lowercase.

        filename : path of the text file to read

        Raises OSError (e.g. FileNotFoundError) if the file can't be opened.

        >>> with open('test.txt', 'w') as f:
        ...     n = f.write('Three, two three one three Two ah.')
        >>> get_text_as_lowercase('test.txt')
        'three, two three one three two ah.'
    """
    # The "with" block closes the file handle even if the read fails.
    with open(filename, 'r') as infile:
        return infile.read().lower()


def get_words(text):
    """ Return alphabetized list of words in a text string.

        text : string which is expected to already be lowercase

        >>> get_words('three, two three one three two ah.')
        ['ah', 'one', 'three', 'three', 'three', 'two', 'two']
    """
    # The definition of a "word" here is lowercase consecutive letters.
    # Every other character acts as a separator, so contractions like
    # "don't" come out as two words, "don" and "t". Matching runs of
    # letters directly never produces empty strings.
    return sorted(re.findall(r'[a-z]+', text))


def remove_stopwords(words, stopwords_file='stop_words.txt'):
    """ Return list of words with stopwords removed.

        words          : iterable of lowercase words
        stopwords_file : path of the file listing the stop words;
                         any non-letter character separates entries

        Raises OSError (e.g. FileNotFoundError) if stopwords_file
        can't be opened.

        >>> remove_stopwords(['fred', 'says', 'much', 'too', 'too', 'much'])
        ['fred', 'much', 'much']
    """
    with open(stopwords_file) as infile:
        # Lowercase first: the words being filtered are lowercase, and
        # splitting on [^a-z] would otherwise chop capitalized entries
        # into pieces (e.g. "The" would turn into the stop word "he").
        stop_text = infile.read().lower()
    # The split may also yield an empty string; that is harmless in a set.
    stop_words = set(re.split(r'[^a-z]', stop_text))
    # Also ignore contraction endings, e.g. the "t" left over from "don't".
    stop_words.update(('t', 's', 'd'))
    return [word for word in words if word not in stop_words]


def word_counts(words):
    """ Return a dictionary of word counts, i.e. {word: occurrences}.

        >>> d = word_counts(['one', 'three', 'three', 'three', 'two', 'two', 'ah'])
        >>> [(w, d[w]) for w in sorted(d.keys())]
        [('ah', 1), ('one', 1), ('three', 3), ('two', 2)]
    """
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return counts


def words_by_count(counts):
    """ Return list of words ordered by (a) frequency (b) alphabet.

        counts : dictionary of {word: occurrences}

        >>> words_by_count( {'one':1, 'three':3, 'two':2, 'ah':1} )
        ['three', 'two', 'ah', 'one']
    """
    # Sort by highest frequency first: negating the count puts the
    # largest counts at the front, with ties falling back to the word.
    return sorted(counts, key=lambda word: (-counts[word], word))


def get_filename(default_filename='test.txt'):
    """ Return filename given in command line invocation or a default. """
    # sys.argv would be ['jim.py', 'foo.txt'] for "python jim.py foo.txt"
    return sys.argv[1] if len(sys.argv) > 1 else default_filename


def main():
    """ Print 25 most frequent words in the file specified on command line """
    text = get_text_as_lowercase(get_filename())
    counts = word_counts(remove_stopwords(get_words(text)))
    for word in words_by_count(counts)[:25]:
        # str.format rather than an f-string: the docstring above
        # records this running under python 3.5.
        print("{} - {}".format(word, counts[word]))


if __name__ == '__main__':
    import doctest
    # Run the doctests first: one of them writes test.txt, which is
    # the default input file when no filename is given.
    doctest.testmod()
    main()