#!/usr/bin/env python
"""
 count_words.py

 Count word frequencies in moby_dick.txt .

 This illustrates
   (a) reading words from a file, including ignoring punctuation,
   (b) using a python dictionary for counting things, and
   (c) a bit of somewhat tricky sorting.

 The output from a sample run of the program is given
 at the end of this file.

 The moby_dick.txt file has been extracted from the one
 at project gutenberg; just google "gutenberg moby dick" to find it.

 Jim M | Nov 2012 | MIT License
"""


def get_word_count(filename="moby_dick.txt", verbose=True):
    """ Return dictionary of {word1:count1, word2:count2, ...}.

        Words are the whitespace-separated tokens of each line, lowercased,
        with the punctuation characters listed in `ignore` removed.
        Tokens that consist only of such punctuation are not stored in
        the dictionary.

        filename -- path of the text file to read.
        verbose  -- if true, also print a summary of total words and lines.
    """
    count = {}
    ignore = (',', '.', '"', "'", ';', ':', '!', '(', ')', '*', '$')
    if verbose:
        print("Counting words in '%s'..." % filename)
    (nlines, nwords) = (0, 0)
    # The 'with' block closes the file even if reading raises.
    with open(filename, 'r') as text_file:
        for line in text_file:              # Loop over lines :
            nlines += 1
            for word in line.split():       # Loop over words :
                # Every whitespace-separated token counts toward the
                # printed total, as in the sample run below.
                nwords += 1
                word = word.lower()         # Convert to lowercase
                for char in ignore:         # Remove punctuation
                    word = word.replace(char, '')
                if not word:
                    # The token was nothing but punctuation (e.g. "*");
                    # don't record the empty string as a word.
                    continue
                count[word] = count.get(word, 0) + 1   # Count frequencies
    if verbose:
        print("done. Processed %s words in %s lines." % (nwords, nlines))
    return count


def sort_by_count(wordcount_dict):
    """ Given a dictionary {word1:count1, word2:count2, ...},
        return a list of (count, word) pairs sorted hi to low by count, e.g.
        [(10000,'the'), (5000,'a'), (200,'him'),...]
        Pairs with equal counts are ordered by word, z to a.
    """
    # The idea here is to
    #   (a) swap the pairs to put the numbers in front, then
    #   (b) sort the pairs, using those numbers, biggest first.
    # Each word occurs once as a key, so no two pairs are equal and a
    # descending sort gives the same order as an ascending sort that is
    # then reversed.
    swapped = [(freq, word) for (word, freq) in wordcount_dict.items()]
    return sorted(swapped, reverse=True)    # e.g. [(14065, 'the'), ...]


def print_some_pairs(pairs, n=100):
    """ Print the first n (int,string) pairs from [(count, word),...].
        If there are fewer than n pairs, print all of them.
    """
    shown = pairs[:max(n, 0)]       # never index past the end of pairs
    print("Top {} words are".format(len(shown)))
    for (rank, (freq, word)) in enumerate(shown, 1):
        print(" {:4} {:6} : {:<20} ".format(rank, freq, word))


def main():
    wordcount = get_word_count()
    pairs_hi_to_low = sort_by_count(wordcount)
    print_some_pairs(pairs_hi_to_low)


if __name__ == "__main__":
    main()


"""
=== output ============================================================

I've marked manually with ***** some of the words that look archaic
or specific to this novel, namely
(whale, ahab, ye, ship, whales, sea, captain, thou).

$ time ./count_words.py
Counting words in 'moby_dick.txt'...
done. Processed 208433 words in 22447 lines.
Top 100 words are
    1  14065 : the
    2   6437 : of
    3   6257 : and
    4   4534 : a
    5   4489 : to
    6   4048 : in
    7   2881 : that
    8   2484 : his
    9   2318 : it
   10   1929 : i
   11   1762 : but
   12   1720 : he
   13   1698 : as
   14   1690 : with
   15   1672 : is
   16   1618 : was
   17   1568 : for
   18   1463 : all
   19   1339 : this
   20   1294 : at
   21   1169 : by
   22   1115 : not
   23   1071 : from
   24   1028 : on
   25   1023 : so
   26   1021 : be
   27   1018 : him
   28    874 : one
   29    871 : whale                 *****
   30    846 : you
   31    764 : had
   32    751 : have
   33    745 : now
   34    738 : there
   35    687 : or
   36    674 : were
   37    637 : they
   38    617 : which
   39    604 : some
   40    603 : their
   41    600 : then
   42    592 : me
   43    590 : when
   44    584 : my
   45    583 : are
   46    583 : an
   47    566 : like
   48    565 : no
   49    560 : upon
   50    530 : what
   51    516 : into
   52    508 : out
   53    496 : up
   54    495 : more
   55    468 : if
   56    457 : its
   57    447 : them
   58    435 : old
   59    426 : man
   60    425 : we
   61    421 : would
   62    408 : been
   63    406 : ahab                  *****
   64    397 : over
   65    394 : ye                    *****
   66    392 : other
   67    386 : these
   68    376 : will
   69    371 : ship                  *****
   70    369 : only
   71    364 : such
   72    362 : whales                *****
   73    362 : though
   74    359 : sea                   *****
   75    357 : down
   76    338 : yet
   77    329 : who
   78    321 : time
   79    319 : her
   80    318 : any
   81    317 : very
   82    313 : long
   83    306 : still
   84    302 : those
   85    300 : than
   86    300 : about
   87    294 : do
   88    292 : captain               *****
   89    289 : before
   90    288 : great
   91    287 : has
   92    286 : said
   93    280 : seemed
   94    279 : must
   95    278 : two
   96    276 : here
   97    273 : most
   98    272 : last
   99    259 : thou                  *****
  100    259 : head

real    0m5.548s
user    0m5.112s
sys     0m0.113s
thirty:Desktop$
"""