#!/usr/bin/env python
"""
 count_words.py 

 Count word frequencies in moby_dick.txt .

 This illustrates
  (a) reading words from a file, including ignoring punctuation,
  (b) using a python dictionary for counting things, and
  (c) a bit of somewhat tricky sorting.

 The output from a sample run of the program is given at the end of this file.

 The moby_dick.txt file has been extracted from the one at Project Gutenberg;
 just search for "gutenberg moby dick" to find it.
 
 Jim M | Nov 2012 | MIT License
"""

def get_word_count(filename = "moby_dick.txt", verbose = True):
    """ Return dictionary of {word1:count1, word2:count2, ...}.

        Words are the whitespace-separated tokens of the file, converted
        to lowercase, with the punctuation characters listed in `ignore`
        removed.  A token made up only of those characters (e.g. "*")
        leaves nothing behind and is not stored as a word, although it is
        still included in the printed token total.

        filename : path of the text file to read
        verbose  : if true, also print a summary of total words and lines

        Raises IOError (OSError) if the file cannot be opened. """
    count = {}
    ignore = ( ',', '.', '"', "'", ';', ':', '!', '(', ')', '*', '$' )
    if verbose:
        # Single-argument print(...) gives the same output whether print
        # is a statement or a function.
        print("Counting words in '%s'..." % filename)
    (nlines, nwords) = (0, 0)
    # 'with' guarantees the file is closed, even if reading fails part way.
    with open(filename, 'r') as textfile:
        for line in textfile:                   # Loop over lines :
            nlines += 1
            for word in line.split():           #   Loop over words :
                nwords += 1
                word = word.lower()             #     Convert to lowercase
                for char in ignore:             #     Remove punctuation
                    word = word.replace(char, '')
                if word:                        #     Skip punctuation-only tokens
                    count[word] = count.get(word, 0) + 1
    if verbose:
        print("done.  Processed %s words in %s lines." % (nwords, nlines))
    return count

def sort_by_count(wordcount_dict):
    """ Given a dictionary {word1:count1, word2:count2, ...},
        return a new list of (count, word) pairs sorted hi to low by count,
        e.g. [(10000,'the'), (5000,'a'), (200,'him'),...]

        Words with equal counts come out in reverse alphabetical order,
        because the pairs are compared as whole (count, word) tuples.
        The input dictionary is not modified. """
    # The idea here is to
    #  (a) swap the pairs to put the numbers in front, then
    #  (b) sort the pairs, biggest first, using those numbers.
    # Building a fresh list with sorted() avoids indexing into the result
    # of .items(), which is not a list in every Python version.
    swapped = ((freq, word) for (word, freq) in wordcount_dict.items())
    return sorted(swapped, reverse=True)   # e.g. [(14065, 'the'), (6437, 'of'), ...]

def print_some_pairs(pairs, n=100):
    """ Print some of the (int,string) pairs from [(count, word),...]

        pairs : list of (count, word) tuples, typically already sorted
        n     : maximum number of pairs to print, starting from the front

        A header line naming n is printed first, then one numbered row
        per pair.  If there are fewer than n pairs, all of them are
        printed rather than running off the end of the list. """
    print("Top {} words are".format(n))
    # Slice instead of indexing 0..n-1 so that a short list is not an error;
    # max() keeps a negative n from meaning "all but the last few".
    for (i, (freq, word)) in enumerate(pairs[:max(n, 0)]):
        print(" {:4} {:6} : {:<20} ".format(i+1, freq, word))

def main():
    """ Count the words in the default text file and print the
        most frequent ones, highest count first. """
    print_some_pairs(sort_by_count(get_word_count()))

# Run the word count only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

"""
 === output ============================================================

  I've marked manually with ***** some of the words
  that look archaic or specific to this novel, namely
  (whale, ahab, ye, ship, whales, sea, captain, thou).

$ time ./count_words.py 
Counting words in 'moby_dick.txt'...
done.  Processed 208433 words in 22447 lines.
Top 100 words are 
    1  14065 : the                  
    2   6437 : of                   
    3   6257 : and                  
    4   4534 : a                    
    5   4489 : to                   
    6   4048 : in                   
    7   2881 : that                 
    8   2484 : his                  
    9   2318 : it                   
   10   1929 : i                    
   11   1762 : but                  
   12   1720 : he                   
   13   1698 : as                   
   14   1690 : with                 
   15   1672 : is                   
   16   1618 : was                  
   17   1568 : for                  
   18   1463 : all                  
   19   1339 : this                 
   20   1294 : at                   
   21   1169 : by                   
   22   1115 : not                  
   23   1071 : from                 
   24   1028 : on                   
   25   1023 : so                   
   26   1021 : be                   
   27   1018 : him                  
   28    874 : one                  
   29    871 : whale           *****
   30    846 : you                  
   31    764 : had                  
   32    751 : have                 
   33    745 : now                  
   34    738 : there                
   35    687 : or                   
   36    674 : were                 
   37    637 : they                 
   38    617 : which                
   39    604 : some                 
   40    603 : their                
   41    600 : then                 
   42    592 : me                   
   43    590 : when                 
   44    584 : my                   
   45    583 : are                  
   46    583 : an                   
   47    566 : like                 
   48    565 : no                   
   49    560 : upon                 
   50    530 : what                 
   51    516 : into                 
   52    508 : out                  
   53    496 : up                   
   54    495 : more                 
   55    468 : if                   
   56    457 : its                  
   57    447 : them                 
   58    435 : old                  
   59    426 : man                  
   60    425 : we                   
   61    421 : would                
   62    408 : been                 
   63    406 : ahab            *****     
   64    397 : over                 
   65    394 : ye              *****     
   66    392 : other                
   67    386 : these                
   68    376 : will                 
   69    371 : ship            *****      
   70    369 : only                 
   71    364 : such                 
   72    362 : whales          *****     
   73    362 : though               
   74    359 : sea             *****     
   75    357 : down                 
   76    338 : yet                  
   77    329 : who                  
   78    321 : time                 
   79    319 : her                  
   80    318 : any                  
   81    317 : very                 
   82    313 : long                 
   83    306 : still                
   84    302 : those                
   85    300 : than                 
   86    300 : about                
   87    294 : do                   
   88    292 : captain        *****      
   89    289 : before               
   90    288 : great                
   91    287 : has                  
   92    286 : said                 
   93    280 : seemed               
   94    279 : must                 
   95    278 : two                  
   96    276 : here                 
   97    273 : most                 
   98    272 : last                 
   99    259 : thou           *****      
  100    259 : head                 

real    0m5.548s
user    0m5.112s
sys     0m0.113s
thirty:Desktop$ 


"""