#!/usr/bin/env python
"""
 count_words.py 

 Count the words in a file, namely moby_dick.txt .

 This illustrates the use of
  (a) a python dictionary, and
  (b) customized sorting of a dictionary.

 The output from a sample run of the program is given at the end of this file.

 The moby_dick.txt file has been extracted from the one at project gutenberg;
 just google "gutenberg moby dick" to find it.
 
 Jim M | Nov 2012 | intro programming class
"""

# The name of the file
filename = 'moby_dick.txt'

# Number of words to display
n_print = 100

# punctuation characters to ignore
punctuation = [',', '.', '"', "'", ';', ':', '!', '(', ')', '*', '$']


def count_words(lines, ignore=None):
    """Tally word frequencies over an iterable of text lines.

    Each line is split on whitespace; every token is lowercased and has
    the characters in `ignore` removed before being counted.

    Arguments:
      lines  -- any iterable of strings (an open file works).
      ignore -- characters to strip from each token; defaults to the
                module-level `punctuation` list.

    Returns a tuple (count, nlines, nwords) where
      count  -- dictionary mapping word -> number of occurrences,
      nlines -- number of lines read,
      nwords -- number of whitespace-separated tokens seen.
    """
    if ignore is None:
        ignore = punctuation
    count = {}                        # (initially empty) word frequencies
    (nlines, nwords) = (0, 0)
    for line in lines:                # Loop over lines :
        # Only lines actually read are counted; the older readline() loop
        # also counted the final end-of-file read, reporting one too many.
        nlines += 1
        for word in line.split():            # Split this line into words.
            nwords += 1
            word = word.lower()                # Convert to lowercase
            for punc in ignore:                # Remove punctuation
                word = word.replace(punc, '')
            # A token made only of punctuation (e.g. a lone '*') is now
            # empty; don't record the empty string as a "word".
            if word:
                count[word] = count.get(word, 0) + 1
    return (count, nlines, nwords)


def top_words(count, n):
    """Return up to n (freq, word) pairs, highest frequency first.

    The method used here is :
      (a) extract the (word, freq) pairs from the count dictionary,
      (b) swap each one to a (freq, word) pair,
      (c) sort the list in descending order,
      (d) keep the first n entries.
    Words with equal frequency come out in reverse alphabetical order,
    since whole (freq, word) tuples are compared.

    If the dictionary holds fewer than n words, all of them are returned.
    """
    pairs = [(freq, word) for (word, freq) in count.items()]
    pairs.sort(reverse=True)
    return pairs[:n]


def main():
    """Count the words in `filename` and print the `n_print` most common."""
    # The 'with' statement closes the file even if counting fails.
    with open(filename, 'r') as infile:
        print("Counting words in '%s'..." % filename)
        (count, nlines, nwords) = count_words(infile)
    print("done.  Processed %s words in %s lines." % (nwords, nlines))

    # Print out the top results.  The slice inside top_words() means a
    # short text with fewer than n_print distinct words no longer fails.
    top = top_words(count, n_print)
    print("Top %i words are " % len(top))
    for (i, (freq, word)) in enumerate(top):
        print(" %4i %6i : %-20s " % (i + 1, freq, word))


if __name__ == '__main__':
    main()

"""
--- output -------------------------------------
  I've marked a few of the words that look specific 
  to this novel or archaic, namely
  (whale, ahab, ye, ship, whales, sea, captain, thou).

thirty:Desktop$ time ./count_words.py 
Counting words in 'moby_dick.txt'...
done.  Processed 208433 words in 22447 lines.
Top 100 words are 
    1  14065 : the                  
    2   6437 : of                   
    3   6257 : and                  
    4   4534 : a                    
    5   4489 : to                   
    6   4048 : in                   
    7   2881 : that                 
    8   2484 : his                  
    9   2318 : it                   
   10   1929 : i                    
   11   1762 : but                  
   12   1720 : he                   
   13   1698 : as                   
   14   1690 : with                 
   15   1672 : is                   
   16   1618 : was                  
   17   1568 : for                  
   18   1463 : all                  
   19   1339 : this                 
   20   1294 : at                   
   21   1169 : by                   
   22   1115 : not                  
   23   1071 : from                 
   24   1028 : on                   
   25   1023 : so                   
   26   1021 : be                   
   27   1018 : him                  
   28    874 : one                  
   29    871 : whale           *****
   30    846 : you                  
   31    764 : had                  
   32    751 : have                 
   33    745 : now                  
   34    738 : there                
   35    687 : or                   
   36    674 : were                 
   37    637 : they                 
   38    617 : which                
   39    604 : some                 
   40    603 : their                
   41    600 : then                 
   42    592 : me                   
   43    590 : when                 
   44    584 : my                   
   45    583 : are                  
   46    583 : an                   
   47    566 : like                 
   48    565 : no                   
   49    560 : upon                 
   50    530 : what                 
   51    516 : into                 
   52    508 : out                  
   53    496 : up                   
   54    495 : more                 
   55    468 : if                   
   56    457 : its                  
   57    447 : them                 
   58    435 : old                  
   59    426 : man                  
   60    425 : we                   
   61    421 : would                
   62    408 : been                 
   63    406 : ahab            *****     
   64    397 : over                 
   65    394 : ye              *****     
   66    392 : other                
   67    386 : these                
   68    376 : will                 
   69    371 : ship            *****      
   70    369 : only                 
   71    364 : such                 
   72    362 : whales          *****     
   73    362 : though               
   74    359 : sea             *****     
   75    357 : down                 
   76    338 : yet                  
   77    329 : who                  
   78    321 : time                 
   79    319 : her                  
   80    318 : any                  
   81    317 : very                 
   82    313 : long                 
   83    306 : still                
   84    302 : those                
   85    300 : than                 
   86    300 : about                
   87    294 : do                   
   88    292 : captain        *****      
   89    289 : before               
   90    288 : great                
   91    287 : has                  
   92    286 : said                 
   93    280 : seemed               
   94    279 : must                 
   95    278 : two                  
   96    276 : here                 
   97    273 : most                 
   98    272 : last                 
   99    259 : thou           *****      
  100    259 : head                 

real    0m5.548s
user    0m5.112s
sys     0m0.113s
thirty:Desktop$ 

"""