#!/usr/bin/env python

""" count the words in a file """

# Path of the text file whose words are counted.
filename = 'moby_dick.txt'

# How many of the most frequent words to report.
n_print = 100

# Characters that are stripped out of every word before it is counted.
punctuation = list(',."\';:!()*$')

# Maps each word to the number of times it has been seen; starts out empty.
count = dict()

# Open the file for reading and tally every word into `count`.
# The `with` statement closes the file when the loop finishes or fails.
with open(filename, 'r') as text:
    print("Counting words in '%s'..." % filename)
    (nlines, nwords) = (0, 0)
    for line in text:                        # Loop over lines until EOF.
        nlines += 1                          # Counts only lines actually read.
        for word in line.split():            # Split the line into words.
            nwords += 1                      # Every whitespace-separated token.
            word = word.lower()              # Convert to lowercase.
            for punc in punctuation:         # Remove punctuation.
                word = word.replace(punc, '')
            if word:                         # Skip tokens that were only punctuation.
                count[word] = count.get(word, 0) + 1    # Count word frequencies.
print("done.  Processed %s words in %s lines." % (nwords, nlines))

# Sort the results from highest to lowest frequency.
# The method used here is:
#   (a) walk the (word, freq) pairs of the count dictionary,
#   (b) swap each one into a (freq, word) pair so frequency is compared first,
#   (c) sort those pairs in descending order.
# Words are unique dictionary keys, so no two pairs compare equal and the
# result matches an ascending sort followed by a reverse.
# A new list is built rather than editing count.items() in place, so this
# also works where items() returns a view instead of a list (Python 3).
pairs = sorted(((freq, word) for (word, freq) in count.items()),
               reverse=True)       # e.g. [(freq1,word1), (freq2,word2), ...]

# Another way to do the sorting, as described in our text (chap 11),
# is to pass pairs.sort() our own function to do the comparison.
# A comparison function compare(alpha, beta) should
#   return -1 when alpha goes before beta,
#   return +1 when alpha goes after beta,
#   return 0 when they are equivalent
# By default list.sort() uses python's built-in cmp() to compare items,
# which behaves this way already.  It works on numbers, strings,
# and pretty much anything else.
#   >>> cmp( (1,'hi') , (3,'bye') )
#   -1                         # (1,'hi') comes before (3,'bye')
#   >>> cmp( [1,2,3], 17 )
#   1                          # [1,2,3] goes after 17
#
# So we can just define our own comparison function
# to put (word, freq) pairs into the order that we want.
# Defining and using it looks like this.
#
#    def our_compare((word1, freq1), (word2, freq2)):
#        if freq1 > freq2:               # bigger numbers
#            return -1                   #   go earlier in the list
#        elif freq1 < freq2:             # smaller numbers
#            return 1                    #   go later in the list
#        else:                           # equal numbers
#            return cmp(word1, word2)    #   are sorted by the words
#    pairs = count.items()
#    pairs.sort(our_compare)            # sort using our comparison
#

# Print out some results.
# `pairs` holds (freq, word) tuples, most frequent first.  The slice caps the
# loop at the number of distinct words available, so an input with fewer than
# n_print distinct words prints what it has instead of raising IndexError.
# max(..., 0) keeps a negative n_print printing nothing, as range() would.
print("Top %i words are " % n_print)
for (rank, (freq, word)) in enumerate(pairs[:max(n_print, 0)]):
    print(" %4i %6i : %-20s " % (rank + 1, freq, word))

# --- output -------------------------------------
#  I've marked a few of the words that look specific 
#  to this novel or archaic, namely
#  (whale, ahab, ye, ship, whales, sea, captain, thou).
"""

thirty:Desktop$ time ./count_words.py 
Counting words in 'moby_dick.txt'...
done.  Processed 208433 words in 22447 lines.
Top 100 words are 
    1  14065 : the                  
    2   6437 : of                   
    3   6257 : and                  
    4   4534 : a                    
    5   4489 : to                   
    6   4048 : in                   
    7   2881 : that                 
    8   2484 : his                  
    9   2318 : it                   
   10   1929 : i                    
   11   1762 : but                  
   12   1720 : he                   
   13   1698 : as                   
   14   1690 : with                 
   15   1672 : is                   
   16   1618 : was                  
   17   1568 : for                  
   18   1463 : all                  
   19   1339 : this                 
   20   1294 : at                   
   21   1169 : by                   
   22   1115 : not                  
   23   1071 : from                 
   24   1028 : on                   
   25   1023 : so                   
   26   1021 : be                   
   27   1018 : him                  
   28    874 : one                  
   29    871 : whale           *****
   30    846 : you                  
   31    764 : had                  
   32    751 : have                 
   33    745 : now                  
   34    738 : there                
   35    687 : or                   
   36    674 : were                 
   37    637 : they                 
   38    617 : which                
   39    604 : some                 
   40    603 : their                
   41    600 : then                 
   42    592 : me                   
   43    590 : when                 
   44    584 : my                   
   45    583 : are                  
   46    583 : an                   
   47    566 : like                 
   48    565 : no                   
   49    560 : upon                 
   50    530 : what                 
   51    516 : into                 
   52    508 : out                  
   53    496 : up                   
   54    495 : more                 
   55    468 : if                   
   56    457 : its                  
   57    447 : them                 
   58    435 : old                  
   59    426 : man                  
   60    425 : we                   
   61    421 : would                
   62    408 : been                 
   63    406 : ahab            *****     
   64    397 : over                 
   65    394 : ye              *****     
   66    392 : other                
   67    386 : these                
   68    376 : will                 
   69    371 : ship            *****      
   70    369 : only                 
   71    364 : such                 
   72    362 : whales          *****     
   73    362 : though               
   74    359 : sea             *****     
   75    357 : down                 
   76    338 : yet                  
   77    329 : who                  
   78    321 : time                 
   79    319 : her                  
   80    318 : any                  
   81    317 : very                 
   82    313 : long                 
   83    306 : still                
   84    302 : those                
   85    300 : than                 
   86    300 : about                
   87    294 : do                   
   88    292 : captain        *****      
   89    289 : before               
   90    288 : great                
   91    287 : has                  
   92    286 : said                 
   93    280 : seemed               
   94    279 : must                 
   95    278 : two                  
   96    276 : here                 
   97    273 : most                 
   98    272 : last                 
   99    259 : thou           *****      
  100    259 : head                 

real    0m5.548s
user    0m5.112s
sys     0m0.113s
thirty:Desktop$ 


"""