#!/usr/bin/env python
"""
Count the words in a file and print the most frequent ones.

Runs unchanged on Python 2 and Python 3: every print call passes a single
pre-formatted string, so the parentheses are harmless under Python 2.
"""

# The name of the file
filename = 'moby_dick.txt'

# Number of words to display
n_print = 100

# punctuation characters to ignore
punctuation = [',', '.', '"', "'", ';', ':', '!', '(', ')', '*', '$']


def count_words(lines, ignore=None):
    """Tally word frequencies over an iterable of text lines.

    Each line is split on whitespace; every token is lowercased and has the
    characters in `ignore` removed before it is counted.

    Args:
        lines: iterable of strings (an open text file works).
        ignore: characters to strip from each token; defaults to the
            module-level `punctuation` list.

    Returns:
        A (count, nwords, nlines) tuple: `count` maps word -> frequency,
        `nwords` is the number of whitespace-separated tokens seen, and
        `nlines` is the number of lines read.
    """
    if ignore is None:
        ignore = punctuation
    count = {}
    nwords = 0
    nlines = 0
    # Iterating the lines directly (instead of readline() until '') means the
    # end-of-file read is no longer counted as an extra line.
    for line in lines:
        nlines += 1
        for token in line.split():
            nwords += 1
            word = token.lower()
            for punc in ignore:
                word = word.replace(punc, '')
            # A token made only of punctuation (e.g. '*') strips down to '',
            # which is not a word and must not show up in the table.
            if word:
                count[word] = count.get(word, 0) + 1
    return count, nwords, nlines


def rank_words(count):
    """Return (freq, word) pairs sorted from highest to lowest frequency.

    The (word, freq) items of `count` are swapped to (freq, word) so that
    plain tuple ordering sorts by frequency first. Words with equal
    frequency come out in descending word order.
    """
    return sorted(((freq, word) for word, freq in count.items()),
                  reverse=True)


def format_top(pairs, n):
    """Format the first `n` (freq, word) pairs as numbered report lines.

    Slicing keeps this safe when there are fewer than `n` pairs.
    """
    return [" %4i %6i : %-20s " % (rank, freq, word)
            for rank, (freq, word) in enumerate(pairs[:n], 1)]


def main():
    """Count the words in `filename` and print the `n_print` most common."""
    # The with-block closes the file even if counting raises.
    with open(filename, 'r') as infile:
        print("Counting words in '%s'..." % filename)
        count, nwords, nlines = count_words(infile)
    print("done. Processed %s words in %s lines." % (nwords, nlines))

    pairs = rank_words(count)

    # Print out some results.
    print("Top %i words are " % n_print)
    for report_line in format_top(pairs, n_print):
        print(report_line)


# Another way to do the sorting, as described in our text (chap 11),
# is to pass list.sort() our own function to do the comparison.
# (Python 2 only: cmp() and comparison-function sorting were removed in
# Python 3, where functools.cmp_to_key or a key= function is used instead.)
# A comparison function compare(alpha, beta) should
#   return -1 when alpha goes before beta,
#   return +1 when alpha goes after beta,
#   return  0 when they are equivalent
# By default list.sort() uses python's built-in cmp() to compare items,
# which behaves this way already.  It works on numbers, strings,
# and pretty much anything else.
#   >>> cmp( (1,'hi') , (3,'bye') )
#   -1                                 # (1,'hi') comes before (3,'bye')
#   >>> cmp( [1,2,3], 17 )
#   1                                  # [1,2,3] goes after 17
#
# So we can just define our own comparison function
# to put (word, freq) pairs into the order that we want.
# Defining and using it looks like this.
#
#   def our_compare((word1, freq1), (word2, freq2)):
#       if freq1 > freq2 :                 # bigger numbers
#           return -1                      #   go earlier in the list
#       elif freq1 < freq2 :               # smaller numbers
#           return 1                       #   go later in the list
#       else:                              # equal numbers
#           return cmp(word1, word2)       #   are sorted by the words
#   pairs = count.items()
#   pairs.sort(our_compare)                # sort using our comparison
#

if __name__ == '__main__':
    main()


# --- output -------------------------------------
# I've marked a few of the words that look specific
# to this novel or archaic, namely
# (whale, ahab, ye, ship, whales, sea, captain, thou).
#
# This run was recorded with the original version of the script, which also
# counted the end-of-file read as a line, so the line count shown is one
# higher than the current code reports.
"""
thirty:Desktop$ time ./count_words.py
Counting words in 'moby_dick.txt'...
done. Processed 208433 words in 22447 lines.
Top 100 words are
    1  14065 : the
    2   6437 : of
    3   6257 : and
    4   4534 : a
    5   4489 : to
    6   4048 : in
    7   2881 : that
    8   2484 : his
    9   2318 : it
   10   1929 : i
   11   1762 : but
   12   1720 : he
   13   1698 : as
   14   1690 : with
   15   1672 : is
   16   1618 : was
   17   1568 : for
   18   1463 : all
   19   1339 : this
   20   1294 : at
   21   1169 : by
   22   1115 : not
   23   1071 : from
   24   1028 : on
   25   1023 : so
   26   1021 : be
   27   1018 : him
   28    874 : one
   29    871 : whale                *****
   30    846 : you
   31    764 : had
   32    751 : have
   33    745 : now
   34    738 : there
   35    687 : or
   36    674 : were
   37    637 : they
   38    617 : which
   39    604 : some
   40    603 : their
   41    600 : then
   42    592 : me
   43    590 : when
   44    584 : my
   45    583 : are
   46    583 : an
   47    566 : like
   48    565 : no
   49    560 : upon
   50    530 : what
   51    516 : into
   52    508 : out
   53    496 : up
   54    495 : more
   55    468 : if
   56    457 : its
   57    447 : them
   58    435 : old
   59    426 : man
   60    425 : we
   61    421 : would
   62    408 : been
   63    406 : ahab                 *****
   64    397 : over
   65    394 : ye                   *****
   66    392 : other
   67    386 : these
   68    376 : will
   69    371 : ship                 *****
   70    369 : only
   71    364 : such
   72    362 : whales               *****
   73    362 : though
   74    359 : sea                  *****
   75    357 : down
   76    338 : yet
   77    329 : who
   78    321 : time
   79    319 : her
   80    318 : any
   81    317 : very
   82    313 : long
   83    306 : still
   84    302 : those
   85    300 : than
   86    300 : about
   87    294 : do
   88    292 : captain              *****
   89    289 : before
   90    288 : great
   91    287 : has
   92    286 : said
   93    280 : seemed
   94    279 : must
   95    278 : two
   96    276 : here
   97    273 : most
   98    272 : last
   99    259 : thou                 *****
  100    259 : head

real    0m5.548s
user    0m5.112s
sys     0m0.113s
thirty:Desktop$
"""