#! /usr/bin/env python
"""
A program to count occurrences of words in a corpus, using only lowercase
letters, so that words at beginning of sentences aren't treated differently
from those in the middle. Prints results as Comma Separated Values.

Elias Zeidan | GPL | January 2012
"""

import pprint
import string
import sys


def only_lower(word):
    """
    Return word with every character outside 'a'..'z' removed.

    No case folding happens here: uppercase letters are dropped, not
    converted, so callers are expected to lowercase the word first.

    >>> only_lower("don't!")
    'dont'
    """
    return ''.join(ch for ch in word if 'a' <= ch <= 'z')


def file_to_words(filename=None, content=None):
    """
    Read in a file, return a list of words in it.
    ("content" parameter for testing.)

    The text is split on whitespace; each token is lowercased and stripped
    of everything except the letters a-z. Tokens that end up empty (pure
    punctuation or digits) are dropped so they are not counted as words.

    Raises ValueError if neither a filename nor content is supplied.

    >>> file_to_words(content="hello, this is a test")
    ['hello', 'this', 'is', 'a', 'test']
    >>> file_to_words(content="")
    []
    """
    if filename:
        # "with" guarantees the handle is closed even if read() fails.
        with open(filename) as handle:
            text = handle.read()
    elif content is not None:
        # Compare against None so that an empty string is valid content.
        text = content
    else:
        raise ValueError("file_to_words() needs a filename or content")

    cleaned = (only_lower(token.lower()) for token in text.split())
    return [word for word in cleaned if word]


def count_words(words):
    """
    Counts occurrences of words in text, input as a list of strings.
    Returns a dict mapping each word to its number of occurrences.

    >>> count_words(["hello", "this", "is", "a", "test"]) == {
    ...     'this': 1, 'a': 1, 'is': 1, 'test': 1, 'hello': 1}
    True
    """
    word_counts = {}
    for word in words:
        word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts


def make_sorted_list(list):
    """
    Turn a word -> count mapping into a list of (word, count) tuples.

    The result is ordered by count, highest first; words with equal counts
    are ordered alphabetically so the output is deterministic.

    The parameter is a dict despite its name, which is kept so existing
    callers keep working (it shadows the builtin inside this function).

    >>> make_sorted_list({'b': 1, 'a': 1, 'c': 3})
    [('c', 3), ('a', 1), ('b', 1)]
    """
    # Negating the count sorts descending by count while the word itself
    # still sorts ascending as the tie-breaker.
    return sorted(list.items(), key=lambda pair: (-pair[1], pair[0]))


def print_list_for_r(list):
    """
    Print a word -> count mapping as CSV with a "word,count,rank" header.

    Rows follow the order produced by make_sorted_list and rank is the
    1-based row position, so tied words get consecutive distinct ranks.
    An empty mapping prints the header only. Returns None.

    >>> print_list_for_r({'a': 2, 'b': 1})
    word,count,rank
    a,2,1
    b,1,2
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("word,count,rank")
    for rank, (word, num) in enumerate(make_sorted_list(list), 1):
        print("%s,%d,%d" % (word, num, rank))


def main():
    """
    Count the words of the file named by the first command-line argument
    and print the counts as CSV on standard output.

    Exits with status 2 and a usage message when no file is given.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s FILE\n" % sys.argv[0])
        sys.exit(2)
    words = file_to_words(filename=sys.argv[1])
    # print_list_for_r prints the table itself and returns None, so its
    # result must not be printed again.
    print_list_for_r(count_words(words))


if __name__ == "__main__":
    import doctest
    # Self-check before processing; doctest is silent when all tests pass.
    doctest.testmod()
    main()