#! /usr/bin/env python

"""
A program to count occurrences of words in a corpus,
using only lowercase letters, so that words at beginning
of sentences aren't treated differently from those in the middle.

Prints results as Comma Separated Values.


Elias Zeidan | GPL | January 2012
"""
import sys, string, pprint

def only_lower(word):
	""" Return "word" with everything except the letters a-z removed.

		Uppercase letters are removed too, not converted, so lowercase
		the word first if they should be kept.

		>>> only_lower("don't")
		'dont'
	"""
	# str.join works on Python 2 and 3 alike; the string-module join
	# function does not exist on Python 3.
	return ''.join(ch for ch in word if 'a' <= ch <= 'z')

def file_to_words(filename = None, content = None):
	""" Read text and return the list of normalised words in it.

		The text is read from the file named "filename" when one is
		given; otherwise it is taken from the "content" string, which
		exists mainly for testing. The text is split on whitespace and
		each piece is lowercased and then reduced to the letters a-z by
		only_lower(), so a piece made up entirely of digits or
		punctuation comes back as ''.

		Raises ValueError when neither "filename" nor "content" is given.

		>>> file_to_words(content="hello, this is a test")
		['hello', 'this', 'is', 'a', 'test']
		>>> file_to_words(content="")
		[]
	"""
	if filename:
		# "with" closes the file even when reading raises.
		with open(filename) as corpus:
			text = corpus.read()
	elif content is not None:
		# "is not None" so that an empty string counts as valid content.
		text = content
	else:
		raise ValueError("file_to_words() needs a filename or content")
	# A list comprehension rather than map() so that a real list is
	# returned on Python 3 as well.
	return [only_lower(token.lower()) for token in text.split()]

def count_words(words):
	""" Count how many times each word occurs.

		"words" is any iterable of hashable items (normally strings).
		Returns a plain dict mapping each distinct word to its number of
		occurrences; an empty input gives {}.

		The example sorts the items so that it does not depend on the
		iteration order of the dict.

		>>> counts = count_words(["hello", "this", "is", "a", "hello"])
		>>> sorted(counts.items())
		[('a', 1), ('hello', 2), ('is', 1), ('this', 1)]
	"""
	# Imported locally so this function carries its own dependency.
	from collections import Counter
	# Convert to a plain dict so callers receive an ordinary dict
	# rather than a Counter.
	return dict(Counter(words))

def make_sorted_list(list):
	""" Turn a {word: count} mapping into a list of (word, count) pairs,
		ordered from the highest count to the lowest.

		Despite its name, "list" is expected to be a dict-like mapping.
		Words with equal counts keep the order in which the mapping
		yields them.
	"""
	# Sort the (word, count) pairs directly on their count field; the
	# sort is stable, so tied words are not reshuffled.
	return sorted(list.items(), key=lambda pair: pair[1], reverse=True)

def print_list_for_r(list):
	""" Print a {word: count} mapping as CSV on standard output.

		The first line is the header "word,count,rank". Each following
		line holds one word, its count and its rank, where the rank is
		the 1-based position in the order produced by make_sorted_list()
		(highest count first). Words with equal counts still receive
		consecutive, distinct ranks. An empty mapping prints only the
		header. Returns None.
	"""
	# A single parenthesised argument makes print behave the same way
	# on Python 2 and Python 3.
	print("word,count,rank")
	# enumerate(..., 1) supplies the 1-based rank, and an empty result
	# simply produces no data rows.
	for rank, (word, num) in enumerate(make_sorted_list(list), 1):
		print("%s,%d,%d" % (word, num, rank))

def main():
	""" Count the words in the file named on the command line and print
		the counts as CSV.

		The corpus path is read from sys.argv[1]. When it is missing, a
		usage message is written to stderr and the program exits with
		status 2.
	"""
	if len(sys.argv) < 2:
		sys.stderr.write("usage: %s FILE\n" % sys.argv[0])
		sys.exit(2)
	words = file_to_words(filename=sys.argv[1])
	counts = count_words(words)
	# print_list_for_r() writes the CSV itself and returns None, so its
	# result is not handed to another printer (that would emit a stray
	# "None" line after the data).
	print_list_for_r(counts)

if __name__ == "__main__":
	# When run as a script, first check the examples embedded in this
	# module's docstrings, then process the file named on the command line.
	import doctest
	doctest.testmod()
	main()