'''
A Python program to count the 25 most common words appearing in Pride and Prejudice,
ignoring the words listed in a stop-word file
Nate Weeks April 2018
'''

import string
from collections import Counter

# read the stop-word file and build the array of words to be ignored
def createStopArray():
    '''Return the list of stop words stored in "stop-words.txt".

    The file is read from the current working directory and is treated as
    comma-separated text. Every line is processed, surrounding whitespace
    (including the newline) is stripped from each entry, and empty entries
    such as the one produced by a trailing comma are discarded. An empty
    file yields an empty list.

    Raises IOError/OSError if the file cannot be opened.
    '''
    stopArray = []
    with open("stop-words.txt") as f:
        for line in f:
            for entry in line.split(","):
                word = entry.strip()
                # skip blanks left by trailing commas or empty lines instead
                # of blindly dropping the last entry, which could be a word
                if word:
                    stopArray.append(word)
    return stopArray

# read the text of pride and prejudice and turn it into an array of lowercase words without punctuation
def formatContent():
    '''Return every word of "pride-and-prejudice.txt" as a list of strings.

    The file is read from the current working directory line by line. For
    each line, all characters in string.punctuation are deleted (so
    "pride-and-prejudice" becomes "prideandprejudice"), the remainder is
    lowercased and then split on whitespace. Words are returned in the order
    they appear in the file.

    Raises IOError/OSError if the file cannot be opened.
    '''
    punctuation = set(string.punctuation)   # built once; O(1) membership tests
    formattedContent = []
    with open("pride-and-prejudice.txt") as f:
        for line in f:
            # filter character by character rather than calling
            # line.translate(None, ...), whose two-argument form only exists
            # on Python 2 byte strings
            cleaned = "".join(ch for ch in line if ch not in punctuation)
            formattedContent.extend(cleaned.lower().split())
    return formattedContent

# takes an array of content and an array of words to ignore and returns the most common remaining words with their counts
def countWords(formatted_content, stop_words, top_n=25):
    '''Count the words of formatted_content that are not stop words.

    formatted_content -- iterable of words (hashable, normally strings)
    stop_words        -- iterable of words to leave out of the count
    top_n             -- how many entries to return (default 25)

    Returns a list of (word, count) tuples, most frequent first, as produced
    by Counter.most_common(top_n).
    '''
    # a set makes each membership test O(1) instead of scanning the whole
    # stop-word list for every word of the text
    ignored = set(stop_words)
    counter = Counter(word for word in formatted_content if word not in ignored)
    return counter.most_common(top_n)

# takes the (word, count) pairs and prints each one as "word - count"
def formatCount(counter):
    '''Print one "word - count" line per entry of counter.

    counter -- iterable of indexable pairs whose first element is a string
               and whose second element is the number of occurrences, such
               as the list returned by Counter.most_common()

    Nothing is returned; the lines are written to standard output.
    '''
    for item in counter:
        # a single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function
        print(item[0] + " - " + str(item[1]))

# driver function that calls and chains each function together
def main():
    '''Run the whole word count and print the report.

    Reads the book text with formatContent(), reads the stop words with
    createStopArray(), counts the remaining words with countWords() and
    prints a heading followed by the result via formatCount().
    '''
    formatted_content = formatContent()
    stop_words = createStopArray()
    counter = countWords(formatted_content, stop_words)
    # a single parenthesised argument prints identically under the
    # Python 2 print statement and the Python 3 print function
    print("The most common words in pride and prejudice not included in the stop words are:")
    formatCount(counter)

# run the report only when this file is executed as a script, so that
# importing the module does not read the input files or print anything
if __name__ == "__main__":
    main()