# NLP Programming Assignment #3
# NaiveBayes
# 2012
#
# The area for you to implement is marked with TODO!
# Generally, you should not need to touch things *not* marked TODO
#
# Remember that when you submit your code, it is not run from the command line
# and your main() will *not* be run. To be safest, restrict your changes to
# addExample() and classify() and anything you further invoke from there.
#
# Revamp for NLP tutorial final project
import sys
import getopt
import os
import math
import string
class NaiveBayes:
    """Binary ('pos' / 'neg') multinomial Naive Bayes text classifier.

    Training (addExample) only accumulates raw counts. classify() converts
    those counts into add-one (Laplace) smoothed log-probabilities at query
    time, so examples can be added in any order and at any time without
    corrupting the model.
    """

    class TrainSplit:
        """Represents a set of training/testing data.

        self.train is a list of Examples, as is self.test.
        """

        def __init__(self):
            self.train = []
            self.test = []

    class Example:
        """Represents a document with a label.

        klass is 'pos' or 'neg' by convention. words is a list of strings.
        """

        def __init__(self):
            self.klass = ''
            self.words = []

    def __init__(self):
        """NaiveBayes initialization.

        Reads the stop-word list from '../data/english.stop' (resolved
        against the current working directory) and starts with an empty,
        untrained model.
        """
        self.FILTER_STOP_WORDS = False
        self.stopList = set(self.readFile('../data/english.stop'))
        self.numFolds = 10
        self._resetModel()

    def _resetModel(self):
        """Discard all trained state, leaving an untrained classifier."""
        # word -> number of occurrences in 'pos' / 'neg' training documents.
        self.PosUnigramCounts = {}
        self.NegUnigramCounts = {}
        # Number of training documents seen per class.  These stay raw
        # counts; the prior probabilities are derived from them in classify().
        self.posApriori = 0
        self.negApriori = 0
        # Running totals of word tokens per class, kept so that classify()
        # does not have to re-sum the count tables on every call.
        self.posTokenTotal = 0
        self.negTokenTotal = 0
        # Every distinct word seen in any training document.
        self.vocabulary = set()

    #############################################################################
    # Model: addExample() trains, classify() predicts.

    def classify(self, words):
        """Return 'pos' or 'neg' for the document given as a list of words.

        Each class is scored with log P(class) + sum(log P(word | class)),
        where P(word | class) uses add-one smoothing over the training
        vocabulary.  Words never seen during training are skipped because
        they carry no evidence for either class.

        Ties go to 'neg', as does classifying with a completely untrained
        model.  If only one class has been trained, that class is returned.
        """
        totalDocs = self.posApriori + self.negApriori
        if self.posApriori == 0 or self.negApriori == 0:
            # log(0) is undefined, and with at most one class seen there is
            # nothing to compare anyway.
            return "pos" if self.posApriori > 0 else "neg"

        posScore = math.log(self.posApriori / float(totalDocs))
        negScore = math.log(self.negApriori / float(totalDocs))

        # Add-one smoothing denominators: class token total + |vocabulary|.
        vocabSize = len(self.vocabulary)
        posDenominator = float(self.posTokenTotal + vocabSize)
        negDenominator = float(self.negTokenTotal + vocabSize)

        for word in words:
            if word not in self.vocabulary:
                continue
            # A word inside the loop is in the vocabulary, so both
            # denominators are at least 1 here.
            posScore += math.log(
                (self.PosUnigramCounts.get(word, 0) + 1) / posDenominator)
            negScore += math.log(
                (self.NegUnigramCounts.get(word, 0) + 1) / negDenominator)

        if posScore > negScore:
            return "pos"
        else:
            return "neg"

    def addExample(self, klass, words):
        """Train the model on one document.

        klass is the document label ('pos' or 'neg'); words is its list of
        word strings.  Only raw counts are updated, so the method can be
        called any number of times.  Documents with any other label are
        ignored.  Returns nothing.
        """
        if klass == "pos":
            counts = self.PosUnigramCounts
            self.posApriori += 1
            self.posTokenTotal += len(words)
        elif klass == "neg":
            counts = self.NegUnigramCounts
            self.negApriori += 1
            self.negTokenTotal += len(words)
        else:
            # Unknown label: nothing in the model can represent it.
            return

        for word in words:
            counts[word] = counts.get(word, 0) + 1
        self.vocabulary.update(words)
        # NOTE: an earlier draft sketched bigram/trigram counting here; it
        # was never enabled and is left as a possible extension.

    #############################################################################
    # File reading and data-set construction.

    def readFile(self, fileName):
        """Read fileName and return its contents as a list of word tokens.

        Tokenisation is delegated to segmentWords(); change that method if
        you don't like the way files are segmented.
        """
        # 'with' guarantees the handle is closed even if reading fails.
        with open(fileName) as f:
            contents = list(f)
        return self.segmentWords('\n'.join(contents))

    def segmentWords(self, s):
        """Split the string s on whitespace and return the list of tokens."""
        return s.split()

    def _loadExamples(self, dataDir, klass):
        """Read every file in dataDir/klass into an Example labelled klass.

        Returns a list of (fileName, Example) pairs in os.listdir() order.
        """
        loaded = []
        for fileName in os.listdir('%s/%s/' % (dataDir, klass)):
            example = self.Example()
            example.words = self.readFile(
                '%s/%s/%s' % (dataDir, klass, fileName))
            example.klass = klass
            loaded.append((fileName, example))
        return loaded

    def _loadLabelled(self, dataDir):
        """Load the 'pos' examples of dataDir followed by its 'neg' ones.

        Returns a list of (fileName, Example) pairs.
        """
        return (self._loadExamples(dataDir, 'pos')
                + self._loadExamples(dataDir, 'neg'))

    def trainSplit(self, trainDir):
        """Takes in a trainDir, returns one TrainSplit with train set."""
        split = self.TrainSplit()
        split.train = [example
                       for (_, example) in self._loadLabelled(trainDir)]
        return split

    def train(self, split):
        """Train on every example in split.train.

        Stop words are removed first when FILTER_STOP_WORDS is set.
        """
        for example in split.train:
            words = example.words
            if self.FILTER_STOP_WORDS:
                words = self.filterStopWords(words)
            self.addExample(example.klass, words)

    def crossValidationSplits(self, trainDir):
        """Returns a list of TrainSplits corresponding to the cross validation splits.

        One split is built per fold in range(self.numFolds).  A file is a
        test example of fold k when the third character of its name equals
        str(k), and a training example of that fold otherwise.  (Presumably
        the data files are named so that this character is the fold digit;
        verify against the data set.)  Files with shorter names, and all
        files for folds >= 10, therefore only ever land in the training
        sets.

        Every file is read once and the resulting Example objects are shared
        between the splits.
        """
        labelled = self._loadLabelled(trainDir)
        splits = []
        for fold in range(0, self.numFolds):
            foldTag = str(fold)
            split = self.TrainSplit()
            for (fileName, example) in labelled:
                # Slicing instead of indexing avoids an IndexError on file
                # names shorter than three characters.
                if fileName[2:3] == foldTag:
                    split.test.append(example)
                else:
                    split.train.append(example)
            splits.append(split)
        return splits

    def test(self, split):
        """Returns a list of labels for split.test."""
        labels = []
        for example in split.test:
            words = example.words
            if self.FILTER_STOP_WORDS:
                words = self.filterStopWords(words)
            labels.append(self.classify(words))
        return labels

    def buildSplits(self, args):
        """Builds the splits for training/testing.

        args is the list of positional command-line arguments:
          * [trainDir]           -> numFolds cross-validation splits
          * [trainDir, testDir]  -> one split, trained on trainDir and
                                    tested on testDir
        Any other non-empty argument list yields an empty list of splits.
        Raises IndexError when args is empty.
        """
        trainDir = args[0]
        if len(args) == 1:
            print('[INFO]\tPerforming %d-fold cross-validation on data set:\t%s' % (self.numFolds, trainDir))
            return self.crossValidationSplits(trainDir)
        if len(args) == 2:
            testDir = args[1]
            print('[INFO]\tTraining on data set:\t%s testing on data set:\t%s' % (trainDir, testDir))
            split = self.trainSplit(trainDir)
            split.test = [example
                          for (_, example) in self._loadLabelled(testDir)]
            return [split]
        return []

    def filterStopWords(self, words):
        """Return words without stop-list entries and whitespace-only strings."""
        return [word for word in words
                if word not in self.stopList and word.strip() != '']
def main():
    """Command-line driver: train and evaluate a NaiveBayes classifier.

    Usage: [-f] trainDir [testDir]
      -f       filter stop words before training and classifying
      trainDir directory with 'pos' and 'neg' sub-directories; given alone,
               it is evaluated with cross-validation
      testDir  optional held-out directory with the same layout

    Prints the accuracy of every evaluated fold followed by the average.
    Exits with a usage message when the number of positional arguments is
    not 1 or 2.
    """
    nb = NaiveBayes()
    (options, args) = getopt.getopt(sys.argv[1:], 'f')
    if ('-f', '') in options:
        nb.FILTER_STOP_WORDS = True

    # Fail early with a usage message rather than with an IndexError
    # (no arguments) or a ZeroDivisionError (no splits) further down.
    if len(args) not in (1, 2):
        sys.exit('usage: %s [-f] trainDir [testDir]' % sys.argv[0])

    splits = nb.buildSplits(args)
    accuracies = []
    for fold, split in enumerate(splits):
        if not split.test:
            # Nothing to score; dividing by len(split.test) would fail.
            print('[WARN]\tFold %d has no test examples; skipped' % fold)
            continue

        # A fresh classifier per fold so no counts leak between folds.
        classifier = NaiveBayes()
        for example in split.train:
            words = example.words
            if nb.FILTER_STOP_WORDS:
                words = classifier.filterStopWords(words)
            classifier.addExample(example.klass, words)

        correct = 0.0
        for example in split.test:
            words = example.words
            if nb.FILTER_STOP_WORDS:
                words = classifier.filterStopWords(words)
            if classifier.classify(words) == example.klass:
                correct += 1.0

        accuracy = correct / len(split.test)
        accuracies.append(accuracy)
        print('[INFO]\tFold %d Accuracy: %f' % (fold, accuracy))

    if not accuracies:
        print('[WARN]\tNo fold could be evaluated; no accuracy to report')
        return
    print('[INFO]\tAccuracy: %f' % (sum(accuracies) / len(accuracies)))
if __name__ == "__main__":
    # Script entry point: run any doctests embedded in this module first,
    # then hand over to the command-line driver.
    import doctest

    doctest.testmod()
    main()
# syntax highlighted by Code2HTML, v. 0.93pm6