# Source path: /var/www/cs/htdocs/courses/spring2012/jims_tutorials/elias/private/sentiment/NaiveBayes.py

# NLP Programming Assignment #3
# NaiveBayes
# 2012

#
# The area for you to implement is marked with TODO!
# Generally, you should not need to touch things *not* marked TODO
#
# Remember that when you submit your code, it is not run from the command line 
# and your main() will *not* be run. To be safest, restrict your changes to
# addExample() and classify() and anything you further invoke from there.
#
# Revamp for NLP tutorial final project

import sys
import getopt
import os
import math
import string

class NaiveBayes:
  """Multinomial Naive Bayes sentiment classifier over unigrams.

  Training (addExample) accumulates raw per-class word counts, per-class
  document counts and the shared vocabulary.  Classification (classify)
  turns those counts into add-one (Laplace) smoothed log-probabilities at
  query time, so examples can be added in any order and at any time
  without corrupting the statistics.

  Public attributes:
    FILTER_STOP_WORDS: when True, train() and test() drop stop words.
    stopList:          set of stop words read from '../data/english.stop'.
    numFolds:          number of folds used for cross-validation.
    PosUnigramCounts:  dict word -> raw count in 'pos' documents.
    NegUnigramCounts:  dict word -> raw count in 'neg' documents.
    posApriori:        fraction of training documents labelled 'pos'.
    negApriori:        fraction of training documents labelled 'neg'.
  """

  class TrainSplit:
    """Represents a set of training/testing data. self.train is a list of Examples, as is self.test.
    """
    def __init__(self):
      self.train = []
      self.test = []

  class Example:
    """Represents a document with a label. klass is 'pos' or 'neg' by convention.
       words is a list of strings.
    """
    def __init__(self):
      self.klass = ''
      self.words = []


  def __init__(self):
    """NaiveBayes initialization.

    Reads the stop-word list from '../data/english.stop' (relative to the
    current working directory) and starts with an empty model.
    """
    self.FILTER_STOP_WORDS = False
    self.stopList = set(self.readFile('../data/english.stop'))
    self.numFolds = 10
    self._resetModel()

  def _resetModel(self):
    """Discard everything learned so far and start from an empty model."""
    self.PosUnigramCounts = {}
    self.NegUnigramCounts = {}
    self.posApriori = 0
    self.negApriori = 0
    # Raw tallies the priors and smoothed likelihoods are derived from.
    self._docCounts = {'pos': 0, 'neg': 0}
    self._tokenTotals = {'pos': 0, 'neg': 0}
    self._vocabulary = set()

  #############################################################################
  # Model: classification and training

  def classify(self, words):
    """Return the most likely label for a document.

    'words' is a list of words to classify. Returns 'pos' or 'neg'.

    Each class is scored with
        log P(class) + sum over words of log P(word | class)
    where P(word | class) = (count(word, class) + 1) / (tokens(class) + |V|)
    and V is the vocabulary seen during training.  Words that never
    occurred in training are skipped.  A class without any training
    document scores -infinity.  Ties (including the completely untrained
    model) are resolved as 'neg'.
    """
    negInfinity = float('-inf')
    posScore = math.log(self.posApriori) if self.posApriori > 0 else negInfinity
    negScore = math.log(self.negApriori) if self.negApriori > 0 else negInfinity

    vocabSize = len(self._vocabulary)
    posDenominator = float(self._tokenTotals['pos'] + vocabSize)
    negDenominator = float(self._tokenTotals['neg'] + vocabSize)

    for word in words:
      # Skipping out-of-vocabulary words also guarantees both denominators
      # are positive whenever a word is actually scored.
      if word not in self._vocabulary:
        continue
      posScore += math.log((self.PosUnigramCounts.get(word, 0) + 1) / posDenominator)
      negScore += math.log((self.NegUnigramCounts.get(word, 0) + 1) / negDenominator)

    if posScore > negScore:
      return "pos"
    return "neg"

  def addExample(self, klass, words):
    """Train the model on one labelled document.

    klass is the label ('pos' or 'neg'); words is a list of strings.
    Examples with any other label are ignored.  Only raw counts are
    stored; posApriori / negApriori are refreshed from the document
    counts after every example.  Returns nothing.
    """
    if klass == "pos":
      counts = self.PosUnigramCounts
    elif klass == "neg":
      counts = self.NegUnigramCounts
    else:
      return

    self._docCounts[klass] += 1
    tokensAdded = 0
    for word in words:
      counts[word] = counts.get(word, 0) + 1
      self._vocabulary.add(word)
      tokensAdded += 1
    self._tokenTotals[klass] += tokensAdded

    totalDocs = float(self._docCounts['pos'] + self._docCounts['neg'])
    self.posApriori = self._docCounts['pos'] / totalDocs
    self.negApriori = self._docCounts['neg'] / totalDocs

    # NOTE: bigram and trigram features were sketched here at one point but
    # never finished; the model is unigram-only.

  #############################################################################
  # Data loading and evaluation helpers

  def readFile(self, fileName):
    """Read fileName and return its contents as a list of tokens.

    The lines are joined and handed to segmentWords(), so changing the
    segmentation there changes how every file is tokenised.
    """
    # 'with' closes the handle even if reading raises.
    with open(fileName) as f:
      contents = f.readlines()
    return self.segmentWords('\n'.join(contents))

  def segmentWords(self, s):
    """Split the string s on whitespace and return the list of tokens."""
    return s.split()

  def _readExamples(self, dataDir, klass):
    """Return [(fileName, Example), ...] for every file in dataDir/klass/."""
    loaded = []
    for fileName in os.listdir('%s/%s/' % (dataDir, klass)):
      example = self.Example()
      example.words = self.readFile('%s/%s/%s' % (dataDir, klass, fileName))
      example.klass = klass
      loaded.append((fileName, example))
    return loaded

  def trainSplit(self, trainDir):
    """Takes in a trainDir, returns one TrainSplit with train set.

    The 'pos' examples come first, followed by the 'neg' examples.
    """
    split = self.TrainSplit()
    for klass in ('pos', 'neg'):
      for _, example in self._readExamples(trainDir, klass):
        split.train.append(example)
    return split

  def train(self, split):
    """Feed every example of split.train to addExample().

    Stop words are removed first when FILTER_STOP_WORDS is set.
    """
    for example in split.train:
      words = example.words
      if self.FILTER_STOP_WORDS:
        words = self.filterStopWords(words)
      self.addExample(example.klass, words)

  def crossValidationSplits(self, trainDir):
    """Returns a list of TrainSplits corresponding to the cross validation splits.

    One split is built per fold in range(numFolds).  A file belongs to the
    test set of fold k when the third character of its name equals str(k),
    and to the training set of every other fold.  File names shorter than
    three characters are never test files.  Each file is read once and the
    resulting Example objects are shared between the splits, so treat them
    as read-only.
    """
    labelled = self._readExamples(trainDir, 'pos') + self._readExamples(trainDir, 'neg')
    splits = []
    for fold in range(0, self.numFolds):
      foldTag = str(fold)
      split = self.TrainSplit()
      for fileName, example in labelled:
        # Slicing instead of indexing avoids IndexError on short names.
        if fileName[2:3] == foldTag:
          split.test.append(example)
        else:
          split.train.append(example)
      splits.append(split)
    return splits

  def test(self, split):
    """Returns a list of labels for split.test."""
    labels = []
    for example in split.test:
      words = example.words
      if self.FILTER_STOP_WORDS:
        words = self.filterStopWords(words)
      labels.append(self.classify(words))
    return labels

  def buildSplits(self, args):
    """Builds the splits for training/testing.

    args is the list of positional command-line arguments:
      [trainDir]           -> numFolds cross-validation splits of trainDir
      [trainDir, testDir]  -> a single split: train on trainDir, test on testDir
    Any other non-empty length yields an empty list; an empty args raises
    IndexError.
    """
    splits = []
    trainDir = args[0]
    if len(args) == 1:
      print('[INFO]\tPerforming %d-fold cross-validation on data set:\t%s' % (self.numFolds, trainDir))
      splits = self.crossValidationSplits(trainDir)
    elif len(args) == 2:
      testDir = args[1]
      print('[INFO]\tTraining on data set:\t%s testing on data set:\t%s' % (trainDir, testDir))
      split = self.trainSplit(trainDir)
      for klass in ('pos', 'neg'):
        for _, example in self._readExamples(testDir, klass):
          split.test.append(example)
      splits.append(split)
    return splits

  def filterStopWords(self, words):
    """Return words without stop-list entries and whitespace-only tokens."""
    return [word for word in words
            if word not in self.stopList and word.strip() != '']



def main():
  """Command-line driver: train and evaluate the classifier.

  Usage: NaiveBayes.py [-f] <trainDir> [<testDir>]

  With one directory, cross-validation is run on it; with two, the model is
  trained on the first and tested on the second.  -f enables stop-word
  filtering.  A fresh classifier is trained for every split, per-fold
  accuracy is printed, and finally the mean accuracy over the evaluated
  folds.  Exits with a usage message when the number of directories is
  not 1 or 2.
  """
  nb = NaiveBayes()
  (options, args) = getopt.getopt(sys.argv[1:], 'f')
  if ('-f','') in options:
    nb.FILTER_STOP_WORDS = True

  # buildSplits() only understands one or two directories; anything else
  # used to end in an IndexError or a division by zero further down.
  if len(args) not in (1, 2):
    sys.exit('Usage: NaiveBayes.py [-f] <trainDir> [<testDir>]')

  splits = nb.buildSplits(args)
  accuracySum = 0.0
  evaluatedFolds = 0
  for fold, split in enumerate(splits):
    if not split.test:
      # No accuracy can be computed without test documents.
      print('[INFO]\tFold %d has no test examples; skipped' % fold)
      continue

    classifier = NaiveBayes()
    classifier.FILTER_STOP_WORDS = nb.FILTER_STOP_WORDS
    classifier.train(split)
    guesses = classifier.test(split)

    correct = sum(1 for example, guess in zip(split.test, guesses)
                  if example.klass == guess)
    accuracy = float(correct) / len(split.test)
    accuracySum += accuracy
    evaluatedFolds += 1
    print('[INFO]\tFold %d Accuracy: %f' % (fold, accuracy))

  if evaluatedFolds == 0:
    print('[INFO]\tNo folds could be evaluated')
    return
  print('[INFO]\tAccuracy: %f' % (accuracySum / evaluatedFolds))

if __name__ == "__main__":
    # When executed as a script: run any doctests embedded in this module
    # first, then the command-line driver.
    import doctest
    doctest.testmod()
    main()
# syntax highlighted by Code2HTML, v. 0.93pm6