#!/usr/bin/env python

from nltk.probability import *
import codecs
import sys

class BigramSegmenter(object):
    """Word segmenter driven by character-bigram statistics.

    Training counts, for every pair of adjacent non-punctuation
    characters, how often the second follows the first.  Segmentation
    strips all punctuation (including spaces) from a text and inserts a
    space wherever the relative frequency of the next character given the
    previous one falls below a cutoff.
    """

    def __init__(self, punct=' ".,;!?-\n'):
        """Create an untrained segmenter.

        punct -- string of characters that are never counted as part of a
                 bigram and are removed from text before segmentation.
        """
        self.dist = ConditionalFreqDist()
        self.punct = punct

    def train_file(self, filename):
        """Train on the whole contents of the UTF-8 file `filename`."""
        with codecs.open(filename, 'r', 'utf-8') as f:
            self.train(f.read())

    def train(self, text):
        """Add the character bigrams found in `text` to the model.

        A pair is counted only when neither character is in `self.punct`,
        so bigrams never span a space or other punctuation mark.
        """
        ch = ' '
        for nextch in text:
            if ch not in self.punct and nextch not in self.punct:
                # Item assignment rather than FreqDist.inc(): inc() was
                # removed in NLTK 3, while "+= 1" works on NLTK 2 and 3.
                self.dist[ch][nextch] += 1
            ch = nextch

    def test(self, text, cutoff=0.04):
        """Segment `text`, print the result and score it against `text`.

        The segmented text is printed UTF-8 encoded.  Returns the
        (correct, false_pos, false_neg) tuple produced by evaluate(),
        using `text` itself as the gold standard.
        """
        seg = self.segment(text, cutoff)
        print(seg.encode('utf-8'))
        return self.evaluate(seg, text)

    def test_file(self, filename, cutoff=0.04):
        """Run test() on the contents of the UTF-8 file `filename`."""
        with codecs.open(filename, 'r', 'utf-8') as f:
            text = f.read()
        return self.test(text, cutoff)

    def segment(self, text, cutoff):
        """Return `text` re-segmented according to the trained model.

        Every character in `self.punct` is dropped from the output.  A
        space is inserted between two consecutive kept characters when
        the relative frequency of the second given the first is below
        `cutoff`.
        """
        ch = ''
        out = []
        for nextch in text:
            if nextch in self.punct:
                continue
            # ch is empty only before the first kept character, where
            # there is no preceding character to form a bigram with.
            if ch and self.dist[ch].freq(nextch) < cutoff:
                out.append(' ')
            out.append(nextch)
            ch = nextch
        return ''.join(out)

    def evaluate(self, segmented, gold):
        """Compare the boundaries in `segmented` with those in `gold`.

        Both arguments may be strings or sequences of strings (they are
        joined first).  Only the space character counts as a boundary;
        every other character in `self.punct` (newlines included) is
        skipped on either side.  The comparison stops as soon as either
        input is exhausted.

        Returns a tuple (correct, false_pos, false_neg).
        Raises AssertionError if two aligned non-boundary characters
        differ, i.e. the inputs are not segmentations of the same text.
        """
        segmented = ''.join(segmented)
        gold = ''.join(gold)
        seg_len = len(segmented)
        gold_len = len(gold)
        c1 = 0
        c2 = 0
        correct = 0
        false_pos = 0
        false_neg = 0
        while c1 < seg_len and c2 < gold_len:
            seg_char = segmented[c1]
            gold_char = gold[c2]
            if seg_char != ' ' and seg_char in self.punct:
                # Non-space punctuation in the hypothesis: ignore it.
                c1 += 1
            elif gold_char != ' ' and gold_char in self.punct:
                # Non-space punctuation in the gold text: ignore it.
                c2 += 1
            elif seg_char == ' ' and gold_char == ' ':
                correct += 1
                c1 += 1
                c2 += 1
            elif seg_char == ' ':
                # Boundary predicted where the gold text has none.
                false_pos += 1
                c1 += 1
            elif gold_char == ' ':
                # Gold boundary that was not predicted.
                false_neg += 1
                c2 += 1
            else:
                assert gold_char == seg_char, '%s/%s' % (gold_char,
                                                         seg_char)
                c1 += 1
                c2 += 1
        return (correct, false_pos, false_neg)

def morphemes(text, size=2, punct=' .,;?!"-'):
    """Split `text` into chunks of `size` characters plus punctuation.

    Characters are accumulated until `size` of them have been collected,
    at which point the chunk is emitted.  Each character in `punct` is
    emitted as its own item, preceded by whatever partial chunk had been
    accumulated so far.  A partial chunk left over at the end of the
    text is emitted as well, so no input characters are lost.

    Returns the list of chunks in input order.
    """
    pairs = []
    t = ''
    for ch in text:
        if ch in punct:
            if t:
                pairs.append(t)
            pairs.append(ch)
            t = ''
        else:
            t += ch
            if len(t) >= size:
                pairs.append(t)
                t = ''
    # Flush a trailing chunk shorter than `size`; without this the last
    # characters of a text that does not end in punctuation are dropped.
    if t:
        pairs.append(t)
    return pairs

def reportResults(tuple):
    """Print boundary counts with the derived precision and recall.

    tuple -- (correct, false_pos, false_neg) counts, as returned by
             BigramSegmenter.evaluate().

    Precision is correct / (correct + false_pos) and recall is
    correct / (correct + false_neg), both printed as percentages.  When
    a denominator is zero (no boundaries predicted, or none in the gold
    text) the corresponding figure is reported as 0.0% instead of
    raising ZeroDivisionError.
    """
    correct, false_pos, false_neg = tuple
    predicted = correct + false_pos
    expected = correct + false_neg
    precision = float(correct) / predicted * 100 if predicted else 0.0
    recall = float(correct) / expected * 100 if expected else 0.0
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Correct boundaries: %d' % correct)
    print('False positives: %d' % false_pos)
    print('False negatives: %d' % false_neg)
    print('Precision: %2.1f%%' % precision)
    print('Recall: %2.1f%%' % recall)

def main(training, testing, cutoff):
    """Train on one file, test on another, and print the scores.

    training -- path of the file passed to BigramSegmenter.train_file()
    testing  -- path of the file passed to BigramSegmenter.test_file()
    cutoff   -- frequency cutoff forwarded to test_file()
    """
    segmenter = BigramSegmenter()
    segmenter.train_file(training)
    scores = segmenter.test_file(testing, cutoff)
    reportResults(scores)

if __name__ == '__main__':
    # Usage: script TRAINING_FILE TESTING_FILE CUTOFF
    # With fewer than three arguments the built-in defaults are used.
    if len(sys.argv) >= 4:
        training = sys.argv[1]
        testing = sys.argv[2]
        cutoff = float(sys.argv[3])
    else:
        print("Using defaults")
        training = 'HawaiianCorpus.txt'
        testing = 'HawaiianStory.txt'
        cutoff = 0.04

    print("Training=%s" % training)
    print("Testing=%s" % testing)
    print("Cutoff=%0.5f" % cutoff)
    main(training, testing, cutoff)