#!/usr/bin/env python
"""Character-bigram word segmenter.

Trains a conditional frequency table of adjacent non-punctuation characters
and inserts a word boundary wherever the probability of the next character,
given the previous one, falls below a cutoff.
"""

import codecs
import sys

try:
    from nltk.probability import *
    _NLTK_IMPORT_ERROR = None
except ImportError as exc:
    # Keep the NLTK-free helpers (``morphemes``, ``evaluate``) importable;
    # the original error is re-raised when a segmenter is constructed.
    ConditionalFreqDist = None
    _NLTK_IMPORT_ERROR = exc


def _print_utf8(text):
    """Print *text* followed by a newline on both Python 2 and Python 3."""
    if str is bytes:
        # Python 2: emit UTF-8 bytes so printing does not depend on the
        # encoding of stdout.
        print(text.encode('utf-8'))
    else:
        print(text)


class BigramSegmenter(object):
    """Word segmenter driven by character-bigram frequencies.

    Attributes:
        dist: ``ConditionalFreqDist`` mapping a character to the frequency
            distribution of the characters that followed it in training.
        punct: string of characters treated as punctuation; they are never
            counted in bigrams and never copied to the segmented output.
    """

    def __init__(self, punct=' ".,;!?-\n'):
        """Create an untrained segmenter.

        Raises:
            ImportError: if ``nltk.probability`` could not be imported.
        """
        if ConditionalFreqDist is None:
            raise _NLTK_IMPORT_ERROR
        self.dist = ConditionalFreqDist()
        self.punct = punct

    def train_file(self, filename):
        """Train on the whole contents of the UTF-8 text file *filename*."""
        with codecs.open(filename, 'r', 'utf-8') as f:
            self.train(f.read())

    def train(self, text):
        """Count every pair of adjacent non-punctuation characters in *text*."""
        prev = ' '
        for cur in text:
            if prev not in self.punct and cur not in self.punct:
                # FreqDist.inc() was removed in NLTK 3; item increment is
                # the supported way to add one observation.
                self.dist[prev][cur] += 1
            prev = cur

    def test(self, text, cutoff=0.04):
        """Segment *text*, print the result and score it against *text*.

        Returns:
            The ``(correct, false_pos, false_neg)`` tuple from ``evaluate``.
        """
        seg = self.segment(text, cutoff)
        _print_utf8(seg)
        return self.evaluate(seg, text)

    def test_file(self, filename, cutoff=0.04):
        """Run ``test`` on the contents of the UTF-8 text file *filename*."""
        with codecs.open(filename, 'r', 'utf-8') as f:
            text = f.read()
        return self.test(text, cutoff)

    def segment(self, text, cutoff):
        """Return *text* without punctuation and with predicted boundaries.

        A single space is emitted before a character whenever the trained
        frequency of that character following the previous input character
        is below *cutoff*. No space is emitted before the first character.
        """
        prev = ''
        out = []
        for cur in text:
            if cur not in self.punct:
                # NOTE(review): prev may itself be punctuation (including a
                # space in the input). Such characters are never trained as
                # conditions, so the frequency is 0 and a boundary is
                # emitted for any positive cutoff -- i.e. boundaries already
                # present in the input are carried over. Confirm that this
                # is intended when scoring against the same text.
                if prev and self.dist[prev].freq(cur) < cutoff:
                    out.append(' ')
                out.append(cur)
            prev = cur
        return ''.join(out)

    def evaluate(self, segmented, gold):
        """Compare the boundaries of *segmented* with those of *gold*.

        Both arguments are joined into strings and walked in parallel.
        Punctuation other than the space is skipped on either side; spaces
        are the boundaries being compared. The walk stops as soon as either
        string is exhausted.

        Returns:
            ``(correct, false_pos, false_neg)`` boundary counts.

        Raises:
            AssertionError: if two aligned non-boundary characters differ.
        """
        segmented = ''.join(segmented)
        gold = ''.join(gold)
        i = j = 0
        correct = false_pos = false_neg = 0
        while i < len(segmented) and j < len(gold):
            seg_char = segmented[i]
            gold_char = gold[j]
            if seg_char != ' ' and seg_char in self.punct:
                i += 1
            elif gold_char != ' ' and gold_char in self.punct:
                j += 1
            elif seg_char == ' ' and gold_char == ' ':
                correct += 1
                i += 1
                j += 1
            elif seg_char == ' ':
                # Boundary predicted where gold has none.
                false_pos += 1
                i += 1
            elif gold_char == ' ':
                # Gold boundary that was not predicted.
                false_neg += 1
                j += 1
            else:
                # Raised explicitly so the check survives ``python -O``.
                if gold_char != seg_char:
                    raise AssertionError('%s/%s' % (gold_char, seg_char))
                i += 1
                j += 1
        return (correct, false_pos, false_neg)


def morphemes(text, size=2, punct=' .,;?!"-'):
    """Split *text* into chunks of *size* characters and punctuation marks.

    Each character found in *punct* becomes its own item and ends the chunk
    being built. A chunk shorter than *size* is kept when it is cut short by
    punctuation or by the end of *text*.

    Returns:
        List of chunk and punctuation strings in input order.
    """
    pairs = []
    chunk = ''
    for ch in text:
        if ch in punct:
            if chunk:
                pairs.append(chunk)
            pairs.append(ch)
            chunk = ''
        else:
            chunk += ch
            if len(chunk) >= size:
                pairs.append(chunk)
                chunk = ''
    # Flush the remainder so trailing characters are not silently dropped.
    if chunk:
        pairs.append(chunk)
    return pairs
def _percentage(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when *whole* is 0."""
    if not whole:
        return 0.0
    return float(part) / whole * 100


def reportResults(tuple):
    """Print boundary counts together with precision and recall.

    Args:
        tuple: ``(correct, false_pos, false_neg)`` boundary counts. The
            parameter name shadows the builtin; it is kept so existing
            keyword callers continue to work.

    Precision and recall are reported as 0.0% when their denominator is
    zero instead of raising ``ZeroDivisionError``.
    """
    correct, false_pos, false_neg = tuple
    print('Correct boundaries: %d' % correct)
    print('False positives: %d' % false_pos)
    print('False negatives: %d' % false_neg)
    print('Precision: %2.1f%%' % _percentage(correct, correct + false_pos))
    print('Recall: %2.1f%%' % _percentage(correct, correct + false_neg))


def main(training, testing, cutoff):
    """Train on file *training*, segment file *testing*, print the scores."""
    seg = BigramSegmenter()
    seg.train_file(training)
    reportResults(seg.test_file(testing, cutoff))


if __name__ == '__main__':
    # Usage: script TRAINING_FILE TESTING_FILE CUTOFF
    # With fewer than three arguments the built-in defaults are used.
    if len(sys.argv) >= 4:
        training = sys.argv[1]
        testing = sys.argv[2]
        cutoff = float(sys.argv[3])
    else:
        print("Using defaults")
        training = 'HawaiianCorpus.txt'
        testing = 'HawaiianStory.txt'
        cutoff = 0.04
    print("Training=%s" % (training))
    print("Testing=%s" % (testing))
    print("Cutoff=%0.5f" % (cutoff))
    main(training, testing, cutoff)