#! /usr/bin/env python
"""
A simple metric to compare two translation strings.

USAGE: ./compare.py HUMAN MACHINE

Elias Zeidan | GPL | April 2012
"""

import sys

from nltk.tokenize import *


def add_filler(sent1, sent2):
    """Pad ``sent2`` in place with "blah" tokens up to the length of ``sent1``.

    Args:
        sent1: Token list whose length is the target length.
        sent2: Token list to pad. It is modified in place.

    Returns:
        ``sent2`` itself. It is left untouched when it is already at least
        as long as ``sent1``.
    """
    shortfall = len(sent1) - len(sent2)
    if shortfall > 0:
        sent2.extend(["blah"] * shortfall)
    return sent2


def compare(human, machine):
    """Return the fraction of token positions where two translations differ.

    Both strings are tokenized with ``word_tokenize`` and compared position
    by position. Positions that exist in only one of the two sentences count
    as mismatches, and the total is divided by the length of the longer
    token list.

    Side effects: prints both token counts when they differ, then prints the
    two input strings.

    Args:
        human: The human (reference) translation string.
        machine: The machine translation string.

    Returns:
        A float; 0.0 means every position matched. Two inputs that both
        tokenize to nothing yield 0.0.
    """
    human_token = word_tokenize(human)
    machine_token = word_tokenize(machine)

    if len(human_token) != len(machine_token):
        # Single-argument print keeps the output identical on Python 2 and 3.
        print("%d %d" % (len(human_token), len(machine_token)))

    # Mismatches inside the overlapping prefix of the two sentences.
    total_unmatched = sum(
        1 for h_tok, m_tok in zip(human_token, machine_token) if h_tok != m_tok
    )
    # Every surplus position in the longer sentence is a mismatch. Counting
    # them directly (rather than padding with a filler word) cannot collide
    # with a real token and works whichever sentence is longer.
    total_unmatched += abs(len(human_token) - len(machine_token))

    print("human translation: %s" % human)
    print("machine translation: %s" % machine)

    longest = max(len(human_token), len(machine_token))
    if longest == 0:
        # Nothing to compare on either side; avoid dividing by zero.
        return 0.0
    return float(total_unmatched) / longest


def main(argv=None):
    """Compare the two translations given as arguments and print the score.

    Args:
        argv: Argument list ``[human, machine, ...]``; defaults to
            ``sys.argv[1:]``. Arguments after the first two are ignored.

    Returns:
        0 on success, 2 when fewer than two arguments were supplied.
    """
    if argv is None:
        argv = sys.argv[1:]
    if len(argv) < 2:
        sys.stderr.write("USAGE: ./compare.py HUMAN MACHINE\n")
        return 2
    print(compare(argv[0], argv[1]))
    return 0


if __name__ == "__main__":
    sys.exit(main())