#!/usr/bin/env python
"""
A program that implements the IBM Model 1 translation model.

Elias Zeidan | GPL | February 2012
v3  : Elias for Feb 29 tutorial with Jim
v3a : Jim'n'Elias together, minor modifications in tutorial
"""

# list of pairs [english_sentence, french_sentence]
sentence_corpus = [[["older", "brother"], ["aine", "frere"]],
                   [["brother"], ["frere"]]]


# Step 1: Set parameter values uniformly.
def make_alignment_parameters(sentences):
    """ Input: a list of pairs of "sentences" (word lists) in two languages.
          [[["english", "sentence", "one"], ["french", "sentence", "one"]],
           [["english", "sentence", "two"], ["french", "sentence", "two"]],
           ...
          ]
        Output: initial uniform conditional probabilities
        P(english_word | french_word) for the words in this corpus, each
        value set to 1/n, where n = the number of distinct French words
        across all the sentences.
          {E1 : {F1 : 1/n, F2 : 1/n, ..., Fn : 1/n},
           E2 : {F1 : 1/n, F2 : 1/n, ..., Fn : 1/n},
           ...
          }
        >>> make_alignment_parameters(sentence_corpus) == {
        ...     'brother': {'aine': 0.5, 'frere': 0.5},
        ...     'older':   {'aine': 0.5, 'frere': 0.5}}
        True
    """
    # figure out which words we have from each language
    lang1 = {}
    lang2 = {}
    for (lang1_words, lang2_words) in sentences:
        for word in lang2_words:
            lang2[word] = lang2.get(word, 0) + 1
        for word in lang1_words:
            lang1[word] = lang1.get(word, 0) + 1
    # number of distinct words in language 2
    n = len(lang2)
    prob = 1.0 / n
    result = {}
    for word1 in lang1:
        result[word1] = {}
        for word2 in lang2:
            result[word1][word2] = prob
    return result


# Step 2: Compute P(a,f | e) for all alignments.
# a = alignment, f = French (or other language), e = English
def make_alignment_probabilities(sentences, conditionals):
    """ Input:
          sentences    : corpus, i.e. a list of sentence pairs
          conditionals : P(f|e), e.g. the initial probabilities
                         made with make_alignment_parameters
        Output: a list with one dict per sentence pair (i.e. per
        alignment), mapping each aligned (english, french) word pair
        to P(a,f|e) for that alignment.

        An unfinished doctest from the tutorial, kept as plain text so
        that testmod() skips it (tests are good; clean, readable tests
        are better):
          corpus = [[["the", "blue", "house"], ["la", "maison", "bleue"]],
                    [["blue", "house"], ["maison", "bleue"]],
                    [["blue"], ["bleue"]]]
          third = 0.333333333
          p_f_e = {'blue' : ...
    """
    probabilities = []
    totals = []
    for (en_sentence, fr_sentence) in sentences:
        probabilities.append({})
        # P(a,f|e) for the index-wise alignment of this sentence pair
        # is the product of t(f_i|e_i) over its aligned word pairs.
        sen_total = 1.0
        for i in range(len(en_sentence)):
            en_word = en_sentence[i]
            fr_word = fr_sentence[i]
            sen_total *= conditionals[en_word][fr_word]
        totals.append(sen_total)
    print totals  # debugging
    for j in range(len(sentences)):  # j indexes the sentence pairs
        en_sentence = sentences[j][0]
        fr_sentence = sentences[j][1]
        for i in range(len(en_sentence)):
            en_word = en_sentence[i]
            fr_word = fr_sentence[i]
            # tag each aligned word pair with its alignment's probability
            probabilities[j].setdefault(en_word, {})[fr_word] = totals[j]
    return probabilities
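
# The quantity Step 2 computes can be stated compactly: for the single
# index-wise alignment used here, P(a,f|e) is just the product of the
# word-translation probabilities t(f_i|e_i) along the alignment.  The
# helper below is a minimal sketch of that formula for one sentence
# pair; the name alignment_probability is ours (illustrative only) and
# nothing in the pipeline calls it.
def alignment_probability(en_sentence, fr_sentence, conditionals):
    """ P(a,f|e) for one index-aligned sentence pair: the product of
        t(f_i|e_i) over the aligned word pairs.
        >>> t = {'blue': {'bleue': 0.5}, 'house': {'maison': 0.5}}
        >>> alignment_probability(['blue', 'house'], ['bleue', 'maison'], t)
        0.25
    """
    product = 1.0
    for (en_word, fr_word) in zip(en_sentence, fr_sentence):
        product *= conditionals[en_word][fr_word]
    return product
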
>>> normalize([[["the", "blue", "house"], ["la", "maison", "bleue"]],[["blue", "house"], ["maison", "bleue"]], [["blue"], ["bleue"]]],{'blue' : {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028},'house': {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028},'the' : {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028}}) {'blue': 3, 'la': 1, 'house': 2, 'bleue': 3, 'maison': 2, 'the': 1} {'blue': {'maison': 0.33333333333333337, 'bleue': 0.33333333333333337, 'la': 0.33333333333333337}, 'house': {'maison': 0.33333333333333337, 'bleue': 0.33333333333333337, 'la': 0.33333333333333337}, 'the': {'maison': 0.33333333333333337, 'bleue': 0.33333333333333337, 'la': 0.33333333333333337}} """ probabilities = conditionals.values()[0].values() counts = {} # how many (total) of each word in all sentences? for sentence_pair in sentences: for sentence in sentence_pair: for word in sentence: counts[word] = counts.get(word, 0) + 1 print counts norm = {} for eng in conditionals: factor = 0 # normalization factor for x in conditionals[eng].values(): factor += x norm[eng] = {} for fre in conditionals[eng]: norm[eng][fre] = conditionals[eng][fre]/factor return norm # # Step 4: Collect fractional counts. # def collect_fracts(normalized): # """ Sums fractional counts of each probability # >>> collect_fracts([{'brother': {'aine': 0.5, 'frere': 0.5}, 'older': {'aine': 0.5, 'frere': 0.5}}, {'brother': {'frere': 1.0}}]) # P("aine" | "brother") = 0.50 # P("frere" | "brother") = 1.50 # P("aine" | "older") = 0.50 # P("frere" | "older") = 0.50 # [{'brother': {'aine': 0.5, 'frere': 1.5}, 'older': {'aine': 0.5, 'frere': 0.5}}, {'brother': {'frere': 1.5}}] # """ # fracts = normalized # for eng in normalized[0].keys(): # for fre in normalized[0][eng].keys(): # for i in range(len(normalized[0][eng].values())-1): # fracts[0][eng][fre] = normalized[0][eng].values()[i] # for eng in normalized[1].keys(): # for fre in normalized[1][eng].keys(): # for i in range(len(normalized[1][eng].values())): # fracts[0][eng][fre] += normalized[1][eng].values()[i] # for eng in normalized[1].keys(): # for fre in normalized[1][eng].keys(): # for i in range(len(normalized[1][eng].values())): # fracts[1][eng][fre] = fracts[0][eng][fre] # for eng in fracts[0]: # for fre in fracts[0][eng].keys(): # print "P(\"%s\" | \"%s\") = %.2f" %(fre, eng, fracts[0][eng][fre]) # return fracts # def iterations(): # # print "--- Iteration 0 ---" # probabilities = make_alignment_probabilities(parameters) # normalized = normalize(probabilities) # fractions = collect_fracts(normalized) # renormalized = normalize(fractions) # print renormalized if __name__ == "__main__": from doctest import testmod testmod() # parameters = make_alignment_parameters(sentence_corpus) # iterations()