Jim's
Tutorials

Spring 2012
course
navigation

2012-02-22

Important notes

NLP-Class.org class has been delayed yet again to "either late in February or early in March."

Code

This is [preliminary] thinking [about modifications to the posted code] after I posted translation2.py, between waking up, and Tutorial.
probabilities = make_alignment_probabilities(parameters) normalized = normalize(probabilities) fractions = collect_fracts(normalized) renormalized = normalize(fractions) # Same as normalize(collect_fracts(normalize(make_alignment_probabilities(parameters)))) # Now loop through n iterations of that. count = 0 while count < num: # e.g. num=1000 # do it. count += 1 return

in Jim's office

We looked at the first subroutine more closely, compared with Knight's algorithm, and came up with this. (Not yet tested; not yet compatible with the rest of the subroutines in translation2.)
# list of pairs [english_sentence, french_sentence] sentence_corpus = [[["older", "brother"], ["aine", "frere"]], [["brother"], ["frere"]]] # Step 1: Set parameter values uniformly. def make_alignment_parameters(sentences): """ Input: a list of pairs of "sentences" (word lists) in two languages. [[["english", "sentence", "one"], ["french", "sentence", "one"]] [["english", "sentence", "two"], ["french", "sentence", "two"]] ... ] Output: initial uniform conditional probabilities P(english_word|french_word) for words in this corpus, each value set to 1/n where n = number of french words {{E1 : {F1 : 1/n, F2 : 1/n, ..., Fn : 1/n}, {E2 : {F1 : 1/n, F2 : 1/n, ..., Fn : 1/n}, ... } >>> make_alignment_parameters(sentence_corpus) """ # figure out which words we have from each language lang2 = {} lang1 = {} for (lang1_words, lang2_words) in sentence_corpus: for word in lang2_words: lang2[word] = lang2.get(word, 0) + 1 for word in lang1_words: lang1[word] = lang1.get(word, 0) + 1 # number of words in lang 2 n = len(lang2) prob = 1.0/n result = {} for word1 in lang1: result[word1] = {} for word2 in lang2: result[word1][word2] = prob return result
Code and a doctest [more to be added] added to translation2 and uploaded.

My work

# Step 1: Set parameter values uniformly. def make_alignment_parameters(sentences): """ Input: a list of pairs of "sentences" (word lists) in two languages. [[["english", "sentence", "one"], ["french", "sentence", "one"]] [["english", "sentence", "two"], ["french", "sentence", "two"]] ... ] Output: initial uniform conditional probabilities P(english_word|french_word) for words in this corpus, each value set to 1/n where n = number of french words {E1 : {F1 : 1/n, F2 : 1/n, ..., Fn : 1/n}, {E2 : {F1 : 1/n, F2 : 1/n, ..., Fn : 1/n}, ... } >>> make_alignment_parameters(sentence_corpus) {"brother" : {"aine" : 0.5, "frere" : 0.5}, "older" : {"aine" : 0.5, "frere" : 0.5} } >>> make_alignment_parameters([[["the", "blue", "house"], ["la", "maison", "bleue"]]]) {'blue': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331}, 'house': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331}, 'the': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331}} >>> make_alignment_parameters([[["you", "are", "my", "favorite"], ["tu", "es", "mon", "favori"]]]) {'my': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25}, 'you': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25}, 'favorite': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25}, 'are': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25}} >>> make_alignment_parameters([ [["the", "blue", "house"], ["la", "maison", "bleue"]], [["blue", "house"], ["maison", "bleue"]], [["blue"], ["bleue"]]]) {'blue': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331}, 'house': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331}, 'the': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331}} """ # figure out which words we have from each language lang2 = {} lang1 = {} for (lang1_words, lang2_words) in sentences: for word in lang2_words: lang2[word] = lang2.get(word, 0) + 1 for word in lang1_words: lang1[word] = lang1.get(word, 0) + 1 # number of words in lang 2 n = len(lang2) prob = 1.0/n result = {} for word1 in lang1: result[word1] = {} for word2 in lang2: result[word1][word2] = prob return result # Step 2: Compute P(a,f | e) for all alignments. # a = alignment, f = French (or other language), e = English def make_alignment_probabilities(conditionals): """ Takes as input the parameters created in "make_alignment_parameters." {E1 : {F1 : P1, F2 : P2, ..., Fn : Pn}, E2 : {F1 : P1, F2 : P2, ..., Fn : Pn}, ... En : {F1 : P1, F2 : P2, ..., Fn : Pn} } Returns a dict of probability p(a, f|e) {E1 : {F1 : P1**n, F2 : P2**n, ..., Fn : Pn**n}, E2 : {F1 : P1**n, F2 : P2**n, ..., Fn : Pn**n}, ... En : {F1 : P1**n, F2 : P2**n, ..., Fn : Pn**n} } >>> make_alignment_probabilities( {'blue' : {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331}, 'house': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331}, 'the' : {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331}}) {'blue': {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028}, 'house': {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028}, 'the': {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028}} >>> make_alignment_probabilities( {'my': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25}, 'you': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25}, 'favorite': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25}, 'are': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25}}) {'my': {'tu': 0.00390625, 'favori': 0.00390625, 'mon': 0.00390625, 'es': 0.00390625}, 'you': {'tu': 0.00390625, 'favori': 0.00390625, 'mon': 0.00390625, 'es': 0.00390625}, 'favorite': {'tu': 0.00390625, 'favori': 0.00390625, 'mon': 0.00390625, 'es': 0.00390625}, 'are': {'tu': 0.00390625, 'favori': 0.00390625, 'mon': 0.00390625, 'es': 0.00390625}} """ exponent = len(conditionals.values()) probabilities = {} for word in conditionals: probabilities[word] = {} for key in conditionals[word]: vals = conditionals[word].values() for i in range(len(vals)): probabilities[word][key] = probabilities.get(key, vals[i]**exponent) return probabilities # Step 3: Normalize P(a,f | e) values to yield P(a | e,f) values. def normalize(sentences, conditionals): """ Normalizes probabilities by factor. >>> normalize([[["the", "blue", "house"], ["la", "maison", "bleue"]],[["blue", "house"], ["maison", "bleue"]], [["blue"], ["bleue"]]], {'blue' : {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028}, 'house': {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028}, 'the' : {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028}}) {'blue': 3, 'la': 1, 'house': 2, 'bleue': 3, 'maison': 2, 'the': 1} {'blue' : {'maison': 0.33333333333333337, 'bleue': 0.33333333333333337, 'la': 0.33333333333333337}, 'house': {'maison': 0.33333333333333337, 'bleue': 0.33333333333333337, 'la': 0.33333333333333337}, 'the' : {'maison': 0.33333333333333337, 'bleue': 0.33333333333333337, 'la': 0.33333333333333337}} """ probabilities = conditionals.values()[0].values() counts = {} # how many (total) of each word in all sentences? for sentence_pair in sentences: for sentence in sentence_pair: for word in sentence: counts[word] = counts.get(word, 0) + 1 print counts norm = {} for eng in conditionals: factor = 0 # normalization factor for x in conditionals[eng].values(): factor += x norm[eng] = {} for fre in conditionals[eng]: norm[eng][fre] = conditionals[eng][fre]/factor return norm
http://cs.marlboro.edu/ courses/ spring2012/jims_tutorials/ elias/ 2012-02-22
last modified Saturday February 25 2012 7:49 am EST

attachments [paper clip]

     name last modified size
   translation2.5.py Feb 25 2012 7:46 am 8.38kB    translation2.py Feb 25 2012 6:39 am 8.38kB