2012-02-22
Important notes
NLP-Class.org class has been delayed yet again to "either late in February or early in March."
Code
This is [preliminary] thinking [about modifications to the posted code] after I posted translation2.py, between waking up, and Tutorial.
probabilities = make_alignment_probabilities(parameters)
normalized = normalize(probabilities)
fractions = collect_fracts(normalized)
renormalized = normalize(fractions)
# Same as
normalize(collect_fracts(normalize(make_alignment_probabilities(parameters))))
# Now loop through n iterations of that.
count = 0
while count < num: # e.g. num=1000
# do it.
count += 1
return
in Jim's office
We looked at the first subroutine more closely,
compared with Knight's algorithm, and came up with this. (Not yet tested; not yet compatible with the rest of the subroutines in translation2.)
# list of pairs [english_sentence, french_sentence]
sentence_corpus = [[["older", "brother"], ["aine", "frere"]],
[["brother"], ["frere"]]]
# Step 1: Set parameter values uniformly.
def make_alignment_parameters(sentences):
""" Input: a list of pairs of "sentences" (word lists) in two languages.
[[["english", "sentence", "one"], ["french", "sentence", "one"]]
[["english", "sentence", "two"], ["french", "sentence", "two"]]
...
]
Output: initial uniform conditional probabilities
P(english_word|french_word)
for words in this corpus, each value set to 1/n where n = number of french words
{{E1 : {F1 : 1/n, F2 : 1/n, ..., Fn : 1/n},
{E2 : {F1 : 1/n, F2 : 1/n, ..., Fn : 1/n},
...
}
>>> make_alignment_parameters(sentence_corpus)
"""
# figure out which words we have from each language
lang2 = {}
lang1 = {}
for (lang1_words, lang2_words) in sentence_corpus:
for word in lang2_words:
lang2[word] = lang2.get(word, 0) + 1
for word in lang1_words:
lang1[word] = lang1.get(word, 0) + 1
# number of words in lang 2
n = len(lang2)
prob = 1.0/n
result = {}
for word1 in lang1:
result[word1] = {}
for word2 in lang2:
result[word1][word2] = prob
return result
Code and a doctest [more to be added] added to translation2 and uploaded.
My work
- Added tests to make_alignment_parameters()
- Rewrote make_alignment_probabilities()
- Rewrote normalize()
# Step 1: Set parameter values uniformly.
def make_alignment_parameters(sentences):
""" Input: a list of pairs of "sentences" (word lists) in two languages.
[[["english", "sentence", "one"], ["french", "sentence", "one"]]
[["english", "sentence", "two"], ["french", "sentence", "two"]]
...
]
Output: initial uniform conditional probabilities
P(english_word|french_word)
for words in this corpus, each value set to 1/n where n = number of french words
{E1 : {F1 : 1/n, F2 : 1/n, ..., Fn : 1/n},
{E2 : {F1 : 1/n, F2 : 1/n, ..., Fn : 1/n},
...
}
>>> make_alignment_parameters(sentence_corpus)
{"brother" : {"aine" : 0.5, "frere" : 0.5},
"older" : {"aine" : 0.5, "frere" : 0.5}
}
>>> make_alignment_parameters([[["the", "blue", "house"], ["la", "maison", "bleue"]]])
{'blue': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331},
'house': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331},
'the': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331}}
>>> make_alignment_parameters([[["you", "are", "my", "favorite"], ["tu", "es", "mon", "favori"]]])
{'my': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25},
'you': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25},
'favorite': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25},
'are': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25}}
>>> make_alignment_parameters([
[["the", "blue", "house"], ["la", "maison", "bleue"]],
[["blue", "house"], ["maison", "bleue"]],
[["blue"], ["bleue"]]])
{'blue': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331},
'house': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331},
'the': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331}}
"""
# figure out which words we have from each language
lang2 = {}
lang1 = {}
for (lang1_words, lang2_words) in sentences:
for word in lang2_words:
lang2[word] = lang2.get(word, 0) + 1
for word in lang1_words:
lang1[word] = lang1.get(word, 0) + 1
# number of words in lang 2
n = len(lang2)
prob = 1.0/n
result = {}
for word1 in lang1:
result[word1] = {}
for word2 in lang2:
result[word1][word2] = prob
return result
# Step 2: Compute P(a,f | e) for all alignments.
# a = alignment, f = French (or other language), e = English
def make_alignment_probabilities(conditionals):
""" Takes as input the parameters created in "make_alignment_parameters."
{E1 : {F1 : P1, F2 : P2, ..., Fn : Pn},
E2 : {F1 : P1, F2 : P2, ..., Fn : Pn},
...
En : {F1 : P1, F2 : P2, ..., Fn : Pn}
}
Returns a dict of probability p(a, f|e)
{E1 : {F1 : P1**n, F2 : P2**n, ..., Fn : Pn**n},
E2 : {F1 : P1**n, F2 : P2**n, ..., Fn : Pn**n},
...
En : {F1 : P1**n, F2 : P2**n, ..., Fn : Pn**n}
}
>>> make_alignment_probabilities(
{'blue' : {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331},
'house': {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331},
'the' : {'maison': 0.33333333333333331, 'bleue': 0.33333333333333331, 'la': 0.33333333333333331}})
{'blue': {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028},
'house': {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028},
'the': {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028}}
>>> make_alignment_probabilities(
{'my': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25},
'you': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25},
'favorite': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25},
'are': {'tu': 0.25, 'favori': 0.25, 'mon': 0.25, 'es': 0.25}})
{'my': {'tu': 0.00390625, 'favori': 0.00390625, 'mon': 0.00390625, 'es': 0.00390625},
'you': {'tu': 0.00390625, 'favori': 0.00390625, 'mon': 0.00390625, 'es': 0.00390625},
'favorite': {'tu': 0.00390625, 'favori': 0.00390625, 'mon': 0.00390625, 'es': 0.00390625},
'are': {'tu': 0.00390625, 'favori': 0.00390625, 'mon': 0.00390625, 'es': 0.00390625}}
"""
exponent = len(conditionals.values())
probabilities = {}
for word in conditionals:
probabilities[word] = {}
for key in conditionals[word]:
vals = conditionals[word].values()
for i in range(len(vals)):
probabilities[word][key] = probabilities.get(key, vals[i]**exponent)
return probabilities
# Step 3: Normalize P(a,f | e) values to yield P(a | e,f) values.
def normalize(sentences, conditionals):
""" Normalizes probabilities by factor.
>>> normalize([[["the", "blue", "house"], ["la", "maison", "bleue"]],[["blue", "house"], ["maison", "bleue"]], [["blue"], ["bleue"]]],
{'blue' : {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028},
'house': {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028},
'the' : {'maison': 0.037037037037037028, 'bleue': 0.037037037037037028, 'la': 0.037037037037037028}})
{'blue': 3, 'la': 1, 'house': 2, 'bleue': 3, 'maison': 2, 'the': 1}
{'blue' : {'maison': 0.33333333333333337, 'bleue': 0.33333333333333337, 'la': 0.33333333333333337},
'house': {'maison': 0.33333333333333337, 'bleue': 0.33333333333333337, 'la': 0.33333333333333337},
'the' : {'maison': 0.33333333333333337, 'bleue': 0.33333333333333337, 'la': 0.33333333333333337}}
"""
probabilities = conditionals.values()[0].values()
counts = {}
# how many (total) of each word in all sentences?
for sentence_pair in sentences:
for sentence in sentence_pair:
for word in sentence:
counts[word] = counts.get(word, 0) + 1
print counts
norm = {}
for eng in conditionals:
factor = 0 # normalization factor
for x in conditionals[eng].values():
factor += x
norm[eng] = {}
for fre in conditionals[eng]:
norm[eng][fre] = conditionals[eng][fre]/factor
return norm