#!/usr/bin/env python
"""
 Playing around with a little Hidden Markov Model
 of the Dow Jones average and breakfast drinks.

 The problem itself is from a Machine Learning course taught by
 Bryan Pardo at northwestern.edu, Fall 2007, at
 http://www.cs.northwestern.edu/~pardo/courses/eecs349/
 in homework 3, assignments/eecs349-fall07-hw3.pdf
"""

from Numeric import *

def matrix_power(mat, n):
    "Return mat**n where mat is a square matrix, using matrix multiplication"
    result = mat.copy()
    for i in range(n-1):
        result = dot(result, mat)
    return result

print """
 trans[row][col] is prob(row state at time k+1 | col state at time k) :"""
trans = array([[.2, .6, .6],
               [.3, .2, .3],
               [.5, .2, .1]])
print trans

print """
 obs[row][col] is prob(row observe at time k | col state at time k) :"""
obs = array([[.1, .6, .2],
             [.6, .3, .1],
             [.3, .1, .7]])
print obs

print """
 state0 is the state at time t=0.  (Often called "pi" in the HMM literature.) :"""
state0 = array([.5, .3, .2])
print state0

print """
 State and observables evolution without evidence :"""
state = state0.copy()
print " %6s %8s %8s %8s %8s %8s %8s " \
      % ('time', 'down', 'up', 'same', 'coffee', 'oj', 'tea')
print " %6s %8s %8s %8s %8s %8s %8s " \
      % ('----', '----', '--', '----', '------', '--', '---')
print " %6i [%8.5f %8.5f %8.5f ]" % (0, state[0], state[1], state[2])
for i in range(10):
    state = dot(trans, state)
    emit = dot(obs, state)
    print " %6i [%8.5f %8.5f %8.5f ] [%8.5f %8.5f %8.5f ]" \
          % (i+1, state[0], state[1], state[2], emit[0], emit[1], emit[2])

print """
 Now suppose we have actual observations of the morning drink,
 where (coffee=0, oj=1, tea=2), and the list covers times t=1..10.
 seen is : """
seen = array([0, 0, 0, 2, 2,  1, 0, 2, 0, 0])
print seen

print """
 How do we figure out what the probabilities are for the Dow sequences?
"""

print """ ---------------------
 Method 0:  BRUTE FORCE

 (This is completely impractical for non-trivial cases,
  but still interesting conceptually.)

 There are 3**10 = 59049 different state sequences; the probability of
 each sequence is the product of 10 terms from the trans matrix and
 state0.  For example, for the three-state sequence (up,up,up) the
 probability is
   P(up t=3|up t=2) * P(up t=2|up t=1) * P(up t=1) = 0.2 * 0.2 * 0.27

 Each of those 59049 state sequences has a finite probability of
 producing any of the 3**10 = 59049 observation sequences.  For example,
 the probability of seeing (coffee, oj, tea) given that the states are
 (up,up,up) is P(coffee|up) * P(oj|up) * P(tea|up).

 Therefore the probability of getting (up,up,up) and (coffee,oj,tea) is
   P(up t=3|up t=2) * P(up t=2|up t=1) * P(up t=1)
     * P(coffee|up) * P(oj|up) * P(tea|up)
   = 0.2 * 0.2 * 0.27 * 0.6 * 0.3 * 0.1

 If we do a similar calculation for all 59049 possible state sequences
 and add up all the probabilities consistent with (coffee,oj,tea),
 we can then normalize them all and get the probabilities of each
 state sequence.
"""

# seqs maps each state-sequence string to its probability;
# total[0] accumulates the sum of all of them.
seqs = {}
total = [0]

def loop_nm(n, m, func):
    """Invoke func([i1,i2,..,iN]) where each index is over range(m)"""
    x = [0]*n
    loop_nm_recur(0, x, m, func)

def loop_nm_recur(i, x, m, func):
    """Recursive implementation of loop_nm()"""
    if i == len(x):
        func(x)
    else:
        for j in range(m):
            x[i] = j
            loop_nm_recur(i+1, x, m, func)

# def print_func(x): print x
# print "test loop_nm : "
# loop_nm(3, 2, print_func)

def init_seqs(x):
    seqs[str(x)] = 1

print "Initializing seqs..."
loop_nm(10, 3, init_seqs)
print "done.  Number of entries is " + str(len(seqs))

state1 = dot(trans, state0)

def prob_from_seq(x):
    """Return the joint probability of state sequence x and the
       observed drinks, P(seen, x)"""
    prob = obs[seen[0]][x[0]] * state1[x[0]]
    for t in range(1, 10):
        prob *= obs[seen[t]][x[t]] * trans[x[t]][x[t-1]]
    return prob

def update_seqs(x):
    p = prob_from_seq(x)
    seqs[str(x)] *= p
    total[0] += p

print "Updating seqs..."
loop_nm(10, 3, update_seqs)
print "done."

def cmp_seqs(x, y):
    return cmp(seqs[y], seqs[x])

print "Sorting seqs..."
seqstr = seqs.keys()
seqstr.sort(cmp_seqs)
print "done.  Largest 10 are :"
for s in seqstr[:10]:
    print " %10.6g %20s " % (seqs[s], s)
print "Total probability is " + str(total[0])

print """
 If we now wanted to know the probability of a particular state at a
 particular time, we could sum all the applicable state sequences.

 ------------------------

 HOWEVER, this approach blows up exponentially with the number of time
 steps and states.  There are several recursive algorithms which can do
 these sorts of things in linear time and space :

 Forward-Backward
   * is the backbone of all of these notions
   * uses two recursive formulas to calculate prob(state | evidence):
     one uses current & past evidence to get the prob of the current state;
     the other uses future evidence to get the prob of the current state.
   * Technique is to start at t=start, propagate evidence "messages"
     forward to t=end, then back to t=start, and repeat until some sort
     of desired convergence is found.
   * It can calculate
      * the likelihood of a given observation set for a given model
      * the probability of a particular state at a particular time
      * all "local" results, averaged over many paths
     (see the forward-backward sketch appended below)

 Baum-Welch
   * can be done along with forward-backward
   * an EM "expectation maximization" algorithm
   * learns improvements to the model, e.g. a new transition matrix
     based on P(state_i | state_j) counts from the forward-backward
     state estimates
     (see the re-estimation sketch appended below)

 Viterbi
   * finds the most probable sequence of states
   * another recursive linear-time algorithm
   * the best sequence to state_i is the best to state_(i-1) plus one more step
   * ... so only the best so far needs to be remembered
   * ... and then the next evidence is added in with a forward-like
     recurrence relation
     (see the Viterbi sketch appended below)
"""
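# ------------------------------------------------------------------
# Forward-backward sketch.
#
# This block is an added illustration, not part of the original
# assignment; the names T, alphas, betas, gammas, and likelihood are
# my own.  It only uses the trans, obs, state1, and seen arrays
# defined above.
#
#   alphas[t][i] = P(seen[0..t], state i at time t+1)
#   betas[t][i]  = P(seen[t+1..T-1] | state i at time t+1)
#   gammas[t][i] = P(state i at time t+1 | all of seen)
#
# The sum of the final alpha is P(seen), which should equal the brute
# force "Total probability" printed above.

T = len(seen)

# forward pass: fold in the evidence one observation at a time
alphas = [obs[seen[0]] * state1]
for t in range(1, T):
    alphas.append(obs[seen[t]] * dot(trans, alphas[t-1]))

# backward pass: propagate the future evidence back toward t=1
betas = [array([1., 1., 1.])]
for t in range(T-2, -1, -1):
    betas.insert(0, dot(transpose(trans), obs[seen[t+1]] * betas[0]))

likelihood = sum(alphas[T-1])
print ""
print "Forward-backward likelihood P(seen) = " + str(likelihood)
print "  (compare with the brute force total above)"

print """
 Smoothed state probabilities P(state at time t | all 10 drinks) :"""
print " %6s %10s %10s %10s " % ('time', 'down', 'up', 'same')
gammas = []
for t in range(T):
    gamma = alphas[t] * betas[t] / likelihood
    gammas.append(gamma)
    print " %6i %10.5f %10.5f %10.5f " % (t+1, gamma[0], gamma[1], gamma[2])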
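# ------------------------------------------------------------------
# One Baum-Welch re-estimation step.
#
# Also an added illustration with my own names (xi_sum, new_trans,
# new_obs); it reuses T, alphas, betas, gammas, and likelihood from
# the forward-backward sketch above.  A real run would iterate this
# EM update until the likelihood stops improving; here we do a single
# update of trans and obs (state0 is left alone) just to show the
# mechanics.

new_trans = zeros((3, 3), Float)
new_obs = zeros((3, 3), Float)
for j in range(3):                  # state j at time t
    # expected number of visits to state j, over t=1..T-1 and t=1..T
    gamma_sum = 0.0
    for t in range(T-1):
        gamma_sum += gammas[t][j]
    gamma_all = gamma_sum + gammas[T-1][j]
    for i in range(3):              # state i at time t+1
        # expected number of j -> i transitions
        xi_sum = 0.0
        for t in range(T-1):
            xi_sum += alphas[t][j] * trans[i][j] \
                      * obs[seen[t+1]][i] * betas[t+1][i] / likelihood
        new_trans[i, j] = xi_sum / gamma_sum
    for k in range(3):              # observation symbol k
        # expected number of times state j emitted symbol k
        seen_sum = 0.0
        for t in range(T):
            if seen[t] == k:
                seen_sum += gammas[t][j]
        new_obs[k, j] = seen_sum / gamma_all

print ""
print "One Baum-Welch update of trans :"
print new_trans
print "One Baum-Welch update of obs :"
print new_obs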
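# ------------------------------------------------------------------
# Viterbi sketch.
#
# Again an added illustration with my own names (best, back, path).
# best[t][i] is the probability of the most likely state sequence that
# ends in state i at time t+1 and explains seen[0..t]; back[t][i]
# remembers which previous state achieved it.  The recovered path and
# its probability should match the top entry of the brute force list
# printed above.

T = len(seen)
best = [obs[seen[0]] * state1]
back = []
for t in range(1, T):
    b = zeros(3, Float)
    ptr = [0, 0, 0]
    for i in range(3):
        # best way to reach state i at this time from any state j
        candidates = [trans[i][j] * best[t-1][j] for j in range(3)]
        j_best = candidates.index(max(candidates))
        ptr[i] = j_best
        b[i] = obs[seen[t]][i] * candidates[j_best]
    best.append(b)
    back.append(ptr)      # back[t-1] holds the backpointers for best[t]

# trace the best path backwards from the most likely final state
final = best[T-1].tolist()
path = [final.index(max(final))]
for t in range(T-2, -1, -1):
    path.insert(0, back[t][path[0]])

print ""
print "Viterbi most probable state sequence (0=down, 1=up, 2=same) :"
print "  " + str(path) + "  with joint probability " + str(max(final))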