#!/usr/bin/env python
"""
 Playing around with a little Hidden Markov Model
 of the Dow Jones average and breakfast drinks.

 The problem itself is from a Machine Learning course taught by
 Bryan Pardo at northwestern.edu, Fall 2007, at
 http://www.cs.northwestern.edu/~pardo/courses/eecs349/
 in homework 3, assignments/eecs349-fall07-hw3.pdf
"""

from Numeric import *

def matrix_power(mat, n):
    "Return mat**n where mat is a square matrix, using matrix multiplication"
    result = mat.copy()
    for i in range(n-1):
        result = dot(result, mat)
    return result

print """
 trans[row][col] is prob(row state at time k+1 | col state at time k) :"""
trans = array([[.2, .6, .6],
               [.3, .2, .3],
               [.5, .2, .1]])
print trans

print """
 obs[row][col] is prob(row observe at time k | col state at time k) :"""
obs = array([[.1, .6, .2],
             [.6, .3, .1],
             [.3, .1, .7]])
print obs

print """
 state0 is the state at time t=0.  (Often called "pi" in the HMM literature.) :"""
state0 = array([.5, .3, .2])
print state0

print """
 State and observables evolution without evidence :"""
state = state0.copy()
print " %6s %8s %8s %8s %8s %8s %8s " \
      % ('time', 'down', 'up', 'same', 'coffee', 'oj', 'tea')
print " %6s %8s %8s %8s %8s %8s %8s " \
      % ('----', '----', '--', '----', '------', '--', '---')
print " %6i [%8.5f %8.5f %8.5f ]" % (0, state[0], state[1], state[2])
for i in range(10):
    state = dot(trans, state)
    emit = dot(obs, state)
    print " %6i [%8.5f %8.5f %8.5f ] [%8.5f %8.5f %8.5f ]" \
          % (i+1, state[0], state[1], state[2], emit[0], emit[1], emit[2])

print """
 Now suppose we have actual observations of the morning drink,
 where (coffee=0, oj=1, tea=2), and the list covers times t=1..10.
 seen is : """
seen = array([0, 0, 0, 2, 2,  1, 0, 2, 0, 0])
print seen

print """
 How do we figure out what the probabilities are for the Dow sequences?
"""

print """ ---------------------
 Method 0:  BRUTE FORCE

 (This is completely impractical for non-trivial cases,
  but still interesting conceptually.)

 There are 3**10 = 59049 different state sequences; the probability of
 each sequence is the product of 10 terms from the trans matrix and
 state0.  For example, for the three-state sequence (up,up,up) the
 probability is
   P(up t=3|up t=2) * P(up t=2|up t=1) * P(up t=1) = 0.2 * 0.2 * 0.27

 Each of those 59049 state sequences has a finite probability of
 producing any of the 3**10 = 59049 observation sequences.  For example,
 the probability of seeing (coffee, oj, tea) given that the states are
 (up,up,up) is P(coffee|up) * P(oj|up) * P(tea|up).

 Therefore the probability of getting (up,up,up) and (coffee,oj,tea) is
   P(up t=3|up t=2) * P(up t=2|up t=1) * P(up t=1)
     * P(coffee|up) * P(oj|up) * P(tea|up)
   = 0.2 * 0.2 * 0.27 * 0.6 * 0.3 * 0.1

 If we do a similar calculation for all 59049 possible state sequences
 and add up all the probabilities consistent with (coffee,oj,tea),
 we can then normalize them all and get the probabilities of each
 state sequence.
"""

# seqs maps each state-sequence string to its probability;
# total[0] accumulates the sum of all of them.
seqs = {}
total = [0]

def loop_nm(n, m, func):
    """Invoke func([i1,i2,..,iN]) where each index is over range(m)"""
    x = [0]*n
    loop_nm_recur(0, x, m, func)

def loop_nm_recur(i, x, m, func):
    """Recursive implementation of loop_nm()"""
    if i == len(x):
        func(x)
    else:
        for j in range(m):
            x[i] = j
            loop_nm_recur(i+1, x, m, func)

# def print_func(x): print x
# print "test loop_nm : "
# loop_nm(3, 2, print_func)

def init_seqs(x):
    seqs[str(x)] = 1

print "Initializing seqs..."
loop_nm(10, 3, init_seqs)
print "done.  Number of entries is " + str(len(seqs))

state1 = dot(trans, state0)

def prob_from_seq(x):
    """Return the joint probability of state sequence x and the
       observed drinks, P(seen, x)"""
    prob = obs[seen[0]][x[0]] * state1[x[0]]
    for t in range(1, 10):
        prob *= obs[seen[t]][x[t]] * trans[x[t]][x[t-1]]
    return prob

def update_seqs(x):
    p = prob_from_seq(x)
    seqs[str(x)] *= p
    total[0] += p

print "Updating seqs..."
loop_nm(10, 3, update_seqs)
print "done."

def cmp_seqs(x, y):
    return cmp(seqs[y], seqs[x])

print "Sorting seqs..."
seqstr = seqs.keys()
seqstr.sort(cmp_seqs)
print "done.  Largest 10 are :"
for s in seqstr[:10]:
    print " %10.6g %20s " % (seqs[s], s)
print "Total probability is " + str(total[0])

print """
 If we now wanted to know the probability of a particular state at a
 particular time, we could sum all the applicable state sequences.

 ------------------------

 HOWEVER, this approach blows up exponentially with the number of time
 steps and states.  There are several recursive algorithms which can do
 these sorts of things in linear time and space :

 Forward-Backward
   * is the backbone of all of these notions
   * uses two recursive formulas to calculate prob(state | evidence):
     one uses current & past evidence to get the prob of the current state;
     the other uses future evidence to get the prob of the current state.
   * Technique is to start at t=start, propagate evidence "messages"
     forward to t=end, then back to t=start, and repeat until some sort
     of desired convergence is found.
   * It can calculate
      * the likelihood of a given observation set for a given model
      * the probability of a particular state at a particular time
      * all "local" results, averaged over many paths
     (see the forward-backward sketch appended below)

 Baum-Welch
   * can be done along with forward-backward
   * an EM "expectation maximization" algorithm
   * learns improvements to the model, e.g. a new transition matrix
     based on P(state_i | state_j) counts from the forward-backward
     state estimates
     (see the re-estimation sketch appended below)

 Viterbi
   * finds the most probable sequence of states
   * another recursive linear-time algorithm
   * the best sequence to state_i is the best to state_(i-1) plus one more step
   * ... so only the best so far needs to be remembered
   * ... and then the next evidence is added in with a forward-like
     recurrence relation
     (see the Viterbi sketch appended below)
"""
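# ------------------------------------------------------------------
# Forward-backward sketch.
#
# This block is an added illustration, not part of the original
# assignment; the names T, alphas, betas, gammas, and likelihood are
# my own.  It only uses the trans, obs, state1, and seen arrays
# defined above.
#
#   alphas[t][i] = P(seen[0..t], state i at time t+1)
#   betas[t][i]  = P(seen[t+1..T-1] | state i at time t+1)
#   gammas[t][i] = P(state i at time t+1 | all of seen)
#
# The sum of the final alpha is P(seen), which should equal the brute
# force "Total probability" printed above.

T = len(seen)

# forward pass: fold in the evidence one observation at a time
alphas = [obs[seen[0]] * state1]
for t in range(1, T):
    alphas.append(obs[seen[t]] * dot(trans, alphas[t-1]))

# backward pass: propagate the future evidence back toward t=1
betas = [array([1., 1., 1.])]
for t in range(T-2, -1, -1):
    betas.insert(0, dot(transpose(trans), obs[seen[t+1]] * betas[0]))

likelihood = sum(alphas[T-1])
print ""
print "Forward-backward likelihood P(seen) = " + str(likelihood)
print "  (compare with the brute force total above)"

print """
 Smoothed state probabilities P(state at time t | all 10 drinks) :"""
print " %6s %10s %10s %10s " % ('time', 'down', 'up', 'same')
gammas = []
for t in range(T):
    gamma = alphas[t] * betas[t] / likelihood
    gammas.append(gamma)
    print " %6i %10.5f %10.5f %10.5f " % (t+1, gamma[0], gamma[1], gamma[2])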
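# ------------------------------------------------------------------
# One Baum-Welch re-estimation step.
#
# Also an added illustration with my own names (xi_sum, new_trans,
# new_obs); it reuses T, alphas, betas, gammas, and likelihood from
# the forward-backward sketch above.  A real run would iterate this
# EM update until the likelihood stops improving; here we do a single
# update of trans and obs (state0 is left alone) just to show the
# mechanics.

new_trans = zeros((3, 3), Float)
new_obs = zeros((3, 3), Float)
for j in range(3):                  # state j at time t
    # expected number of visits to state j, over t=1..T-1 and t=1..T
    gamma_sum = 0.0
    for t in range(T-1):
        gamma_sum += gammas[t][j]
    gamma_all = gamma_sum + gammas[T-1][j]
    for i in range(3):              # state i at time t+1
        # expected number of j -> i transitions
        xi_sum = 0.0
        for t in range(T-1):
            xi_sum += alphas[t][j] * trans[i][j] \
                      * obs[seen[t+1]][i] * betas[t+1][i] / likelihood
        new_trans[i, j] = xi_sum / gamma_sum
    for k in range(3):              # observation symbol k
        # expected number of times state j emitted symbol k
        seen_sum = 0.0
        for t in range(T):
            if seen[t] == k:
                seen_sum += gammas[t][j]
        new_obs[k, j] = seen_sum / gamma_all

print ""
print "One Baum-Welch update of trans :"
print new_trans
print "One Baum-Welch update of obs :"
print new_obs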
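# ------------------------------------------------------------------
# Viterbi sketch.
#
# Again an added illustration with my own names (best, back, path).
# best[t][i] is the probability of the most likely state sequence that
# ends in state i at time t+1 and explains seen[0..t]; back[t][i]
# remembers which previous state achieved it.  The recovered path and
# its probability should match the top entry of the brute force list
# printed above.

T = len(seen)
best = [obs[seen[0]] * state1]
back = []
for t in range(1, T):
    b = zeros(3, Float)
    ptr = [0, 0, 0]
    for i in range(3):
        # best way to reach state i at this time from any state j
        candidates = [trans[i][j] * best[t-1][j] for j in range(3)]
        j_best = candidates.index(max(candidates))
        ptr[i] = j_best
        b[i] = obs[seen[t]][i] * candidates[j_best]
    best.append(b)
    back.append(ptr)      # back[t-1] holds the backpointers for best[t]

# trace the best path backwards from the most likely final state
final = best[T-1].tolist()
path = [final.index(max(final))]
for t in range(T-2, -1, -1):
    path.insert(0, back[t][path[0]])

print ""
print "Viterbi most probable state sequence (0=down, 1=up, 2=same) :"
print "  " + str(path) + "  with joint probability " + str(max(final))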