# entropy.py
"""
 Utilities for doing some information entropy stuff in python.

 Analyzing a text file :

   >>> from entropy import *
   >>> text = alphanumeric(read('letters.txt'))
   >>> text
   'aaabbbcccaaabbcaaabbccbbaaa'
   >>> histogram(text)
   {'a': 12, 'c': 6, 'b': 9}
   >>> entropy(text)
   1.5304930567574826

 That first test uses a small file of letters in this directory.
   --- start letters.txt ---
   aaabbbcccaaabbc
   aaabbccbbaaa
   --- end letters.txt ---

 Here's a confirmation of the first entropy calculation :

   >>> n = 12+6+9
   >>> probabilities = map(lambda x: float(x)/n, [12, 6, 9])
   >>> from math import log
   >>> reduce(lambda x,y: x + -y*log(y,2), probabilities, 0)
   1.5304930567574826

 Higher order entropy :

   >>> "%6.4f" % entropy("aabbaa", 1)
   '0.8742'
   >>> "%6.4f" % entropy("aabbaa", 2)
   '0.4591'
   >>> "%6.4f" % entropy("aabbaa", 3)
   '0.3333'

 A look at two binary streams :

   >>> chars = {'0':1, '1':1}                # equal frequencies
   >>> text = markov_string(chars, 1000)     # e.g. '10110100111001001...'
   >>> 0 < (1.0 - entropy(text)) < 0.01      # entropy near 1.0
   True

   >>> chars = {'0': 0.01, '1': 0.99}        # very unequal probabilities
   >>> text = markov_string(chars, 1e5)      # 100,000 characters
   >>> 0 < entropy(text) < 0.1               # entropy near 0
   True

 Jim Mahoney (mahoney@marlboro.edu) | Jan 8 2009 | GPL
"""

# ------------------------------------------------------
# N'th order entropy and conditional probability summary
#
#   H_n    = sum( p(x) H(x) )
#          = n'th order entropy
#   x      = string of n symbols, e.g. 'aba' for n=3
#   p(x)   = probability of the string of n symbols
#   H(x)   = conditional entropy of x
#          = entropy of next symbol given preceding x
#          = - sum_y ( p(y|x) log_2( p(y|x) ) )
#   p(y|x) = probability of symbol y given preceding n symbols x
#          = p(x,y) / p(x)
#   p(x,y) = probability of x (n symbols) followed by y (1 symbol)
#
# Then
#
#   H_n = sum_x [ p(x) sum_y { - p(y|x) log_2( p(y|x) ) } ]
#       = - sum_x sum_y [ p(x) p(y|x) log_2{ p(y|x) } ]
#       = - sum_x,y [ p(x,y) log_2{ p(x,y)/p(x) } ]
# where
#   x   is any string from the text of length n, and
#   x,y is any string from the text of length n+1 (i.e. x followed by y).
#
# When n=0 this reduces to the ordinary single-character entropy
#   H_0 = - sum_c [ p(c) log_2{ p(c) } ]
# where the sum is over characters c.
#
# -------------------------------------------------
# cyclic boundary
#
# To keep the conditional probability definitions consistent,
# a finite text block of M characters is assumed to have periodic
# boundary conditions when calculating the frequencies of substrings.
# This is equivalent to turning the text into an infinite markov
# stream by repeating it over and over.
#
# For example, if the string is 'aabbaa', and we wrap around,
# then the substrings of length 3 are taken to be
#   {'aab', 'abb', 'bba', 'baa', 'aaa', 'aaa'}
#
# Without that assumption we would have only 4 substrings of length 3,
# and 5 substrings of length 2, which leads to the (incorrect) result
#   p(a|ba) = p(baa)/p(ba) = (1/4)/(1/5) = 1.25    (Oops.)
#
# With the cyclic boundary there are always length(text) substrings
# of any length, and this instead becomes
#   p(a|ba) = p(baa)/p(ba) = (1/6)/(1/6) = 1.0     (Correct)
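#
# As a worked check of the n=1 case (this is the same number as the
# entropy("aabbaa", 1) doctest above): with cyclic wrapping, 'aabbaa'
# has single-character probabilities
#   p(a) = 4/6,   p(b) = 2/6
# and two-character probabilities
#   p(aa) = 3/6,  p(ab) = 1/6,  p(bb) = 1/6,  p(ba) = 1/6
# so
#   H_1 = - [ (3/6) log_2( (3/6)/(4/6) ) + (1/6) log_2( (1/6)/(4/6) )
#             + (1/6) log_2( (1/6)/(2/6) ) + (1/6) log_2( (1/6)/(2/6) ) ]
#       = 0.2075 + 0.3333 + 0.1667 + 0.1667
#       = 0.8742  (to four places)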
#
# ------------------------------------------------------

from sys import stdin, stdout
from random import random
from math import log
import os

def cmd_line(string):
    """Return as list of words result of executing string as unix command."""
    return os.popen(string).read().split()

def flush():
    """Send text to stdout even if line isn't complete."""
    stdout.flush()

def read(filename=""):
    """Return text from file, or from stdin if file omitted."""
    if filename:
        file = open(filename)
    else:
        file = stdin
    return file.read()

def write(text, filename=""):
    """Write text to file, or to stdout if file omitted."""
    if filename:
        file = open(filename, 'w')
    else:
        file = stdout
    file.write(text)

def alphanumeric(text):
    """Return text with all non-alphanumeric (a-z, A-Z, 0-9) characters removed."""
    chars = [c for c in text if
             'a' <= c <= 'z' or 'A' <= c <= 'Z' or '0' <= c <= '9']
    return ''.join(chars)

def histogram(text, n=1):
    """Return dictionary of frequencies of strings of length n,
       using the cyclic boundary condition.
         >>> histogram("aabc")
         {'a': 2, 'c': 1, 'b': 1}
         >>> histogram("aabaacab", 2)
         {'aa': 2, 'ac': 1, 'ab': 2, 'ca': 1, 'ba': 2}
    """
    length = len(text)
    text = text + text[:n]                  # wrap beginning around to end
    counts = {}
    for i in range(length):
        string = text[i:i+n]
        counts[string] = counts.get(string, 0) + 1
    return counts

def probabilities(text, n=1):
    """Return probabilities of strings of length n in text."""
    length = len(text)
    h = histogram(text, n)
    p = {}
    for string in h:
        p[string] = float(h[string]) / length
    return p

def entropy(text, order=0):
    """Return the information entropy (base 2) of text to the given order.
       The definition used here is
         y       = one character
         x       = string of length order
         xy      = string of length order+1
         entropy = sum_x { p(x) * sum_y { - p(y|x) * log_2( p(y|x) ) }}
                 = - sum_xy { p(x) * p(y|x) * log_2( p(y|x) ) }
                 = - sum_xy { p(xy) * log_2( p(xy)/p(x) ) }.
    """
    result = 0.0
    if order == 0:
        p = probabilities(text)
        for char in p:
            result += - p[char] * log(p[char], 2)
    else:
        # p(y|x) = probability of character y given preceding string x
        #        = p(xy)/p(x) where x is a string of length 'order'
        p_x = probabilities(text, order)
        # print p_x
        p_xy = probabilities(text, order+1)
        # print p_xy
        for xy in p_xy:
            x = xy[:-1]
            result += - p_xy[xy] * log(p_xy[xy] / p_x[x], 2)
    return result

def markov_stream(character_counts):
    """Generate a 0'th order markov stream given a dictionary of
       character counts (or probabilities; values are normalized by their sum)."""
    total = reduce(lambda x, y: x + y, character_counts.values(), 0)
    boundary = {}
    probability = {}
    mark = 0.0
    chars = character_counts.keys()
    for char in chars:
        probability[char] = float(character_counts[char]) / total
        mark += probability[char]
        boundary[char] = mark
    while True:
        r = random()
        i = 0
        while r > boundary[chars[i]]:
            i += 1
        yield chars[i]

def markov_list(character_counts, n_chars):
    """Return a list of n_chars characters drawn from markov_stream(character_counts)."""
    i = 0
    result = []
    for char in markov_stream(character_counts):
        i += 1
        result.append(char)
        if i >= n_chars:
            return result

def markov_string(char_counts, n):
    """Return a string of n characters drawn from markov_stream(char_counts)."""
    return ''.join(markov_list(char_counts, n))

def _test():
    from doctest import testmod
    testmod()

if __name__ == "__main__":
    _test()
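
# ------------------------------------------------------
# A minimal usage sketch: entropy_profile() below (the name and its
# default max_order are choices made here for illustration, not part
# of the original interface) simply loops entropy() over increasing
# orders, e.g. to see how much a text's apparent entropy drops as
# more preceding context is taken into account.

def entropy_profile(text, max_order=3):
    """Return a list of (order, entropy) pairs for orders 0 through max_order.
       For example, with letters.txt from the docstring above,
         entropy_profile(alphanumeric(read('letters.txt')), 2)
       starts with (0, 1.5304930567574826), the same value as entropy(text).
    """
    return [(order, entropy(text, order)) for order in range(max_order + 1)]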