# entropy.py
"""
 Utilities for doing some information entropy stuff in python.

 Analyzing a text file :

   >>> from entropy import *
   >>> text = alphanumeric(read('letters.txt'))
   >>> text
   'aaabbbcccaaabbcaaabbccbbaaa'
   >>> histogram(text)
   {'a': 12, 'c': 6, 'b': 9}
   >>> entropy(text)
   1.5304930567574826

 That first test uses a small file of letters in this directory.
   --- start letters.txt ---
   aaabbbcccaaabbc
   aaabbccbbaaa
   --- end letters.txt ---

 Here's a confirmation of the first entropy calculation :

   >>> n = 12+6+9
   >>> probabilities = map(lambda x: float(x)/n, [12, 6, 9])
   >>> from math import log
   >>> reduce(lambda x,y: x + -y*log(y,2), probabilities, 0)
   1.5304930567574826

 Higher order entropy :

   >>> "%6.4f" % entropy("aabbaa", 1)
   '0.8742'
   >>> "%6.4f" % entropy("aabbaa", 2)
   '0.4591'
   >>> "%6.4f" % entropy("aabbaa", 3)
   '0.3333'

 A look at two binary streams :

   >>> chars = {'0':1, '1':1}                # equal frequencies
   >>> text = markov_string(chars, 1000)     # e.g. '10110100111001001...'
   >>> 0 < (1.0 - entropy(text)) < 0.01      # entropy near 1.0
   True

   >>> chars = {'0': 0.01, '1': 0.99}        # very unequal probabilities
   >>> text = markov_string(chars, 1e5)      # 100,000 characters
   >>> 0 < entropy(text) < 0.1               # entropy near 0
   True

 Jim Mahoney (mahoney@marlboro.edu) | Jan 8 2009 | GPL
"""

# ------------------------------------------------------
# N'th order entropy and conditional probability summary
#
#   H_n    = sum( p(x) H(x) )
#          = n'th order entropy
#   x      = string of n symbols, e.g. 'aba' for n=3
#   p(x)   = probability of the string of n symbols
#   H(x)   = conditional entropy of x
#          = entropy of next symbol given preceding x
#          = - sum_y ( p(y|x) log_2( p(y|x) ) )
#   p(y|x) = probability of symbol y given preceding n symbols x
#          = p(x,y) / p(x)
#   p(x,y) = probability of x (n symbols) followed by y (1 symbol)
#
# Then
#
#   H_n = sum_x [ p(x) sum_y { - p(y|x) log_2( p(y|x) ) } ]
#       = - sum_x sum_y [ p(x) p(y|x) log_2{ p(y|x) } ]
#       = - sum_x,y [ p(x,y) log_2{ p(x,y)/p(x) } ]
# where
#   x   is any string from the text of length n, and
#   x,y is any string from the text of length n+1 (i.e. x followed by y).
#
# When n=0 this reduces to the ordinary single-character entropy
#   H_0 = - sum_c [ p(c) log_2{ p(c) } ]
# where the sum is over characters c.
#
# -------------------------------------------------
# cyclic boundary
#
# To keep the conditional probability definitions consistent,
# a finite text block of M characters is assumed to have periodic
# boundary conditions when calculating the frequencies of substrings.
# This is equivalent to turning the text into an infinite markov
# stream by repeating it over and over.
#
# For example, if the string is 'aabbaa', and we wrap around,
# then the substrings of length 3 are taken to be
#   {'aab', 'abb', 'bba', 'baa', 'aaa', 'aaa'}
#
# Without that assumption we would have only 4 substrings of length 3,
# and 5 substrings of length 2, which leads to the (incorrect) result
#   p(a|ba) = p(baa)/p(ba) = (1/4)/(1/5) = 1.25    (Oops.)
#
# With the cyclic boundary there are always length(text) substrings
# of any length, and this instead becomes
#   p(a|ba) = p(baa)/p(ba) = (1/6)/(1/6) = 1.0     (Correct)
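#
# As a worked check of the n=1 case (this is the same number as the
# entropy("aabbaa", 1) doctest above): with cyclic wrapping, 'aabbaa'
# has single-character probabilities
#   p(a) = 4/6,   p(b) = 2/6
# and two-character probabilities
#   p(aa) = 3/6,  p(ab) = 1/6,  p(bb) = 1/6,  p(ba) = 1/6
# so
#   H_1 = - [ (3/6) log_2( (3/6)/(4/6) ) + (1/6) log_2( (1/6)/(4/6) )
#             + (1/6) log_2( (1/6)/(2/6) ) + (1/6) log_2( (1/6)/(2/6) ) ]
#       = 0.2075 + 0.3333 + 0.1667 + 0.1667
#       = 0.8742  (to four places)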
#
# ------------------------------------------------------

from sys import stdin, stdout
from random import random
from math import log
import os

def cmd_line(string):
    """Return as list of words result of executing string as unix command."""
    return os.popen(string).read().split()

def flush():
    """Send text to stdout even if line isn't complete."""
    stdout.flush()

def read(filename=""):
    """Return text from file, or from stdin if file omitted."""
    if filename:
        file = open(filename)
    else:
        file = stdin
    return file.read()

def write(text, filename=""):
    """Write text to file, or to stdout if file omitted."""
    if filename:
        file = open(filename, 'w')
    else:
        file = stdout
    file.write(text)

def alphanumeric(text):
    """Return text with all non-alphanumeric (a-z, A-Z, 0-9) characters removed."""
    chars = [c for c in text if
             'a' <= c <= 'z' or 'A' <= c <= 'Z' or '0' <= c <= '9']
    return ''.join(chars)

def histogram(text, n=1):
    """Return dictionary of frequencies of strings of length n,
       using the cyclic boundary condition.
         >>> histogram("aabc")
         {'a': 2, 'c': 1, 'b': 1}
         >>> histogram("aabaacab", 2)
         {'aa': 2, 'ac': 1, 'ab': 2, 'ca': 1, 'ba': 2}
    """
    length = len(text)
    text = text + text[:n]                  # wrap beginning around to end
    counts = {}
    for i in range(length):
        string = text[i:i+n]
        counts[string] = counts.get(string, 0) + 1
    return counts

def probabilities(text, n=1):
    """Return probabilities of strings of length n in text."""
    length = len(text)
    h = histogram(text, n)
    p = {}
    for string in h:
        p[string] = float(h[string]) / length
    return p

def entropy(text, order=0):
    """Return the information entropy (base 2) of text to the given order.
       The definition used here is
         y       = one character
         x       = string of length order
         xy      = string of length order+1
         entropy = sum_x { p(x) * sum_y { - p(y|x) * log_2( p(y|x) ) }}
                 = - sum_xy { p(x) * p(y|x) * log_2( p(y|x) ) }
                 = - sum_xy { p(xy) * log_2( p(xy)/p(x) ) }.
    """
    result = 0.0
    if order == 0:
        p = probabilities(text)
        for char in p:
            result += - p[char] * log(p[char], 2)
    else:
        # p(y|x) = probability of character y given preceding string x
        #        = p(xy)/p(x) where x is a string of length 'order'
        p_x = probabilities(text, order)
        # print p_x
        p_xy = probabilities(text, order+1)
        # print p_xy
        for xy in p_xy:
            x = xy[:-1]
            result += - p_xy[xy] * log(p_xy[xy] / p_x[x], 2)
    return result

def markov_stream(character_counts):
    """Generate a 0'th order markov stream given a dictionary of
       character counts (or probabilities; values are normalized by their sum)."""
    total = reduce(lambda x, y: x + y, character_counts.values(), 0)
    boundary = {}
    probability = {}
    mark = 0.0
    chars = character_counts.keys()
    for char in chars:
        probability[char] = float(character_counts[char]) / total
        mark += probability[char]
        boundary[char] = mark
    while True:
        r = random()
        i = 0
        while r > boundary[chars[i]]:
            i += 1
        yield chars[i]

def markov_list(character_counts, n_chars):
    """Return a list of n_chars characters drawn from markov_stream(character_counts)."""
    i = 0
    result = []
    for char in markov_stream(character_counts):
        i += 1
        result.append(char)
        if i >= n_chars:
            return result

def markov_string(char_counts, n):
    """Return a string of n characters drawn from markov_stream(char_counts)."""
    return ''.join(markov_list(char_counts, n))

def _test():
    from doctest import testmod
    testmod()

if __name__ == "__main__":
    _test()
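
# ------------------------------------------------------
# A minimal usage sketch: entropy_profile() below (the name and its
# default max_order are choices made here for illustration, not part
# of the original interface) simply loops entropy() over increasing
# orders, e.g. to see how much a text's apparent entropy drops as
# more preceding context is taken into account.

def entropy_profile(text, max_order=3):
    """Return a list of (order, entropy) pairs for orders 0 through max_order.
       For example, with letters.txt from the docstring above,
         entropy_profile(alphanumeric(read('letters.txt')), 2)
       starts with (0, 1.5304930567574826), the same value as entropy(text).
    """
    return [(order, entropy(text, order)) for order in range(max_order + 1)]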