""" utilities.py tested with python 3.5.2 Jim Mahoney | cs.marlboro.edu | Feb 2017 | MIT License """ from math import log2 def file_to_string(filename): """ Return the text from a file as one string, with newlines omitted. """ return open(filename).read().replace('\n', '') def string_to_substrings(thestring, substringsize): """ Return list of consecutive substrings of thestring of given size. If len(thestring) doesn't evenly divide substringsize, then the last chunk of thestring which is smaller than substringsize will be ignored. >>> string_to_substrings('1234', 2) ['12', '34'] >>> string_to_substrings('12345', 2) ['12', '34'] """ substrings = [] offset = 0 while offset + substringsize <= len(thestring): substrings.append(thestring[offset : offset + substringsize]) offset += substringsize return substrings def file_to_substrings(filename, substringsize): """ Open file, get text, convert to chunks of given size """ return string_to_substrings(file_to_string(filename), substringsize) def probabilities(symbols): """ Given a list of strings, return a probability dictionary i.e. { string: p(string), ...} >>> sorted( probabilities(['a', 'a', 'b', 'c']).items() ) [('a', 0.5), ('b', 0.25), ('c', 0.25)] """ # No conditional probability here - just counting times for each symbol. probability = {} for symbol in symbols: # count 'em probability[symbol] = probability.get(symbol, 0) + 1 for symbol in probability.keys(): # normalize probability[symbol] /= float(len(symbols)) return probability def mean(numbers): """ Return the mean of a list of numbers """ return sum(numbers)/len(numbers) def entropy(probability): """ Return the entropy of a set of probabilities """ h = 0.0 for p in probability.values(): h += - p * log2(p) return h if __name__ == '__main__': import doctest doctest.testmod()