""" conditional.py Read 1's and 0's in stream*.txt; print analysis of conditional probabilities, and entropy based on conditional probabilities. The (long) outputs are in these files print_conditionals() => conditional_probabilities.txt print_entropies() => conditional_entropies.txt Though there are lots of numbers in those files, which helped me see that the calculations were good, the upshot can be summarized in this way : --- probabilities --- In stream1.txt, the bit probabilities are essentially independent. That means that P(0), P(1) alone are a good model for the data. Looking at priors, for example P(0|1) and P(0|0), don't give any extra information. stream1.txt : P(0) = 0.2496337890625 P(1) = 0.7503662109375 P(0|0)=0.2508557457212714 ~ P(0|1)=0.24923740187904178 ~ P(0) P(1|0)=0.7491442542787287 ~ P(1|1)=0.7507625981209582 ~ P(1) But in stream2.txt, the conditional probabilities for 1 prior bit matter. P(0) = 0.396942138671875 P(1) = 0.603057861328125 P(0|0) = 0.048358576151303144 DIFFERENT_THAN P(0|1) = 0.626417004048583 P(1|0) = 0.9516414238486969 DIFFERENT_THAN P(1|1) = 0.37358299595141703 Putting in more priors doesn't change this model : P(0|00)=0.04292527821939587 ~ P(0|10) = 0.04863467442236226 ~ P(0|0)=0.048 and similar for the other ones. --- entropies based on those models --- The entropy follows a similar pattern. In stream1.txt, just looking at P(0) an P(1) gives a good entropy; using more priors doesn't change it. H_1 = entropy using 0 priors (i.e. P(0) & P(1)) is 0.8106971777397934 H_2 = entropy using 1 prior is 0.8107073860770417 The formula I'm using for H_2 weights the log2( conditional_probability ) with the corresponding joint probability, all of which add to 1. H_1 = - P(0) * log2( P(0) ) - P(1) * log2( P(1) ) H_2 = - P(01) * log2(P(1|0)) - P(11) * log2(P(1|1)) - P(10) * log2(P(0|1)) - P(00) * log2(P(0|0)) - where for example P(1|0) means "the probability of a 1 given that a 0 came right before it", which is a conditional probability while P(01) means "the probability of the string 01", which is a joint probability. In both cases, this is the calculation of the entropy for a single bit; that's the log2(P(1|....)) part, since it's the "1" that we're using inside the probability piece. The other parts are given conditions or weights which add to 1. But after all that work, the numbers are almost the same - no improvement. Using priors (0, 1, 2, 3) to find (H_1, H_2, H_3, H_4) successive measures of H, we get (rounding off to save typing space) stream1 : (0.810697, 0.810707, 0.810663, 0.810460) Looking at the same series of H based on different probability models for stream2 gives stream2 : (0.969133, 0.685836, 0.685812, 0.6858186) and this time we see an abrupt jump from H_1 (using the P(0) and P(1) only model, which is not a good model here) to the conditional probabilities with 1 prior, P(0|1), P(0|0), P(1|0), P(1|1). This is a better model, and so the entropy goes down, because we can predict better what the next bit will be. Using more priors (things like P(0|00)) doesn't help; the entropy stays about the same. -- conclusion -- This all means that with a good compression algorithm, we should be able to compress stream1 by a factor of about 0.82 or so, making it 18% smaller. And we should be able to do even better with stream2, compressing it by a factor of 0.69 or so, making it 31% smaller. 

 Jim Mahoney | cs.marlboro.edu | Jan 2017 | MIT License
"""

from utilities import file_to_string
from math import log2

def make_testfile():
    """ Create a file of test bits (well, OK, ascii 1's and 0's)
        and return its filename. """
    bits = '1111\n0101\n'              # Newlines will be ignored when read in.
    testfilename = 'testbits.txt'
    with open(testfilename, 'w') as testfile:
        testfile.write(bits)
    return testfilename

def conditional_probabilities(bits, n_priors):
    """ Given a string of bits and an integer n_priors, return a dict of dicts
        { B : {A: Pa, ...}, ... } where A is a character in the string,
        B is the substring of length n_priors immediately preceding A,
        and Pa is the conditional probability P(A|B).
        >>> testfilename = make_testfile()
        >>> bits = file_to_string(testfilename)
        >>> for priors in range(3):
        ...     conditionals = conditional_probabilities(bits, priors)
        ...     print(str_format_conditionals(conditionals))
        P(0) = 0.25
        P(1) = 0.75
        P(1|0) = 1.0
        P(0|1) = 0.4
        P(1|1) = 0.6
        P(0|01) = 1.0
        P(1|10) = 1.0
        P(0|11) = 0.3333333333333333
        P(1|11) = 0.6666666666666666
    """
    result = {}
    # First find the integer counts of how many of each (b, a) there are.
    for i in range(len(bits)):
        if i >= n_priors:                      # if there are n prior characters
            a = bits[i]
            b = bits[i - n_priors : i]
            if b not in result:
                result[b] = {}
            result[b][a] = result[b].get(a, 0) + 1
    # Then normalize the counts by dividing by the total for each b value.
    for b in result.keys():
        total_b = sum(result[b].values())
        for a in result[b].keys():
            result[b][a] /= total_b
    return result

def str_format_conditionals(conditionals):
    """ Return a nicely formatted string of conditional probabilities. """
    result = ''
    for b in sorted(conditionals.keys()):
        for a in sorted(conditionals[b]):
            if b == '':                        # Only happens when n_priors=0.
                result += 'P({}) = {}\n'.format(a, conditionals[b][a])
            else:
                result += 'P({}|{}) = {}\n'.format(a, b, conditionals[b][a])
    return result[:-1]                         # omit last newline character

def analyze_conditionals(filename):
    print('--- {} ---\n'.format(filename))
    for n_priors in (0, 1, 2, 3):
        bits = file_to_string(filename)
        p = conditional_probabilities(bits, n_priors)
        print(str_format_conditionals(p))
        print()

def joint_probabilities(bits, length):
    """ Return a dict of joint probabilities {substring: probability}
        for (overlapping) substrings of the given length.
        >>> testfilename = make_testfile()
        >>> bits = file_to_string(testfilename)
        >>> joint2 = joint_probabilities(bits, 3)   # substrings of length 3
        >>> for item in sorted(joint2.items()):
print("{} : {:.3f}".format(item[0], item[1])) 010 : 0.167 101 : 0.333 110 : 0.167 111 : 0.333 """ # print("debug joint: bits = {}".format(bits)) joint = {} n_symbols = len(bits) - length + 1 # how many (overlapping) susbtrings for offset in range(0, n_symbols): symbol = bits[offset : offset+length] # print("debug joint: symbol = {}".format(symbol)) # debug joint[symbol] = joint.get(symbol, 0) + 1 for symbol in joint.keys(): joint[symbol] /= n_symbols return joint def analyze_entropy(filename): print('--- entropy in {} ---\n'.format(filename)) for n_priors in (0, 1, 2, 3): bits = file_to_string(filename) p = conditional_probabilities(bits, n_priors) joint = joint_probabilities(bits, 1+n_priors) # p(a|b) is conditional probabilty ; joint(ba) is joint probability h = 0.0 # entropy = - sum of joint(ba) * log2( p(a|b) ) for b in sorted(p.keys()): for a in sorted(p[b]): ba = b + a print(" p({}|{}) = {} ; p({}) = {}".format( a, b, p[b][a], ba, joint[ba])) hh = - log2( p[b][a] ) print(" entropy of {}|{} = -log2(p(a|b)) = {};".format(a, b, hh)) print(" weighted contribution = {}".format(joint[ba] * hh)) print() h += joint[ba] * hh print(" entropy with n_priors={} is {} ".format(n_priors, h)) print() def print_conditionals(): analyze_conditionals('stream1.txt') analyze_conditionals('stream2.txt') def print_entropies(): analyze_entropy('stream1.txt') analyze_entropy('stream2.txt') if __name__ == '__main__': import doctest doctest.testmod()