"""
 apr_16_homework.py

 (a) Find the compression fraction for moby dick using Huffman coding.

     $ python apr_16_homework.py 
     --- analyzing moby dick symbols with a Huffman code ---
        char     count      prob   len     code                
        ----  --------  --------  ----     --------------------
        \n       22446   0.01873     6     111010              
        \s      194695   0.16242     3     110                 
        !         1741   0.00145    10     1110111111          
        "         2856   0.00238     9     111011110           
        $            2   0.00000    19     1111110010000100000 
        &            2   0.00000    19     1111110010000100001 
        '         2848   0.00238     9     111011011           
        (          200   0.00017    13     1111110010010       
        )          200   0.00017    13     1111110010001       
        *           45   0.00004    15     011011111001110     
        ,        18947   0.01581     6     100110              
        -         5740   0.00479     8     11101110            
        .         7250   0.00605     8     11111101            
        0          123   0.00010    14     11111100100111      
        1          123   0.00010    13     0110111001010       
        2           54   0.00005    15     111111001001100     
        3           45   0.00004    15     011011111001111     
        4           34   0.00003    15     011011111001101     
        5           52   0.00004    15     111111001000011     
        6           31   0.00003    16     1111110010011010    
        7           47   0.00004    15     111111001000000     
        8           47   0.00004    15     111111001000001     
        9           31   0.00003    16     1111110010011011    
        :          192   0.00016    13     1110110100111       
        ;         4144   0.00346     8     01100011            
        ?          999   0.00083    10     0110001011          
        A         2378   0.00198     9     011011110           
        B         1364   0.00114    10     1110110011          
        C         1042   0.00087    10     0110111000          
        D          661   0.00055    11     11101100101         
        E          908   0.00076    11     11111100101         
        F          732   0.00061    11     11101101010         
        G          575   0.00048    11     01101110011         
        H         1294   0.00108    10     1110110001          
        I         3344   0.00279     9     111111000           
        J          243   0.00020    12     011011100100        
        K          140   0.00012    13     0110111110010       
        L          737   0.00061    11     11101101011         
        M          675   0.00056    11     11101101000         
        N          981   0.00082    10     0110001000          
        O          775   0.00065    11     11101111100         
        P          991   0.00083    10     0110001001          
        Q          320   0.00027    12     111011010010        
        R          640   0.00053    11     11101100100         
        S         1966   0.00164    10     1111110011          
        T         2203   0.00184     9     011011101           
        U          179   0.00015    13     1110110100110       
        V          134   0.00011    13     0110111001011       
        W         1210   0.00101    10     0110111111          
        X           15   0.00001    17     11111100100001001   
        Y          283   0.00024    12     011011111000        
        Z           34   0.00003    15     011011111001100     
        [            2   0.00000    19     1111110010000100010 
        ]            2   0.00000    19     1111110010000100011 
        _           26   0.00002    16     1111110010000101    
        a        74115   0.06183     4     1000                
        b        15238   0.01271     7     1111111             
        c        21100   0.01760     6     111000              
        d        36996   0.03086     5     10010               
        e       114117   0.09520     3     000                 
        f        19743   0.01647     6     100111              
        g        19918   0.01662     6     101000              
        h        60487   0.05046     4     0010                
        i        61039   0.05092     4     0011                
        j          818   0.00068    11     11101111101         
        k         7798   0.00651     7     0110000             
        l        41315   0.03447     5     10101               
        m        22230   0.01855     6     111001              
        n        63573   0.05304     4     0101                
        o        67362   0.05620     4     0111                
        p        15970   0.01332     6     011001              
        q         1224   0.00102    10     1110110000          
        r        50519   0.04215     5     11110               
        s        61139   0.05100     4     0100                
        t        84350   0.07037     4     1011                
        u        26137   0.02180     6     111110              
        v         8285   0.00691     7     0110110             
        w        20565   0.01716     6     101001              
        x          991   0.00083    10     0110001010          
        y        16321   0.01362     6     011010              
        z          589   0.00049    11     01101111101         
     
     Mean code length is 4.51208 bits.
     Compression factor = mean_length/8 = 0.56401.

 (b) Compress with a utility and compare.

     Here's what I get with gzip which uses LZ77 compression.
     (See for example https://en.wikipedia.org/wiki/LZ77_and_LZ78.)

       $ gzip --best --keep moby_dick.txt 
       $ ls -al mo*
       -rw-r--r--  1 mahoney  staff  1198687 Jan 26  2018 moby_dick.txt
       -rw-r--r--  1 mahoney  staff   488609 Jan 26  2018 moby_dick.txt.gz

    So the compression ratio that gives is 488609/1198687 = 0.40762.

    LZ77 does better by taking advantage of patterns longer than one character,
    unlike Huffman which uses only symbol freqency counts, not patterns of 
    multiple symbols.

 Jim Mahoney | cs.marlboro.college | April 2019 | MIT License
"""

from huffman import Huffman

def get_counts(text):
    """ Return dictionary of {char1:count1, char2:count2, ...} from text.
        >>> counts = get_counts('ababa')
        >>> sorted(list(counts.items()))
        [('a', 3), ('b', 2)]
    """
    counts = {}
    for char in text:
        counts[char] = counts.get(char, 0) + 1
    return counts

def main():
    print("--- analyzing moby dick symbols with a Huffman code ---")
    with open('moby_dick.txt', 'r') as file:
        text = file.read()
    counts = get_counts(text)
    n = sum(counts.values())
    huffman = Huffman(counts)
    code = huffman.huffman_code
    format_string = "   {:<4}  {:>8}  {:>8}  {:>4}     {:<20}"
    format_string2 = "   {:<4}  {:>8}  {:>8.5f}  {:>4}     {:<20}"    
    print(format_string.format('char', 'count', 'prob', 'len', 'code'))
    print(format_string.format('-'*4, '-'*8, '-'*8, '-'*4, '-'*20))
    for sym in sorted(code.keys()):
        if sym == '\n':
            print_sym = '\\n'
        elif sym == ' ':
            print_sym = '\\s'
        elif sym == '\t':
            print_sym = '\\t'
        else:
            print_sym = sym
        print(format_string2.format(print_sym,
                                   counts[sym],
                                   counts[sym]/n,
                                   len(code[sym]),
                                   code[sym]))
    mean_length = 0.0
    for sym in code:
        mean_length += len(code[sym]) * counts[sym]/n
    print()
    print("Mean code length is {:.5f} bits.".format(mean_length))
    print("Compression factor = mean_length/8 = {:.5f}.".format(
        mean_length/8))

if __name__ == '__main__':
    import doctest
    doctest.testmod()
    main()