# compression.py
"""
 Create a file with random (markov 0th order) characters,
 analyze how much it can be compressed,
 and print out a summary.

 The symbol frequences, filename, and file size
 must all be defined below before the program is run.

 Sample output from running the program :

  $ python compression.py 

   --- make_random_file --- 
   working ...  done.
   probabilities are
          \n :   8 
       space :  64 
           0 :  16 
           1 :   1 
           2 :   1 
           3 :   1 
           4 :   4 
           5 :   4 
           6 :   4 
           7 :   8 
           8 :   8 
           9 :  16 
           a :  32 
           b :  32 
           c :  32 
           d :  32 
   filename   : random.txt
   n symbols  : 16
   -- 0'th order --
     entropy per symbol base 16 = 0.823468
     entropy per symbol base  2 = 3.293870
       = smallest bits per symbol possible.
     entropy per bit base 2     = 0.411734
       = best 0'th compression if 8 bits per symbol.
     file size  : 10000000 bytes
     0th order  : 4117338 bytes
   -- higher orders --
     The statistics are at best OK to order log_16(1.0e+07) - 1 =  4.81 
     order 1 : 0.411732 
     order 2 : 0.411701 
     order 3 : 0.411192 
     order 4 : 0.405720 
     order 5 : 0.373826 
     order 6 : 0.280604 
       (Thinking of these as compression ratios isn't entirely correct 
        since they ignore the size of the symbol table, 
        which as the order grows becomes huge. In the (ridiculous) limit,
        limit the whole message is one 'symbol', which appears with 
        probability 1, but the symbol table to encode it is the size 
        of the whole file.)
   -- compression utilities -- 
     gzip size  : 4799221 bytes
     bzip2 size : 4623095 bytes

 And a sample of the random file created :
  
  $ head -10 random.txt
  bc9 b cb aa  bd d bbbda a   d0acc9c0d b6ca0d bd  dac5 aa8c 06 0cb9 c 9 0a \
  addbaa  d 50a db9bb  a b7ab 5678 5  d6bc c d8a70dbba   c 6
  d  ca 
  ddb7 abc9baa
  8c 7cb a bb849 c9b  10   ab59abb8bcccbcbcb bc a8a b c c  a 5 bd0d8db \
  a6a9bdcbaa6a9a  6 a dd  58 bb b  cbc 0c a 0ca3cdd 0 cdaac4915a0
    da c ac     7 bc 7bd 0a07
  d9cdb0a   dc8   aab 9c ddc7ab496cb7aadc96 7d079cacb9d8b9b58a0ab 90 80ca b5
  db 7dbb acb97
  bd c 
  3d8
  dd7dacd 0c0  b9 ddcb 0d0 ad7b9bd dcc  25 9

 (I added \ as a line continuation; they're not in the original.)

 Jim Mahoney (mahoney@marlboro.edu) | Jan 9 2009 | GPL
"""

# --- start parameters -----------------

n = 1e7                    # file size
filename = 'random.txt'     
freqs = {                  # symbols and their relative probabilities
  '0' :   1,
  '1' :   1,    
  '2' :   1,
  '3' :   1,
  '4' :   4,
  '5' :   4,
  '6' :   4,
  '7' :   8,
  '8' :   8,
  '9' :  16,
  '0' :  16,
  'a' :  32,
  'b' :  32,
  'c' :  32,
  'd' :  32,
  ' '  : 64,
  "\n" : 8,
  }

# --- end parameters ---------------------

from entropy import *
from math import log

n_sym = len(freqs)

print " --- make_random_file --- "
print " working ...",
flush()

text = markov_string(freqs, n)
write(text, filename)
e0 = entropy(text)
bytes = cmd_line("wc %s" % filename)[2] # [words, lines, bytes, filename]

print " done."
print " probabilities are"
for char in sorted(freqs.keys()):
  if char == ' ':
    symbol = 'space'
  elif char == "\n":
    symbol = "\\n"
  else:
    symbol = char
  print "     %5s : %3i " % (symbol, freqs[char])
print " filename   : %s" % filename
print " n symbols  : %i" % n_sym
print " -- 0'th order --"
print "   entropy per symbol base %2i = %f" % (n_sym, e0 * log(2, n_sym))
print "   entropy per symbol base  2 = %f" % e0
print "     = smallest bits per symbol possible."
print "   entropy per bit base 2     = %f" % (e0/8)
print "     = best 0'th compression if 8 bits per symbol."
print "   file size  : %s bytes" % bytes
print "   0th order  : %i bytes" % int(float(n)*e0/8)
print " -- higher orders --"
print "   The statistics are at best OK to order log_%i(%4.1e) - 1 = %5.2f " \
      % (n_sym, float(n), log(float(n),n_sym)-1)
for order in range(1,7):
  e_n = entropy(text, order)
  print "   order %i : %f " % (order, e_n/8)
print "     (Thinking of these as compression ratios isn't entirely correct "
print "      since they ignore the size of the symbol table, "
print "      which as the order grows becomes huge. In the (ridiculous) limit,"
print "      limit the whole message is one 'symbol', which appears with "
print "      probability 1, but the symbol table to encode it is the size "
print "      of the whole file.)"
print " -- compression utilities -- "

# gzip's compression algorithm is LZ77
cmd_line("cp %s tmp" % filename)
cmd_line("gzip -f tmp")
bytes_gz = cmd_line("wc tmp.gz")[2]
print "   gzip size  : %s bytes" % bytes_gz

# bzip2's compression algorithm is burrows-wheeler and huffman
cmd_line("cp %s tmp" % filename)
cmd_line("bzip2 -f tmp")
bytes_bz = cmd_line("wc tmp.bz2")[2]
print "   bzip2 size : %s bytes" % bytes_bz

cmd_line("rm -f tmp*")