# compression.py """ Create a file with random (markov 0th order) characters, analyze how much it can be compressed, and print out a summary. The symbol frequences, filename, and file size must all be defined below before the program is run. Sample output from running the program : $ python compression.py --- make_random_file --- working ... done. probabilities are \n : 8 space : 64 0 : 16 1 : 1 2 : 1 3 : 1 4 : 4 5 : 4 6 : 4 7 : 8 8 : 8 9 : 16 a : 32 b : 32 c : 32 d : 32 filename : random.txt n symbols : 16 -- 0'th order -- entropy per symbol base 16 = 0.823468 entropy per symbol base 2 = 3.293870 = smallest bits per symbol possible. entropy per bit base 2 = 0.411734 = best 0'th compression if 8 bits per symbol. file size : 10000000 bytes 0th order : 4117338 bytes -- higher orders -- The statistics are at best OK to order log_16(1.0e+07) - 1 = 4.81 order 1 : 0.411732 order 2 : 0.411701 order 3 : 0.411192 order 4 : 0.405720 order 5 : 0.373826 order 6 : 0.280604 (Thinking of these as compression ratios isn't entirely correct since they ignore the size of the symbol table, which as the order grows becomes huge. In the (ridiculous) limit, limit the whole message is one 'symbol', which appears with probability 1, but the symbol table to encode it is the size of the whole file.) -- compression utilities -- gzip size : 4799221 bytes bzip2 size : 4623095 bytes And a sample of the random file created : $ head -10 random.txt bc9 b cb aa bd d bbbda a d0acc9c0d b6ca0d bd dac5 aa8c 06 0cb9 c 9 0a \ addbaa d 50a db9bb a b7ab 5678 5 d6bc c d8a70dbba c 6 d ca ddb7 abc9baa 8c 7cb a bb849 c9b 10 ab59abb8bcccbcbcb bc a8a b c c a 5 bd0d8db \ a6a9bdcbaa6a9a 6 a dd 58 bb b cbc 0c a 0ca3cdd 0 cdaac4915a0 da c ac 7 bc 7bd 0a07 d9cdb0a dc8 aab 9c ddc7ab496cb7aadc96 7d079cacb9d8b9b58a0ab 90 80ca b5 db 7dbb acb97 bd c 3d8 dd7dacd 0c0 b9 ddcb 0d0 ad7b9bd dcc 25 9 (I added \ as a line continuation; they're not in the original.) Jim Mahoney (mahoney@marlboro.edu) | Jan 9 2009 | GPL """ # --- start parameters ----------------- n = 1e7 # file size filename = 'random.txt' freqs = { # symbols and their relative probabilities '0' : 1, '1' : 1, '2' : 1, '3' : 1, '4' : 4, '5' : 4, '6' : 4, '7' : 8, '8' : 8, '9' : 16, '0' : 16, 'a' : 32, 'b' : 32, 'c' : 32, 'd' : 32, ' ' : 64, "\n" : 8, } # --- end parameters --------------------- from entropy import * from math import log n_sym = len(freqs) print " --- make_random_file --- " print " working ...", flush() text = markov_string(freqs, n) write(text, filename) e0 = entropy(text) bytes = cmd_line("wc %s" % filename)[2] # [words, lines, bytes, filename] print " done." print " probabilities are" for char in sorted(freqs.keys()): if char == ' ': symbol = 'space' elif char == "\n": symbol = "\\n" else: symbol = char print " %5s : %3i " % (symbol, freqs[char]) print " filename : %s" % filename print " n symbols : %i" % n_sym print " -- 0'th order --" print " entropy per symbol base %2i = %f" % (n_sym, e0 * log(2, n_sym)) print " entropy per symbol base 2 = %f" % e0 print " = smallest bits per symbol possible." print " entropy per bit base 2 = %f" % (e0/8) print " = best 0'th compression if 8 bits per symbol." print " file size : %s bytes" % bytes print " 0th order : %i bytes" % int(float(n)*e0/8) print " -- higher orders --" print " The statistics are at best OK to order log_%i(%4.1e) - 1 = %5.2f " \ % (n_sym, float(n), log(float(n),n_sym)-1) for order in range(1,7): e_n = entropy(text, order) print " order %i : %f " % (order, e_n/8) print " (Thinking of these as compression ratios isn't entirely correct " print " since they ignore the size of the symbol table, " print " which as the order grows becomes huge. In the (ridiculous) limit," print " limit the whole message is one 'symbol', which appears with " print " probability 1, but the symbol table to encode it is the size " print " of the whole file.)" print " -- compression utilities -- " # gzip's compression algorithm is LZ77 cmd_line("cp %s tmp" % filename) cmd_line("gzip -f tmp") bytes_gz = cmd_line("wc tmp.gz")[2] print " gzip size : %s bytes" % bytes_gz # bzip2's compression algorithm is burrows-wheeler and huffman cmd_line("cp %s tmp" % filename) cmd_line("bzip2 -f tmp") bytes_bz = cmd_line("wc tmp.bz2")[2] print " bzip2 size : %s bytes" % bytes_bz cmd_line("rm -f tmp*")