""" apr_16_homework.py (a) Find the compression fraction for moby dick using Huffman coding. $ python apr_16_homework.py --- analyzing moby dick symbols with a Huffman code --- char count prob len code ---- -------- -------- ---- -------------------- \n 22446 0.01873 6 111010 \s 194695 0.16242 3 110 ! 1741 0.00145 10 1110111111 " 2856 0.00238 9 111011110 $ 2 0.00000 19 1111110010000100000 & 2 0.00000 19 1111110010000100001 ' 2848 0.00238 9 111011011 ( 200 0.00017 13 1111110010010 ) 200 0.00017 13 1111110010001 * 45 0.00004 15 011011111001110 , 18947 0.01581 6 100110 - 5740 0.00479 8 11101110 . 7250 0.00605 8 11111101 0 123 0.00010 14 11111100100111 1 123 0.00010 13 0110111001010 2 54 0.00005 15 111111001001100 3 45 0.00004 15 011011111001111 4 34 0.00003 15 011011111001101 5 52 0.00004 15 111111001000011 6 31 0.00003 16 1111110010011010 7 47 0.00004 15 111111001000000 8 47 0.00004 15 111111001000001 9 31 0.00003 16 1111110010011011 : 192 0.00016 13 1110110100111 ; 4144 0.00346 8 01100011 ? 999 0.00083 10 0110001011 A 2378 0.00198 9 011011110 B 1364 0.00114 10 1110110011 C 1042 0.00087 10 0110111000 D 661 0.00055 11 11101100101 E 908 0.00076 11 11111100101 F 732 0.00061 11 11101101010 G 575 0.00048 11 01101110011 H 1294 0.00108 10 1110110001 I 3344 0.00279 9 111111000 J 243 0.00020 12 011011100100 K 140 0.00012 13 0110111110010 L 737 0.00061 11 11101101011 M 675 0.00056 11 11101101000 N 981 0.00082 10 0110001000 O 775 0.00065 11 11101111100 P 991 0.00083 10 0110001001 Q 320 0.00027 12 111011010010 R 640 0.00053 11 11101100100 S 1966 0.00164 10 1111110011 T 2203 0.00184 9 011011101 U 179 0.00015 13 1110110100110 V 134 0.00011 13 0110111001011 W 1210 0.00101 10 0110111111 X 15 0.00001 17 11111100100001001 Y 283 0.00024 12 011011111000 Z 34 0.00003 15 011011111001100 [ 2 0.00000 19 1111110010000100010 ] 2 0.00000 19 1111110010000100011 _ 26 0.00002 16 1111110010000101 a 74115 0.06183 4 1000 b 15238 0.01271 7 1111111 c 21100 0.01760 6 111000 d 36996 0.03086 5 10010 e 114117 0.09520 3 000 f 19743 0.01647 6 100111 g 19918 0.01662 6 101000 h 60487 0.05046 4 0010 i 61039 0.05092 4 0011 j 818 0.00068 11 11101111101 k 7798 0.00651 7 0110000 l 41315 0.03447 5 10101 m 22230 0.01855 6 111001 n 63573 0.05304 4 0101 o 67362 0.05620 4 0111 p 15970 0.01332 6 011001 q 1224 0.00102 10 1110110000 r 50519 0.04215 5 11110 s 61139 0.05100 4 0100 t 84350 0.07037 4 1011 u 26137 0.02180 6 111110 v 8285 0.00691 7 0110110 w 20565 0.01716 6 101001 x 991 0.00083 10 0110001010 y 16321 0.01362 6 011010 z 589 0.00049 11 01101111101 Mean code length is 4.51208 bits. Compression factor = mean_length/8 = 0.56401. (b) Compress with a utility and compare. Here's what I get with gzip which uses LZ77 compression. (See for example https://en.wikipedia.org/wiki/LZ77_and_LZ78.) $ gzip --best --keep moby_dick.txt $ ls -al mo* -rw-r--r-- 1 mahoney staff 1198687 Jan 26 2018 moby_dick.txt -rw-r--r-- 1 mahoney staff 488609 Jan 26 2018 moby_dick.txt.gz So the compression ratio that gives is 488609/1198687 = 0.40762. LZ77 does better by taking advantage of patterns longer than one character, unlike Huffman which uses only symbol freqency counts, not patterns of multiple symbols. Jim Mahoney | cs.marlboro.college | April 2019 | MIT License """ from huffman import Huffman def get_counts(text): """ Return dictionary of {char1:count1, char2:count2, ...} from text. >>> counts = get_counts('ababa') >>> sorted(list(counts.items())) [('a', 3), ('b', 2)] """ counts = {} for char in text: counts[char] = counts.get(char, 0) + 1 return counts def main(): print("--- analyzing moby dick symbols with a Huffman code ---") with open('moby_dick.txt', 'r') as file: text = file.read() counts = get_counts(text) n = sum(counts.values()) huffman = Huffman(counts) code = huffman.huffman_code format_string = " {:<4} {:>8} {:>8} {:>4} {:<20}" format_string2 = " {:<4} {:>8} {:>8.5f} {:>4} {:<20}" print(format_string.format('char', 'count', 'prob', 'len', 'code')) print(format_string.format('-'*4, '-'*8, '-'*8, '-'*4, '-'*20)) for sym in sorted(code.keys()): if sym == '\n': print_sym = '\\n' elif sym == ' ': print_sym = '\\s' elif sym == '\t': print_sym = '\\t' else: print_sym = sym print(format_string2.format(print_sym, counts[sym], counts[sym]/n, len(code[sym]), code[sym])) mean_length = 0.0 for sym in code: mean_length += len(code[sym]) * counts[sym]/n print() print("Mean code length is {:.5f} bits.".format(mean_length)) print("Compression factor = mean_length/8 = {:.5f}.".format( mean_length/8)) if __name__ == '__main__': import doctest doctest.testmod() main()