/***** * huffman.c * * Routines implementing of a huffman code from an analysis of a text file. * Look in run_huffman.c to see how they're used. * * $Id: huffman.c 13167 2007-04-30 03:54:16Z mahoney $ ****************/ #include #include #include #include #include #include "utility.h" #include "heap.h" #include "huffman.h" // Number of characters to read from file in one gulp. #define BUFFER_SIZE 1000 // Given a code (e.g. "001" and a digit to append (e.g. 1), // return a pointer to a new longer code (e.g. "0011") char* new_code(char* old_code, int digit){ int length = 1 + strlen(old_code); char* code = (char*) safe_malloc((length+1)*sizeof(char)); char new_digit; strcpy(code, old_code); code[length-1] = (char)(digit+(int)'0'); code[length] = (char)0; // string termination return code; } // Recursively walk down the huffman tree. "code" is the string // "e.g. 001" corresponding to the current node hn; therefore, // strlen(code) is the current depth down the tree. At the terminal // nodes (which have ascii symbols but no children), the code is // stored in codes[ascii]; otherwise, calculate_codes is called // recursively on the children appending appending "0" and "1" to the // code for each branch. void calculate_codes(huffnode hn, char* code, char* codes[]){ int b; if (hn->branches[0] == NULL){ codes[(char)hn->ascii] = code; } else { for (b=0; b<2; b++){ calculate_codes(hn->branches[b], new_code(code,b), codes); } } } huffnode make_huffman_tree(heap the_heap){ int new_weight; node n1 = pop_heap(the_heap); // Pull out two node n2 = pop_heap(the_heap); // smallest nodes. huffnode hn_new, hn1, hn2; while (n1 != NULL && n2 != NULL){ // While nodes left, new_weight = get_node_weight(n1) + get_node_weight(n2); hn1 = (huffnode) get_node_data(n1); // create a new node hn2 = (huffnode) get_node_data(n2); // whose weight is hn_new = new_huffnode(new_weight, (char)0, hn1, hn2); // sum of two pulled, push_heap(the_heap, hn_new->heapnode); // add it to heap, n1 = pop_heap(the_heap); // and then get two n2 = pop_heap(the_heap); // more smallest. } assert(n1 != NULL); // n1 shouldn't be null; exit with error if it is. return (huffnode)get_node_data(n1); } // Input are freqs[i] for each possible symbol i. For those that // aren't 0, a terminal node of the huffman tree is created and placed // in the corresponding part of the nodes array. Empty nodes[i] are // set to NULL. void frequencies2huffnodes(int freqs[], huffnode nodes[]){ int i; for (i=0; iheapnode; n_nodes++; } } return new_heap(n_nodes, nodes, descending); } huffnode new_huffnode(int freq, char c, huffnode left, huffnode right){ huffnode hn = (huffnode) safe_malloc(sizeof(struct huffnode_struct)); hn->heapnode = new_node(freq, hn); hn->ascii = c; hn->branches[0] = left; hn->branches[1] = right; return hn; } // Count how many times each possible symbol occurs in the file with // the given name, and put the results into the the frequencies array. // for each possible symbol. // Ignore the first start_offset characters in the file, // and only count up to max_chars (unless max_chars is negative). void calculate_frequencies(char* filename, int frequencies[], int start_offset, int max_chars){ FILE* file; int i, n_chars; int file_index, chars_examined; char buffer[BUFFER_SIZE]; for (i=0; i start_offset) && ((max_chars < 0) || (max_chars < chars_examined))){ frequencies[(int)buffer[i]]++; chars_examined++; } } } fclose(file); } void print_summary(int freq[], char* codes[]){ int i; printf(" %8s %8s %8s %30s \n", "char", "ascii", "freq", "code"); printf(" %8s %8s %8s %30s \n", "----", "-----", "----", "----"); for (i=0; i<256; i++){ if (freq[i] != 0){ if ((char)i == '\n'){ // display newline as '\n' printf(" 0x%02x '\\n' %8i %30s \n", i, freq[i], codes[i]); } else { printf(" 0x%02x '%c' %8i %30s \n", i, i, freq[i], codes[i]); } } } } void print_analysis(int freq[], char* codes[]){ int i; int n_file_chars = 0; int n_coded_bits = 0; int n_codes = 0; double avg_code_length = 0.0; double avg_square = 0.0; double std_dev; for (i=0; i0){ n_codes++; n_coded_bits += freq[i] * strlen(codes[i]); avg_code_length += 1.0*strlen(codes[i]); avg_square += pow(1.0*strlen(codes[i]),2); n_file_chars += freq[i]; } } avg_code_length = avg_code_length / n_codes; avg_square = avg_square / n_codes; std_dev = sqrt( avg_square - pow(avg_code_length,2)); printf("-- analysis --\n"); printf("Total bytes (bits) in original file is %i (%i).\n", n_file_chars, 8*n_file_chars); printf("Number of different codes = %i\n", n_codes); printf("Average code length = %g with standard deviation = %g \n", avg_code_length, std_dev); printf("Number of coded bits is %i\n", n_coded_bits); printf("Compression ratio (without table) is %g\n", (1.0*n_coded_bits)/(8.0*n_file_chars)); }