/********* * Using a hash to count words in a file. * An exercise in the Algorithms class. * * Jim Mahoney (mahoney@marlboro.edu) * * -------------------------------- * * version 0.3 * - fixed "seg fault" errors in word reading routines. * * version 0.2, Oct 24 2002 * - words (a-z A-Z - _) can be read in from a text file one by one * - hash table data structure * ***********/ #define VERSION "version 0.3, Oct 24 2002" #include #include #include // ==== typedef's, define's, and all that. ===================== // (May go in a .h file later) // The file I'm using has about 5000 distinct words. // The size of the hash table should be something bigger than that. // And it should be prime. (See hashFunction(), below.) // Give up trying to add things to the hash if it gets bigger than bailout. #define HASH_SIZE 20011 #define HASH_BAILOUT_SIZE 10000 #define TRUE 1 #define FALSE 0 #define INPUT_BUFFER_SIZE 80 // File globals char* fileName = "bskrv10.txt"; FILE* theFile; char inputBuffer[INPUT_BUFFER_SIZE]; char* nextCharPtr; // Data structure typedef struct DataStruct_ { char* key; // i.e. "Sherlock" int value; // Times it occurs in the file. // for hash table analysis. int skips; // how many times this entry collided before finding a home. int index; // where it is in the hash table // Hey! It's a doubly linked list, too! struct DataStruct_* next; struct DataStruct_* prev; } DataStruct, *Data; //typedef struct ListSectionStruct_ { // Data head; // Data tail; //} ListSectionStruct, *ListSection; // The hash table. // A big array of pointers to data structures. // (And they're linked into a list, too.) int nHashEntries; Data hash[HASH_SIZE]; Data listHead; Data listTail; // Subroutine declarations. void initFileStuff(); char* getNextWord(); int isWordChar(char c); double timeInSec(); int hashFunction(const char *key); int skipFunction(const char *key); void initHash(); Data fetchData(char* key); Data createData(char* key); void printData(Data d); void printHashFunc(char* key); void testHashFunc(); void testHash(); void testWords(); void countWordsInFile(); void showHashStats(); void showSomeOfList(Data d, int howMany); void swap(Data a, Data b); void quickSort(ListSection ls); // ==== subroutine implementations ============================== //--------------- void initHash(){ int n; for (n=0;nkey = key; d->value = 0; d->skips = 0; d->index = -1; d->next = NULL; d->prev = NULL; return d; } //------ // Insert a new data structure at index h in the hash table. // Update list pointers, head, tail, and number of entries. void addToHash(Data d, int h){ // printf(" adding "); printData(d); // printf(" to hash at index %i \n",h); if (nHashEntries == 0){ listHead = d; listTail = d; } else { listTail->next = d; d->prev = listTail; listTail = d; } nHashEntries++; d->index=h; hash[h] = d; } //----------- // Given a key, return the corresponding Data structure from the hash. // If there isn't one in the hash, create one and add it in. Data fetchData(char* key){ int h; int skip; int nskips; Data d; // printf(" fetching '%s' ", key); h = hashFunction(key); // printf(" h = %i \n", h); d = hash[h]; // printData(d); if (d==NULL) { // printf(" Found empty slot; creating new data\n"); d = createData(key); // printf(" new data: "); printData(d); addToHash(d,h); return d; } else if ( strcmp(key,d->key)==0 ) { return d; } else { // --------- collision -------------- // I handle collisions by computing a "skip" value // (essentially another hash function) and then skipping forward // in the hash array by this increment until I find the Data // or an empty slot. if ( nHashEntries >= HASH_BAILOUT_SIZE ) { printf(" *** OOPS **** the has grown too big. Bailing out. \n"); exit(1); } nskips=0; skip = skipFunction(key); while (1) { // keep looking until we find it or find an empty slot. nskips++; h = (h + skip) % HASH_SIZE; d = hash[h]; if (d==NULL) { d = createData(key); addToHash(d,h); d->skips=nskips; return d; } else if ( strcmp(key,d->key)==0 ) { return d; } } } } // --------------------- // Initialize file globals: theFile, inputBuffer, nextCharPtr void initFileStuff(){ theFile = fopen(fileName,"r"); // open read-only inputBuffer[0] = '\0'; nextCharPtr = inputBuffer; } // --------------------- // Test to see if given character is part of a word, // namely that it's either a-z, A-Z, -, or _ int isWordChar(char c){ if ( ( c>='a' && c<='z') || ( c>='A' && c<='Z') || ( c == '-' ) || ( c == '_' ) ){ return TRUE; } else { return FALSE; } } // --------------------- // Return next word from global theFile, using input buffers // and our definition of "word" characters. char* getNextWord(){ int n, nCharsInWord; char thisChar; char* charPtr; char* newWord; //printf(" next word: \n"); while (! isWordChar(*nextCharPtr) ) { // If *nextCharPtr is at the end of a string, // read new stuff into the buffer and reset nextCharPtr if ( *nextCharPtr == '\0') { fscanf(theFile, "%s", inputBuffer); // printf(" new input buffer = '%s'\n", inputBuffer); nextCharPtr = inputBuffer; if (feof(theFile)){ return NULL; } } // If we're looking at text that's not in a word, skip past it. // (But don't read past the end of the string) while ( (! isWordChar(*nextCharPtr)) && ( *nextCharPtr != '\0' ) ){ // printf(" skipping '%c' \n", *nextCharPtr); nextCharPtr++; } } // Now we're at the start of the next word. Count its characters. nCharsInWord=0; while ( isWordChar(*nextCharPtr) ){ // printf(" word contains '%c' \n", *nextCharPtr); nCharsInWord++; nextCharPtr++; } // Allocate space for that word, make it lower case, and copy it. newWord = (char*)malloc(nCharsInWord+1); //printf(" copying the word, %i chars\n", nCharsInWord); for (n=0; n='A') && (thisChar<='Z') ){ thisChar = (char)((int)thisChar + (int)'a' - (int)'A'); } newWord[n] = thisChar; } newWord[nCharsInWord] = '\0'; // end of string marker // printf(" '%s' with %d chars. \n", // newWord,nCharsInWord); return newWord; } // ----------------- // Variation on the hashFunction, for offsets within // the table after collisions. int skipFunction(const char *key){ const char *ptr; int val; int tmp; val = 0; ptr = key; while (*ptr != '\0') { // loop until end of string // printf(" ptr='%c', val=%p \n", *ptr, val); val = (val << 5) + (*ptr); if (tmp = (val & 0xf0000000)) { val = val ^ (tmp >> 24); val = val ^ tmp; } ptr++; // next character. } return abs(val % HASH_SIZE); } // --------------------- // From "Algorithms" text, "int hashpjw(const void *key)" // Note that // << bitwise left shift // & bitwise AND // ^ bitwise XOR int hashFunction(const char *key) { // Hash the key by performing a number of bit operations on it. const char *ptr; int val; int tmp; val = 0; ptr = key; while (*ptr != '\0') { // loop until end of string // printf(" ptr='%c', val=%p \n", *ptr, val); val = (val << 4) + (*ptr); // shift val 6 if (tmp = (val & 0xf0000000)) { // If it's getting too big, val = val ^ (tmp >> 24); // kill the highest bits. val = val ^ tmp; } ptr++; // next character. } return abs(val % HASH_SIZE); // HASH_SIZE should be prime. // (I added the abs(), since I was // seeing some negatives that looked iffy. } // --------------------- // See if we can read in the words of the file. void testWords(){ char* word; int n; initFileStuff(); for (n=0;n<200;n++){ word = getNextWord(); printf(" %i : '%s' \n", n, word); } } // --------------------- void printHashFunc(char* key){ printf(" '%s' => %i, %i \n", key, hashFunction(key), skipFunction(key)); } // --------------------- // A look at some of the values returned by the hash function. void testHashFunc(){ printf(" sample hash function values \n"); printHashFunc("One"); printHashFunc("Two"); printHashFunc("Hello"); printHashFunc("Jello"); printHashFunc("GoodBye"); printHashFunc("GoodByi"); printHashFunc("This is a pretty long string"); printHashFunc("a"); printHashFunc("b"); } void printData(Data d){ if (d==NULL){ printf(" Data = NULL \n"); } else { printf(" '%s' : value=%i, next=%p, prev=%p, index=%i \n", d->key, d->value, d->next, d->prev, d->index); } } // ------------- void testHash(){ Data d; printf(" testing hash \n"); d = fetchData("Hi"); printData(d); d = fetchData("Fred"); printData(d); d = fetchData("George"); printData(d); d = fetchData("Fred"); printData(d); } // -------------- double timeInSec(){ return ((double)clock())/CLOCKS_PER_SEC; } // ------------- void countWordsInFile(){ char* word; int nWords; double startTime; Data d; initFileStuff(); printf(" Looking at words in '%s' ...", fileName); startTime=timeInSec(); nWords=0; while (1) { nWords++; word = getNextWord(); // printf("Word %d is '%s'.\n", nWords, word); // if (nWords>300){ // printf(" word limit hit; exiting. \n"); // exit(1); // } if ( word == NULL) { break; } //printf(" Creating Data ..."); d = fetchData(word); //printf(" done.\n"); d->value++; // increment the number of times we've seen this word. } printf(" done.\n Examined %i words in %5.2f sec.\n", nWords, (timeInSec()-startTime)); } //-------------------- void showSomeOfList(Data d, int howMany){ int n; for (n=0;nkey, d->value); d=d->next; if ( d==NULL) { return; } } } //-------------------- void showHashStats(){ int nCollisions; int skipStats; int skipMax; Data d; nCollisions=0; skipStats=0; skipMax=0; d = listHead; while ( d != NULL) { if ( d->skips > 0 ) { nCollisions++; } skipStats += d->skips; if ( d->skips > skipMax ) { skipMax = d->skips; } d = d->next; } printf(" Of %i entries in the %i slot hash, \n", nHashEntries, HASH_SIZE); printf(" %i were collisions, with the largest skip number = %i.\n", nCollisions, skipMax); } //------------------------ //------------------------ void quickSort(Data head, Data tail){ /// *** this isn't working yet *** if (head==tail) { return; } beforeHead = head->prev; afterTail = tail->next; pivot = head; left = head->next; right = tail; while (left != right) { while ( (left->value <= pivot->value) &7 (left != right) ){ left = left->next; } while (right->value > pivot->value){ right = right->prev; } swap(right,left); // always? } // partition boundry has merged; // drop in pivot before or after this final one // call quicksort on both sides // Input: a-b-c-d-e-f // (a) choose first as pivot // (b) partition list // -b-c- -a- -d-e-f- // ones before ones after // (c) call recursively on left and right pieces. } // --head--a--b--left--c--d--e--right--p--q--tail-- // .... < pivot // Implement this as a recursive descent, in place, // swapping-elements-with-edge-elments quicksort. // // (1) The "ListSection" holds the head and tail of a section of a list. // (Null pointers do *not* need to be at the ends.) // If these are the same on entry, just return. // (2) Next, pick a pivot which is the average of the left and right values. // (3) Walk the list, swapping values lower than the pivot to the left end, // and values higher than the pivot to the right end. // Move the "ends" in towards each other, marking // the edges of the unsorted stuff. When the ends meet, we have values // lower than the pivot on the left, and values higher on the right. // ls->head,..,left,right,...,ls->tail // Make sure that swaps change ls->head and ls->tail as need be. // (4) Then construct two smaller ListSections, and do again recursively. // // === Let 'er rip. =============== int main() { //testWords(); //testHashFunc(); //testHash(); printf(" --- Counting words with a hash --- \n"); printf(VERSION); printf("\n"); countWordsInFile(); printf(" Number of unique words put in hash = %i \n", nHashEntries); printf(" First few in the list: \n"); showSomeOfList(listHead,40); showHashStats(); }