/*********
 * Using a hash to count words in a file.
 * An exercise in the Algorithms class.
 *
 * Jim Mahoney (mahoney@marlboro.edu)
 *
 * --------------------------------
 *
 * version 0.3
 *   - fixed "seg fault" errors in word reading routines.
 *
 * version 0.2, Oct 24 2002
 *   - words (a-z A-Z - _) can be read in from a text file one by one
 *   - hash table data structure
 * 
 ***********/
#define VERSION "version 0.3, Oct 24 2002"

#include <stdio.h>
#include <string.h>
#include <time.h>

// ==== typedef's, define's, and all that.  =====================
//      (May go in a .h file later)

// The file I'm using has about 5000 distinct words.
// The size of the hash table should be something bigger than that.
// And it should be prime.  (See hashFunction(), below.)
// Give up trying to add things to the hash if it gets bigger than bailout.
#define HASH_SIZE 20011
#define HASH_BAILOUT_SIZE 10000

#define TRUE 1
#define FALSE 0
#define INPUT_BUFFER_SIZE 80

// File globals
char* fileName = "bskrv10.txt";
FILE* theFile;
char  inputBuffer[INPUT_BUFFER_SIZE];
char* nextCharPtr;

// Data structure
typedef struct DataStruct_ {
  char* key;					// i.e. "Sherlock"
  int	value;					// Times it occurs in the file.
  // for hash table analysis.
  int skips; // how many times this entry collided before finding a home.
  int index; // where it is in the hash table
  // Hey! It's a doubly linked list, too!
  struct DataStruct_*	next;
  struct DataStruct_*	prev;
} DataStruct, *Data;

//typedef struct ListSectionStruct_ {
//  Data head;
//  Data tail;
//} ListSectionStruct, *ListSection;

// The hash table.  
// A big array of pointers to data structures.
// (And they're linked into a list, too.)
int nHashEntries;
Data hash[HASH_SIZE];
Data listHead;
Data listTail;

// Subroutine declarations.
void initFileStuff();
char* getNextWord();
int isWordChar(char c);
double timeInSec();

int hashFunction(const char *key);
int skipFunction(const char *key);
void initHash();
Data fetchData(char* key);

Data createData(char* key);
void printData(Data d);

void printHashFunc(char* key);
void testHashFunc();
void testHash();
void testWords();

void countWordsInFile();
void showHashStats();
void showSomeOfList(Data d, int howMany);

void swap(Data a, Data b);
void quickSort(ListSection ls);

// ==== subroutine implementations ==============================

//---------------
void initHash(){
  int n;
  for (n=0;n<HASH_SIZE;n++){
    hash[n] = NULL;		// Set 'em all to empty.
  }
  nHashEntries=0;
  listHead = NULL;
  listTail = NULL;
}

//---------------
// Given a ptr to a string, create a DataStruct
// with that string as its key.  (This doesn't allocate
// new memory for the string, but only copies the pointer.)
Data createData(char* key){
  Data  d  = (Data)malloc(sizeof(DataStruct));
  d->key   = key;
  d->value = 0;
  d->skips = 0;
  d->index = -1;
  d->next  = NULL;
  d->prev  = NULL;
  return d;
}

//------
// Insert a new data structure at index h in the hash table.
// Update list pointers, head, tail, and number of entries.
void addToHash(Data d, int h){
  // printf(" adding "); printData(d);
  // printf(" to hash at index %i \n",h);
  if (nHashEntries == 0){
    listHead = d;
    listTail = d;
  }
  else {
    listTail->next = d;
    d->prev = listTail;
    listTail = d;
  }
  nHashEntries++;
  d->index=h;
  hash[h] = d;
}

//-----------
// Given a key, return the corresponding Data structure from the hash.
// If there isn't one in the hash, create one and add it in.
Data fetchData(char* key){
  int h;
  int skip;
  int nskips;
  Data d;
  // printf("   fetching '%s' ", key);
  h = hashFunction(key);
  // printf("   h = %i \n", h);
  d = hash[h];
  // printData(d);
  if (d==NULL) {
    // printf(" Found empty slot; creating new data\n");
    d = createData(key);
    // printf(" new data: ");  printData(d);
    addToHash(d,h);
    return d;
  }
  else if ( strcmp(key,d->key)==0 ) {
    return d;
  }
  else {  //  --------- collision --------------
    // I handle collisions by computing a "skip" value
    // (essentially another hash function) and then skipping forward
    // in the hash array by this increment until I find the Data
    // or an empty slot.
    if ( nHashEntries >= HASH_BAILOUT_SIZE ) {
      printf("  *** OOPS **** the has grown too big.  Bailing out. \n");
      exit(1);
    }
    nskips=0;
    skip = skipFunction(key);
    while (1) {	   // keep looking until we find it or find an empty slot.
      nskips++;
      h = (h + skip) % HASH_SIZE;
      d = hash[h];
      if (d==NULL) {
	d = createData(key);
	addToHash(d,h);
	d->skips=nskips;
	return d;
      }
      else if ( strcmp(key,d->key)==0 ) {
	return d;
      }
    }
  }
}

// ---------------------
// Initialize file globals:  theFile, inputBuffer, nextCharPtr
void initFileStuff(){
  theFile = fopen(fileName,"r");	  // open read-only
  inputBuffer[0] = '\0';
  nextCharPtr = inputBuffer;
}

// ---------------------
// Test to see if given character is part of a word, 
// namely that it's either a-z, A-Z, -, or _
int isWordChar(char c){
  if ( ( c>='a' && c<='z') ||
       ( c>='A' && c<='Z') ||
       ( c == '-' ) || ( c == '_' )       ){
    return TRUE;
  }
  else {
    return FALSE;
  }
}

// ---------------------
// Return next word from global theFile, using input buffers 
// and our definition of "word" characters.
char* getNextWord(){
  int n, nCharsInWord;
  char thisChar;
  char* charPtr;
  char* newWord;

  //printf(" next word: \n");

  while (! isWordChar(*nextCharPtr) ) {
    // If *nextCharPtr is at the end of a string, 
    // read new stuff into the buffer and reset nextCharPtr
    if ( *nextCharPtr == '\0') {
      fscanf(theFile, "%s", inputBuffer);
      // printf("  new input buffer = '%s'\n", inputBuffer);
      nextCharPtr = inputBuffer;
      if (feof(theFile)){
	return NULL;
      }
    }
    // If we're looking at text that's not in a word, skip past it.
    // (But don't read past the end of the string)
    while ( (! isWordChar(*nextCharPtr)) && ( *nextCharPtr != '\0' ) ){
      // printf("  skipping '%c' \n", *nextCharPtr);
      nextCharPtr++;
    }
  }
  // Now we're at the start of the next word.  Count its characters.
  nCharsInWord=0;
  while ( isWordChar(*nextCharPtr) ){
    // printf("   word contains '%c' \n", *nextCharPtr);
    nCharsInWord++;
    nextCharPtr++;
  }
  // Allocate space for that word, make it lower case, and copy it.
  newWord = (char*)malloc(nCharsInWord+1);
  //printf("   copying the word, %i chars\n", nCharsInWord);
  for (n=0; n<nCharsInWord; n++){
    thisChar = *(nextCharPtr-nCharsInWord+n);
    if ( (thisChar>='A') && (thisChar<='Z') ){
      thisChar = (char)((int)thisChar + (int)'a' - (int)'A');
    }
    newWord[n] = thisChar;
  }
  newWord[nCharsInWord] = '\0';  // end of string marker
  // printf(" '%s' with %d chars. \n",
  //	 newWord,nCharsInWord);
  return newWord;
}

// -----------------
// Variation on the hashFunction, for offsets within
// the table after collisions.
int skipFunction(const char *key){
  const char *ptr;
  int val;
  int tmp;
  val = 0;
  ptr = key;					
  while (*ptr != '\0') {		// loop until end of string
    // printf("    ptr='%c', val=%p \n", *ptr, val);
    val = (val << 5) + (*ptr);		
    if (tmp = (val & 0xf0000000)) {	
      val = val ^ (tmp >> 24);		
      val = val ^ tmp;
    }
    ptr++;				// next character.
  }
  return abs(val % HASH_SIZE);
}

// ---------------------
// From "Algorithms" text, "int hashpjw(const void *key)"
// Note that 
//            <<   bitwise left shift
//            &    bitwise AND
//            ^    bitwise XOR
int hashFunction(const char *key) {
  // Hash the key by performing a number of bit operations on it.
  const char *ptr;
  int val;
  int tmp;
  val = 0;
  ptr = key;					
  while (*ptr != '\0') {		// loop until end of string
    // printf("    ptr='%c', val=%p \n", *ptr, val);
    val = (val << 4) + (*ptr);		//   shift val 6
    if (tmp = (val & 0xf0000000)) {	//   If it's getting too big,
      val = val ^ (tmp >> 24);		//   kill the highest bits.
      val = val ^ tmp;
    }
    ptr++;				// next character.
  }
  return abs(val % HASH_SIZE);	    // HASH_SIZE should be prime.
				    // (I added the abs(), since I was
				    // seeing some negatives that looked iffy.
}

// ---------------------
// See if we can read in the words of the file.
void testWords(){
  char* word;
  int n;
  initFileStuff();
  for (n=0;n<200;n++){
    word = getNextWord();
    printf(" %i : '%s' \n", n, word);
  }
}
// ---------------------
void printHashFunc(char* key){
  printf(" '%s' => %i, %i \n", key, hashFunction(key), skipFunction(key));
}
 
// ---------------------
// A look at some of the values returned by the hash function.
void testHashFunc(){
  printf(" sample hash function values \n");
  printHashFunc("One");
  printHashFunc("Two");
  printHashFunc("Hello");
  printHashFunc("Jello");
  printHashFunc("GoodBye");
  printHashFunc("GoodByi");
  printHashFunc("This is a pretty long string");
  printHashFunc("a");
  printHashFunc("b");
}

void printData(Data d){
  if (d==NULL){
    printf(" Data = NULL \n");
  }
  else {
    printf(" '%s' : value=%i, next=%p, prev=%p, index=%i \n", 
	   d->key, d->value, d->next, d->prev, d->index);
  }
}

// -------------
void testHash(){
  Data d; 
  printf(" testing hash \n");
  d = fetchData("Hi");
  printData(d);
  d = fetchData("Fred");
  printData(d);
  d = fetchData("George");
  printData(d);
  d = fetchData("Fred");
  printData(d);
}

// --------------
double timeInSec(){
  return ((double)clock())/CLOCKS_PER_SEC;
}

// -------------
void countWordsInFile(){
  char* word;
  int nWords;
  double startTime;
  Data d;
  initFileStuff();
  printf(" Looking at words in '%s' ...", fileName);
  startTime=timeInSec();
  nWords=0;
  while (1) {
    nWords++;
    word = getNextWord();
    // printf("Word %d is '%s'.\n", nWords, word);
    // if (nWords>300){
    //  printf(" word limit hit; exiting. \n");
    //  exit(1);
    // }
    if ( word == NULL) {
      break;
    }
    //printf(" Creating Data ...");
    d = fetchData(word);
    //printf(" done.\n");
    d->value++;     // increment the number of times we've seen this word.
  }
  printf(" done.\n Examined %i words in %5.2f sec.\n", 
	 nWords, (timeInSec()-startTime));
} 

//--------------------
void showSomeOfList(Data d, int howMany){
  int n;
  for (n=0;n<howMany;n++){
    printf("   %-20s  %10i \n", d->key, d->value);
    d=d->next;
    if ( d==NULL) {
      return;
    }
  }
}

//--------------------
void showHashStats(){
  int nCollisions;
  int skipStats;
  int skipMax;
  Data d;
  nCollisions=0;
  skipStats=0;
  skipMax=0;
  d = listHead;
  while ( d != NULL) {
    if ( d->skips > 0 ) { 
      nCollisions++;
    }
    skipStats += d->skips;
    if ( d->skips > skipMax ) {
      skipMax = d->skips;
    }
    d = d->next;
  }
  printf(" Of %i entries in the %i slot hash, \n", 
	 nHashEntries, HASH_SIZE);
  printf(" %i were collisions, with the largest skip number = %i.\n", 
	 nCollisions, skipMax);
}

//------------------------

//------------------------
void quickSort(Data head, Data tail){
  /// *** this isn't working yet ***

  if (head==tail) {
    return;
  }
  beforeHead = head->prev;
  afterTail  = tail->next;
  pivot = head;
  left = head->next;
  right = tail;

  while (left != right) {

    while ( (left->value <= pivot->value) &7 (left != right) ){
      left = left->next;
    }

    while (right->value > pivot->value){
      right = right->prev;
    }

    swap(right,left);   // always?  

  }

  // partition boundry has merged;
  // drop in pivot before or after this final one
  // call quicksort on both sides


  // Input:  a-b-c-d-e-f
  //   (a) choose first as pivot
  //   (b) partition list
  //       -b-c-         -a-    -d-e-f-
  //       ones before          ones after
  //   (c) call recursively on left and right pieces.

}

//   --head--a--b--left--c--d--e--right--p--q--tail--
//     .... < pivot     


  // Implement this as a recursive descent, in place, 
  // swapping-elements-with-edge-elments quicksort.
  //
  // (1) The "ListSection" holds the head and tail of a section of a list.  
  //     (Null pointers do *not* need to be at the ends.)
  //     If these are the same on entry, just return.
  // (2) Next, pick a pivot which is the average of the left and right values.
  // (3) Walk the list, swapping values lower than the pivot to the left end, 
  //     and values higher than the pivot to the right end.  
  //     Move the "ends" in towards each other, marking
  //     the edges of the unsorted stuff.  When the ends meet, we have values
  //     lower than the  pivot on the left, and values higher on the right.
  //        ls->head,..,left,right,...,ls->tail
  //     Make sure that swaps change ls->head and ls->tail as need be.
  // (4) Then construct two smaller ListSections, and do again recursively.
  //


// === Let 'er rip. ===============

int main() {

  //testWords();
  //testHashFunc();
  //testHash();

  printf(" --- Counting words with a hash --- \n");
  printf(VERSION); printf("\n");

  countWordsInFile();
  printf(" Number of unique words put in hash = %i \n", nHashEntries);

  printf(" First few in the list: \n");
  showSomeOfList(listHead,40);

  showHashStats();

}