#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <savant.h>
#include <savutil.h>
#include <template.h>
#include <savantio.h>
#include <main.h>

/* Forward declarations for the routines defined in this file. */

/* Index every document of every file in the list; the list is freed as it is consumed. */
void savant_index(List_of_Files *);
/* Vectorize one document (optionally split into windows); returns the number of windows saved. */
int do_vector(FileInfo *, DB_INT, DB_INT, List_of_Docs *, DB_INT *);
/* Build and write the short and full title lines for one document; returns 0. */
int do_title(FileInfo *, List_of_Docs *, int);
/* Clean up a sender/owner string in place and fit it to the configured source field width. */
char *make_nice_name(char *);
/* Format the date held in pd as "DD Mon YY" into buffer. */
void make_nice_date(struct parsedate *pd, char *buffer);
/* Format the time held in pd as "HH:MM:SS +ZZZZ" into buffer. */
void make_nice_time(struct parsedate *pd, char *buffer);
/* Write the weekday name held in pd (or "???") into buffer. */
void make_nice_day (struct parsedate *pd, char *buffer);

/*
 * savant_index -- index every document of every source file in all_files.
 *
 * For each file in the list the file is opened, default owner/date/size
 * information is gathered from stat(), and do_vector() and do_title() are
 * called once per document.  The file list and the per-file document lists
 * are consumed: every node, every filename and every file record is freed
 * here, so the caller must not touch all_files afterwards.
 *
 * A file that cannot be opened or stat'ed is reported on stderr and
 * skipped; its documents are released without being indexed and do not
 * advance the running document number.
 *
 * When all files are processed the database is flushed via finalize_write().
 */
void
savant_index(List_of_Files *all_files)
{
  char error_owner[] = "(none)";
  DB_INT doc_count;
  List_of_Files *cur_src, *prev_src;
  List_of_Docs *docs, *prev_doc;
  FileInfo cur_src_info;
  struct stat statbuf;
  struct passwd *passbuf;

  init_docvecs();

  if (SavantVerbose) {
    for (cur_src = all_files, doc_count = 0; cur_src != NULL; cur_src = cur_src->next) {
      doc_count += cur_src->file->num_docs;
    }
    /* cast so the format matches whatever integer type DB_INT really is */
    printf("%ld documents... indexing 000000", (long)doc_count);
    fflush(stdout);
  }

  doc_count = 0;
  cur_src = all_files;
  while (cur_src != NULL) {
    cur_src_info.filename = cur_src->file->filename;
    cur_src_info.file = fopen(cur_src_info.filename, "r");
    docs = cur_src->file->docs;

    if (cur_src_info.file == NULL) {
      fprintf(stderr, "s_index: Unable to open %s for reading\n",
              cur_src_info.filename);
      fflush(stderr);
    }
    else if (stat(cur_src_info.filename, &statbuf) != 0) {
      /* without stat data there is no usable size, owner or date */
      fprintf(stderr, "s_index: Unable to stat %s\n", cur_src_info.filename);
      fflush(stderr);
    }
    else {
      if (SavantDebug) {
        fprintf(stderr, "\nindexing %s\n", cur_src_info.filename);
      }

      /* fill in fileinfo for default cases */
      passbuf = getpwuid(statbuf.st_uid);
      cur_src_info.owner = (passbuf != NULL) ? passbuf->pw_name : error_owner;
      cur_src_info.date = ctime(&statbuf.st_mtime);
      /* ctime() output ends in '\n'; strip it */
      cur_src_info.date[strlen(cur_src_info.date) - 1] = '\0';
      /* NOTE(review): the parsedate() result is never released here --
         confirm who owns it. */
      cur_src_info.pd = parsedate(cur_src_info.date);
      cur_src_info.size = (size_t)statbuf.st_size;

      while ((cur_src->file->num_docs > 0) && (docs != NULL)) {
        /* a document's body runs up to the start of the next document,
           or to the end of the file (-1) for the last one */
        if (docs->next != NULL) {
          do_vector(&cur_src_info, doc_count, docs->next->doc_start,
                    docs, cur_src->file->biases);
        }
        else {
          do_vector(&cur_src_info, doc_count, -1, docs, cur_src->file->biases);
        }
        do_title(&cur_src_info, docs, doc_count);

        /* clean up the document list */
        prev_doc = docs;
        docs = docs->next;
        free(prev_doc);
        cur_src->file->num_docs--;
        doc_count++;
        if (SavantVerbose) {
          /* back up over the previous six-digit counter and redraw it */
          printf("\b\b\b\b\b\b%06ld", (long)doc_count);
          fflush(stdout);
        }
      }
    }

    /* release documents that were never indexed (file skipped above) */
    while ((cur_src->file->num_docs > 0) && (docs != NULL)) {
      prev_doc = docs;
      docs = docs->next;
      free(prev_doc);
      cur_src->file->num_docs--;
    }

    /* clean up the file list; only close what was actually opened */
    if (cur_src_info.file != NULL) {
      fclose(cur_src_info.file);
    }
    free(cur_src->file->filename);
    free(cur_src->file);
    prev_src = cur_src;
    cur_src = cur_src->next;
    free(prev_src);
  }

  if (SavantVerbose) {
    printf("\nWriting database...");
    fflush(stdout);
  }
  finalize_write();

  if (SavantVerbose) {
    printf("done.\n");
    fflush(stdout);
  }
}

int do_vector(FileInfo *info,
	      DB_INT doc_num,
	      DB_INT next_doc,
	      List_of_Docs *doc,
	      DB_INT *biases)
{
  DV_Tree *vector, *subject_vector=NULL, *location_vector=NULL, *source_vector=NULL;
  DV_Tree *date_vector=NULL, *time_vector=NULL, *day_vector=NULL;
  int i, num_windows;
  char save_char, *body_buffer, *window_ptr, *next_window_ptr, *end_ptr;
  DB_INT dl_offset, body_size, window_offset;

  /* do_vector breaks the document into windows, vectorizes and saves each of 
     these, writes some other info to disk and returns the number of windows found. */

  fwrite_big(biases, sizeof(DB_INT), NUM_FIELD_TYPES, BIAS_FILE); 

  if (doc->body_start == -1) { /* -1 indicates body was not found */
    return(0);
  }

  /* save the important info for this document */
  dl_offset = ftell(DOCLOC_FILE);
  fprintf(DOCLOC_FILE, "%s\n", info->filename);
  if(next_doc == -1) { /* use rest of file */
    body_size = info->size - doc->body_start;
  }
  else { /* suck in everything up to next_doc */
    body_size = next_doc - doc->body_start;
  }
  fwrite_big(&dl_offset, sizeof(DB_INT), 1, DLOFF_FILE);
  fwrite_big(&(doc->body_start), sizeof(DB_INT), 1, DLOFF_FILE);
  fwrite_big(&body_size, sizeof(DB_INT), 1, DLOFF_FILE);

  /* vectorize the subject line */
  if ((doc->subject[0] != -1) && (doc->subject[1] != -1)) {
    subject_vector = vectorize_file(info->file, doc->subject[0], doc->subject[1], 
				    1, NULL, SUBJECT_FIELD);				    
    /*    printf("\nIn subject stuff: subject_vector = %d, doc->subject[0] = %d, doc->subject[1] = %d\n", 
	  subject_vector, doc->subject[0], doc->subject[1]);
	  */
  }
  /* vectorize the source line */
  if ((doc->source[0] != -1) && (doc->source[1] != -1)) {
    source_vector = vectorize_file(info->file, doc->source[0], doc->source[1], 
				   1, NULL, SOURCE_FIELD);
    /*    printf("\nIn source stuff: source_vector = %d, doc->source[0] = %d, doc->source[1] = %d\n", 
	  source_vector, doc->source[0], doc->source[1]);
	  */
  }
  /* vectorize the location line */
  if ((doc->location[0] != -1) && (doc->location[1] != -1)) {
    location_vector = vectorize_file(info->file, doc->location[0], doc->location[1], 
				     1, NULL, LOCATION_FIELD);
    /*    printf("\nIn location stuff: location_vector = %d, doc->location[0] = %d, doc->location[1] = %d\n", 
	  location_vector, doc->location[0], doc->location[1]);
	  */  
  }
  /* vectorize the date line into date, time, and day vectors */
  if ((doc->date[0] != -1) && (doc->date[1] != -1)) {
    date_vector = vectorize_file(info->file, doc->date[0], doc->date[1], 1, NULL, DATE_FIELD);
    time_vector = vectorize_file(info->file, doc->date[0], doc->date[1], 1, NULL, TIME_FIELD);
    day_vector = vectorize_file(info->file, doc->date[0], doc->date[1], 1, NULL, DAY_FIELD);
    /*    printf("\nIn Date stuff: date_vector = %d, doc->date[0] = %d, doc->date[1] = %d\n", 
	  date_vector, doc->date[0], doc->date[1]);
	  */  
  }

  /* snarf the document */
  body_buffer = (char *)malloc(body_size+1);
  fseek(info->file, doc->body_start, SEEK_SET);
  fread_big(body_buffer, 1, body_size, info->file);
  body_buffer[body_size] = '\0';
  
  /* do the vectorizing */
  if (Config.windowing) {
    num_windows = 0;
    /* initialize the window pointers */
    next_window_ptr = end_ptr = body_buffer;
    for(i=0; (i<(Config.lines_per_window+1)/2) && (end_ptr != NULL); i++) {
      /* if we run off the end of the buffer, end_ptr == NULL. 
	 We'll use this to test the while loop below. */
      end_ptr = strchr(end_ptr, '\n');
      if (end_ptr != NULL) end_ptr++;
    }

    do {
      /* set start of window and find end */
      window_ptr = next_window_ptr; 
      next_window_ptr = end_ptr;
      for(i=0; (i<(Config.lines_per_window+1)/2) && (end_ptr != NULL); i++) {
	end_ptr = strchr(end_ptr, '\n');
	if (end_ptr != NULL) end_ptr++;
      }
      /* temporarily cut off body_buffer at end_ptr and vectorize */
      if(end_ptr != NULL) {
	save_char = *end_ptr;
	*end_ptr = '\0';
      }
      vector = vectorize_buffer(window_ptr, BODY_FIELD);
      if(end_ptr != NULL) {
	*end_ptr = save_char;
      }

      /* save the stuff */
      if(subject_vector != NULL) {
	/* printf("merging subject\n");*/
	merge_dvtrees(vector, subject_vector);
      }
      if(source_vector != NULL) {
	/* printf("merging source\n");*/
	merge_dvtrees(vector, source_vector);
      }
      if(location_vector != NULL) {
	/* printf("merging location\n");*/
	merge_dvtrees(vector, location_vector);
      }
      if(date_vector != NULL) {
	/* printf("merging date\n");*/
	merge_dvtrees(vector, date_vector);
      }
      if(time_vector != NULL) {
	/* printf("merging time\n");*/
	merge_dvtrees(vector, time_vector);
      }
      if(day_vector != NULL) {
	/* printf("merging day\n");*/
	merge_dvtrees(vector, day_vector);
      }


      save_dv(vector, doc_num);
      destroy_dvtree(vector);
      /*printf("Writing to winmap: doc_num= %d\n", doc_num);*/
      fwrite_big(&doc_num, sizeof(DB_INT), 1, WMAP_FILE);
      window_offset = (int)(window_ptr-body_buffer);
      /*printf("Writing offset to winmap: window_offset= %d\n", window_offset);*/
      fwrite_big(&window_offset, sizeof(DB_INT), 1, WMAP_FILE);
      num_windows++;
    } while (end_ptr != NULL);
  }
  else { /* if no document windowing, just do the whole buffer */
    vector = vectorize_buffer(body_buffer, BODY_FIELD);
    if(subject_vector != NULL) {
      merge_dvtrees(vector, subject_vector);
    }
    if(source_vector != NULL) {
      merge_dvtrees(vector, source_vector);
    }
    if(location_vector != NULL) {
      merge_dvtrees(vector, location_vector);
    }
    if(date_vector != NULL) {
      merge_dvtrees(vector, date_vector);
    }
    if(time_vector != NULL) {
      merge_dvtrees(vector, time_vector);
    }
    if(day_vector != NULL) {
      merge_dvtrees(vector, day_vector);
    }
    save_dv(vector, doc_num);
    destroy_dvtree(vector);
    fwrite_big(&doc_num, sizeof(DB_INT), 1, WMAP_FILE);
    window_offset = 0;
    fwrite_big(&window_offset, sizeof(DB_INT), 1, WMAP_FILE);
    num_windows = 1;
  }
    
  /* cleanup */
  free(body_buffer);
  if(subject_vector != NULL) {
    destroy_dvtree(subject_vector);
  }
  if(source_vector != NULL) {
    destroy_dvtree(source_vector);
  }
  if(location_vector != NULL) {
    destroy_dvtree(location_vector);
  }
  if(date_vector != NULL) {
    destroy_dvtree(date_vector);
  }
  if(time_vector != NULL) {
    destroy_dvtree(time_vector);
  }
  if(day_vector != NULL) {
    destroy_dvtree(day_vector);
  }
  
  /* This is a horrid waste of diskspace, but since we throw away
     the document numbers (keeping each window as a seperate doc),
     we need to write a seperate bias for each.  Dumb, but oh well */
  
  for (i = 1; i < num_windows; i++) {   /* did one already, so start at 1 */
    fwrite_big(biases, sizeof(DB_INT), NUM_FIELD_TYPES, BIAS_FILE); 
  }
  return(num_windows);
}

/* Append src to the NUL-terminated string in dst (capacity dst_size),
   keeping the result no longer than dst_size - reserve characters.
   Nothing is appended when dst is already at or past that limit. */
static void title_append(char *dst, size_t dst_size, const char *src, size_t reserve)
{
  size_t used = strlen(dst);

  if (used + reserve < dst_size) {
    strncat(dst, src, dst_size - used - reserve);
  }
}

/* Read at most 511 bytes of a header field (length >= 1) starting at
   file offset start into buffer and NUL-terminate it. */
static void read_title_field(FILE *file, long start, long length, char *buffer)
{
  int num_bytes = (length < 512) ? (int)length : 511;

  /* pre-clear so a short read still leaves a terminated string */
  memset(buffer, 0, (size_t)num_bytes + 1);
  fseek(file, start, SEEK_SET);
  fread_big(buffer, 1, num_bytes, file);
  buffer[num_bytes] = '\0';
}

/*
 * do_title -- build and write the title records for one document.
 *
 * Two lines are produced:
 *   title:      SOURCE | DD Mon YY | SUBJECT
 *   fulltitle:  SOURCE|LOCATION|DD Mon YY|HH:MM:SS zone|Weekday|unixtime|SUBJECT
 *
 * Fields missing from the document fall back to file-level defaults: the
 * file owner for SOURCE, the file date for DATE and the file's base name
 * for SUBJECT; LOCATION has no default and is left empty.  Characters
 * outside printable ASCII are replaced by spaces.  The current offsets of
 * TITLE_FILE and FTITLE_FILE are written to TOFF_FILE and FTOFF_FILE
 * before each line is appended.
 *
 * doc_num is accepted for interface compatibility and is not used.
 * Always returns 0.
 */
int do_title(FileInfo *info,
	     List_of_Docs *doc,
	     int doc_num)
{
  char title[1024], buffer[1024], fulltitle[1024], *temp;
  size_t len, pos;
  DB_UINT offset;
  struct parsedate *pd;

  (void)doc_num;

  /* start with a blank */
  title[0] = '\0';
  fulltitle[0] = '\0';

  /* strncpy below may not terminate; this keeps buffer terminated */
  buffer[sizeof(buffer) - 1] = '\0';

  /* add the SOURCE field */
  if ((doc->source[0] == -1) || (doc->source[1] < 1)) {
    strncpy(buffer, info->owner, sizeof(buffer) - 1);
  }
  else {
    read_title_field(info->file, doc->source[0], doc->source[1], buffer);
  }
  make_nice_name(buffer);
  title_append(title, sizeof(title), buffer, 8);
  title_append(fulltitle, sizeof(fulltitle), buffer, 8);
  title_append(fulltitle, sizeof(fulltitle), "|", 1);
  title_append(title, sizeof(title), " | ", 1);

  /* add the LOCATION field (full title only); there is no reasonable
     default, so a missing location is simply left empty */
  if ((doc->location[0] != -1) && (doc->location[1] >= 1)) {
    read_title_field(info->file, doc->location[0], doc->location[1], buffer);
    make_nice_name(buffer);
    title_append(fulltitle, sizeof(fulltitle), buffer, 8);
  }
  title_append(fulltitle, sizeof(fulltitle), "|", 1);

  /* add the DATE field(s) */
  if ((doc->date[0] == -1) || (doc->date[1] < 1)) {
    pd = parsedate(info->date);
  }
  else {
    read_title_field(info->file, doc->date[0], doc->date[1], buffer);
    pd = parsedate(buffer);
  }
  /* NOTE(review): pd is used unchecked and never released -- confirm
     that parsedate() cannot return NULL and who owns its result. */

  make_nice_date(pd, buffer);
  title_append(title, sizeof(title), buffer, 10);
  title_append(fulltitle, sizeof(fulltitle), buffer, 10);

  title_append(fulltitle, sizeof(fulltitle), "|", 1);
  make_nice_time(pd, buffer);
  title_append(fulltitle, sizeof(fulltitle), buffer, 10);

  title_append(fulltitle, sizeof(fulltitle), "|", 1);
  make_nice_day(pd, buffer);
  title_append(fulltitle, sizeof(fulltitle), buffer, 10);

  /* NOTE(review): %d assumes pd->unixtime is an int -- confirm. */
  sprintf(buffer, "|%11d", pd->unixtime);
  title_append(fulltitle, sizeof(fulltitle), buffer, 10);

  title_append(title, sizeof(title), " | ", 1);
  title_append(fulltitle, sizeof(fulltitle), "|", 1);

  /* add the SUBJECT field; two bytes stay reserved for "\n" and NUL */
  if ((doc->subject[0] == -1) || (doc->subject[1] < 1)) {
    /* default to the file name without its directory part */
    temp = strrchr(info->filename, '/');
    temp = (temp != NULL) ? temp + 1 : info->filename;
    title_append(title, sizeof(title), temp, 2);
    title_append(fulltitle, sizeof(fulltitle), temp, 2);
  }
  else {
    read_title_field(info->file, doc->subject[0], doc->subject[1], buffer);
    title_append(title, sizeof(title), buffer, 2);
    title_append(fulltitle, sizeof(fulltitle), buffer, 2);
  }

  title_append(title, sizeof(title), "\n", 1);
  title_append(fulltitle, sizeof(fulltitle), "\n", 1);

  /* avoid typographical gremlins: blank out everything that is not
     printable ASCII, except the final newline */
  len = strlen(fulltitle);
  for (pos = 0; pos + 1 < len; pos++) {
    if ((fulltitle[pos] < 32) || (fulltitle[pos] > 126)) {
      fulltitle[pos] = ' ';
    }
  }
  len = strlen(title);
  for (pos = 0; pos + 1 < len; pos++) {
    if ((title[pos] < 32) || (title[pos] > 126)) {
      title[pos] = ' ';
    }
  }

  /* write the title and info */
  offset = ftell(TITLE_FILE);
  fwrite_big(&offset, sizeof(DB_INT), 1, TOFF_FILE);
  fputs(title, TITLE_FILE);
  fflush(TITLE_FILE);

  offset = ftell(FTITLE_FILE);
  fwrite_big(&offset, sizeof(DB_INT), 1, FTOFF_FILE);
  fputs(fulltitle, FTITLE_FILE);
  fflush(FTITLE_FILE);

  return(0);
}

/*
 * make_nice_name -- tidy a sender/owner string in place.
 *
 * Steps, in order:
 *   - "text <addr>"  : everything from '<' on is cut off;
 *     "<addr> text"  : the '<' becomes a space and '>' onwards is cut off.
 *   - If a '(' remains, the text between '(' and the next ')' is taken as
 *     the real name; otherwise the whole remaining string is used.
 *   - Trailing whitespace is removed.
 *   - The result is truncated (with "..." if Config.ellipses is set and
 *     the field is at least three characters wide) or space-padded to
 *     exactly Config.source_field_width characters.
 *
 * The result is copied back into name, so name must have room for
 * Config.source_field_width characters plus the terminator.  Returns name;
 * if scratch memory cannot be allocated, name is returned as it stands at
 * that point.
 */
char *make_nice_name(char *name)
{
  char *temp, *ptr;
  size_t tempwidth = 512;
  size_t width = Config.source_field_width;
  size_t len;

  if (width >= tempwidth)
    tempwidth = width + 1;
  temp = malloc(tempwidth + 1);
  if (temp == NULL)
    return name;
  /* strncpy below copies at most tempwidth-1 bytes; this terminates it */
  temp[tempwidth - 1] = '\0';

  if((ptr = strchr(name, '<')) != NULL) {
    if(ptr != name) { /* cut off username junk */
      *ptr = '\0';
    }
    else { /* remove the '<' and lop off the '>' and beyond */
      *ptr = ' ';
      if((ptr = strchr(name, '>')) != NULL) {
        *ptr = '\0';
      }
    }
  }

  if((ptr = strchr(name, '(')) != NULL) { /* this will be the real name */
    strncpy(temp, ptr+1, tempwidth - 1);
    if ((ptr = strchr(temp, ')')) != NULL) {
      *ptr = '\0';
    }
  }
  else {
    strncpy(temp, name, tempwidth - 1);
  }

  len = strlen(temp);
  if(len > 0) {
    /* stop at the start of the string: an all-blank name must not make
       the scan step in front of the buffer */
    while((len > 0) && isspace((unsigned char)temp[len - 1])) {
      temp[--len] = '\0';
    }
  }
  else if(SavantDebug) {
    fprintf(stderr, "index.c:make_nice_name():  zero length name\n");
  }

  if(len > width) {
    if(Config.ellipses && (width >= 3)) {
      temp[width - 3] = '\0';
      strcat(temp, "...");
    }
    else {
      /* no ellipses wanted, or the field is too narrow to hold them */
      temp[width] = '\0';
    }
  }
  else if(len < width) {
    /* pad with blanks up to the field width */
    memset(temp + len, ' ', width - len);
    temp[width] = '\0';
  }

  strcpy(name, temp);
  free(temp);
  return(name);
}

/*
 * make_nice_date -- format the date in pd as "DD Mon YY" into date.
 *
 * Unknown parts are shown as "??" (day, year) or "???" (month).  A month
 * in 13..31 paired with a day in 1..12 is assumed to be a swapped
 * mm/dd pair: the two are exchanged in pd and pd->weekday is reset to -1.
 * An out-of-range month is stored back into pd as -1.
 *
 * date must have room for at least 10 bytes.
 */
void make_nice_date(struct parsedate *pd, char *date)
{
  /* indexed by month + 1: -1 (unknown) -> "???", 1..12 -> Jan..Dec */
  static const char *const monthnames[] = {"???", "???",
                                           "Jan", "Feb", "Mar", "Apr", "May", "Jun",
                                           "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
  char day[] = "??", year[] = "??";
  int temp;

  /* Try to fix mixups with mm/dd/yy format date */
  if ((pd->month > 12) && (pd->month <= 31) && (pd->day > 0) && (pd->day <= 12)) {
    temp = pd->month;
    pd->month = pd->day;
    pd->day = temp;
    pd->weekday = -1;
  }
  if ((pd->month < 1) || (pd->month > 12)) {
    pd->month = -1;
  }
  if ((pd->day > 0) && (pd->day <= 31)) {
    sprintf(day, "%02d", pd->day);
  }
  /* only non-negative years are formatted: a negative remainder would
     need a sign and no longer fit the two-character field */
  if (pd->year >= 0) {
    sprintf(year, "%02d", pd->year % 100);
  }
  sprintf(date, "%s %s %s",
          day, monthnames[pd->month + 1], year);
}

/*
 * make_nice_time -- format the time in pd as "HH:MM:SS +ZZZZ" into time.
 *
 * Each part that is unknown (-1) or does not fit its field is shown as
 * question marks.  The zone is only formatted when the hour is known.
 * The scratch fields are re-initialised on every call, so an unknown
 * part never shows digits left over from a previously formatted date.
 *
 * time must have room for at least 15 bytes.
 */
void make_nice_time(struct parsedate *pd, char *time)
{
  char hour[] = "??", min[] = "??", sec[] = "??", zone[] = "+????";
  int hour_known = (pd->hour >= 0) && (pd->hour <= 99);

  if (hour_known) {
    sprintf(hour, "%02d", pd->hour);
  }
  if ((pd->minute >= 0) && (pd->minute <= 99)) {
    sprintf(min, "%02d", pd->minute);
  }
  if ((pd->second >= 0) && (pd->second <= 99)) {
    sprintf(sec, "%02d", pd->second);
  }
  /* sign plus four digits is all the zone field can hold */
  if (hour_known && (pd->zone >= -9999) && (pd->zone <= 9999)) {
    if (pd->zone >= 0)
      sprintf(zone, "+%04d", pd->zone);
    else
      sprintf(zone, "%05d", pd->zone);
  }
  sprintf(time, "%s:%s:%s %s", hour, min, sec, zone);
}

/*
 * make_nice_day -- write the weekday name for pd->weekday into day.
 *
 * Weekdays 0..6 map to "Sunday".."Saturday"; any other value is stored
 * back into pd as -1 and written as "???".
 *
 * day must have room for at least 10 bytes ("Wednesday" plus terminator).
 */
void make_nice_day(struct parsedate *pd, char *day)
{
  /* indexed by weekday + 1, so -1 (unknown) selects "???" */
  static const char *const daynames[] = {"???",
                                         "Sunday", "Monday", "Tuesday", "Wednesday",
                                         "Thursday", "Friday", "Saturday"};

  /* 6 is the last valid weekday; 7 would index one past the table */
  if ((pd->weekday > 6) || (pd->weekday < 0)) {
    pd->weekday = -1;
  }
  sprintf(day, "%s", daynames[pd->weekday + 1]);
}

/*
 * make_nice_date_old -- older, purely textual date normaliser.
 *
 * Rewrites date in place to "DD MON YY" and returns it.  The layouts
 * recognised (after skipping leading white space) are:
 *     a. Tue Jun 25 15:03:45 1996
 *     b. Wed, 8 Nov 1995 09:07:34 -0800
 *     c. 12 Jul 1995 17:16:28 GMT
 *     d. 15 Jul 95 01:02:05 GMT
 *     e. December 13, 1995      (also abbreviated: "Dec 13 1995")
 * Layout is decided by character position only; nothing is validated.
 * Strings shorter than 8 characters, and strings of layout e without any
 * space, are returned unchanged.  In layout a, a missing time (no ':')
 * gives a year of "??"; in layout e, a missing year does the same.
 *
 * date must be writable.  Index 9 is always written and layout e
 * transiently uses up to index 11, so give it at least 12 bytes.
 */
char *make_nice_date_old(char *date)
{
  char scratch[128], *ptr;

  for(ptr=date; *ptr && isspace((unsigned char)*ptr); ptr++)
    ; /* cut initial spaces */
  if (strlen(ptr) < 8) {
    return(date);
  }

  /* Work from a bounded copy.  strncpy zero-fills the remainder, so the
     fixed-position reads below never see uninitialised bytes. */
  strncpy(scratch, ptr, sizeof(scratch) - 1);
  scratch[sizeof(scratch) - 1] = '\0';

  if(isdigit((unsigned char)scratch[0])) { /* c or d */
    if(scratch[1] == ' ') { /* single digit day: insert a leading zero */
      date[0] = '0';
      /* only "D Mon YY" is needed; copying more could outgrow date */
      memcpy(date+1, scratch, 8);
      if((scratch[8] != ' ') && (scratch[8] != '\0')) { /* four digit year */
        date[7] = scratch[8];
        date[8] = scratch[9];
      }
    }
    else {
      strcpy(date, scratch);
      if((scratch[9] != ' ') && (scratch[9] != '\0')) { /* four digit year */
        date[7] = scratch[9];
        date[8] = scratch[10];
      }
    }
    date[9] = '\0';
  }
  else if (scratch[3] == ',') { /* b */
    if(scratch[6] == ' ') { /* single digit day */
      date[0] = '0';
      date[1] = scratch[5];
      date[3] = scratch[7];
      date[4] = scratch[8];
      date[5] = scratch[9];
      if(scratch[13] == ' ') { /* two digit year */
        date[7] = scratch[11];
        date[8] = scratch[12];
      }
      else {
        date[7] = scratch[13];
        date[8] = scratch[14];
      }
    }
    else {
      date[0] = scratch[5];
      date[1] = scratch[6];
      date[3] = scratch[8];
      date[4] = scratch[9];
      date[5] = scratch[10];
      if(scratch[14] == ' ') { /* two digit year */
        date[7] = scratch[12];
        date[8] = scratch[13];
      }
      else {
        date[7] = scratch[14];
        date[8] = scratch[15];
      }
    }
    date[2] = date[6] = ' ';
    date[9] = '\0';
  }
  else if (scratch[3] == ' ') { /* a or e abbreviated */
    if (isalpha((unsigned char)scratch[4])) { /* a */
      if(scratch[9] == ' ') { /* left justified single digit */
        date[0] = '0';
        date[1] = scratch[8];
      }
      else if(scratch[8] == ' ') { /* right justified single digit */
        date[0] = '0';
        date[1] = scratch[9];
      }
      else {
        date[0] = scratch[8];
        date[1] = scratch[9];
      }
      date[2] = date[6] = ' ';
      date[3] = scratch[4];
      date[4] = scratch[5];
      date[5] = scratch[6];
      /* the year follows the time: its last two digits sit 9 and 10
         characters after the first ':' */
      ptr = strchr(scratch, ':');
      if ((ptr == NULL) || ((size_t)(ptr - scratch) + 10 >= sizeof(scratch))) {
        date[7] = date[8] = '?';
      }
      else {
        date[7] = *(ptr+9);
        date[8] = *(ptr+10);
      }
      date[9] = '\0';
    }
    else { /* e abbreviated */
      if(scratch[5] == ' ') { /* single digit day */
        date[0] = '0';
        date[1] = scratch[4];
        date[7] = scratch[8];
        date[8] = scratch[9];
      }
      else {
        date[0] = scratch[4];
        date[1] = scratch[5];
        date[7] = scratch[9];
        date[8] = scratch[10];
      }
      date[2] = date[6] = ' ';
      date[3] = scratch[0];
      date[4] = scratch[1];
      date[5] = scratch[2];
      date[9] = '\0';
    }
  }
  else { /* e */
    ptr = strchr(scratch, ' '); /* ptr+1 points to day */
    if (ptr == NULL) {
      return(date); /* nothing that looks like a day: leave untouched */
    }
    strcpy(date, ptr+1);
    if((date[1] == ' ') || (date[1] == ',')) { /* single digit day */
      date[1] = date[0];
      date[0] = '0';
    }
    date[2] = ' ';
    date[3] = scratch[0]; /* fill in the month */
    date[4] = scratch[1];
    date[5] = scratch[2];
    date[6] = '\0';
    ptr = strchr(ptr+1, ' '); /* ptr now points to " YY" or " yyYY" */
    if (ptr == NULL) {
      strcat(date, " ??");
    }
    else {
      /* the space plus at most four year digits is all that is used */
      strncat(date, ptr, 5);
    }
    if(isdigit((unsigned char)date[9])) { /* if four digit year, shift 3rd and 4th digits */
      date[7] = date[9];
      date[8] = date[10];
    }
    date[9] = '\0'; /* cut it off */
  }
  return(date);
}

