#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <regex.h>

#include "udm_config.h"
#include "udm_common.h"
#include "udm_utils.h"
#include "udm_charset.h"
#include "udm_indexer.h"
#include "udm_proto.h"
#include "udm_parseurl.h"
#include "udm_hrefs.h"
#include "udm_server.h"
#include "udm_xmalloc.h"

#define DEFAULT_PROXY_PORT	3128


/* return values: 0 on success, non-zero on error */

/*
 * Append a copy of the server description "srv" to Conf->Server.
 *
 * Conf        environment that owns the Server array; on failure
 *             Conf->errcode is set to 1 and Conf->errstr gets a message
 * srv         template entry; its string fields are duplicated with
 *             strdup(), srv itself is not modified
 * load_flags  when UDM_FLAG_ADD_SERV is set and the entry is a
 *             sub-string one, the stored URL is trimmed according to
 *             srv->follow and the URL is passed to UdmAddHref()
 *             (preceded by the host's robots.txt for http entries
 *             with use_robots set)
 * match_type  stored as is; UDM_SRV_TYPE(match_type) selects
 *             sub-string or regex preparation, UDM_SERVER_CS makes
 *             the regex case sensitive
 *
 * Returns 0 on success, 1 on error.
 */
__INDLIB__ int UdmAddServer(UDM_ENV * Conf,UDM_SERVER * srv, int load_flags, int match_type){
#define ERRSTRSIZ 1000
	int res;
	int len;
	UDM_URL from;
	UDM_SERVER * dst;
	regex_t * pexpr=NULL;
	const char * srcurl=UDM_NULL2EMPTY(srv->url);
	char urlstr[UDMSTRSIZ];

	/* Work on a private copy so that srv->url stays unchanged. */
	/* Refuse input that does not fit instead of overflowing.   */
	if(strlen(srcurl)>=sizeof(urlstr))
		goto url_too_long;
	strcpy(urlstr,srcurl);

	if((UDM_SRV_TYPE(match_type)==UDM_SERVER_SUBSTR)&&(urlstr[0])){
		/* Check whether valid URL is passed */
		if((res=UdmParseURL(&from,urlstr))){
			Conf->errcode=1;
			switch(res){
				case UDM_PARSEURL_LONG:
					sprintf(Conf->errstr,"URL too long");
					break;
				case UDM_PARSEURL_BAD:
				default:
					sprintf(Conf->errstr,"Badly formed URL");
					break;
			}
			return(1);
		}
		if((from.hostinfo[0])&&(!from.filename[0])){
			/* Add trailing slash                    */
			/* http://localhost -> http://localhost/ */
			len=snprintf(urlstr,sizeof(urlstr),"%s://%s%s",from.schema,from.hostinfo,from.path);
			if((len<0)||((size_t)len>=sizeof(urlstr)))
				goto url_too_long;
		}
		if((urlstr[0])&&(load_flags&UDM_FLAG_ADD_SERV)){
			char * s;
			switch(srv->follow){
				case UDM_FOLLOW_PATH:
					/* Cut before '?' and after last '/' */
					if((s=strchr(urlstr,'?')))
						*s='\0';
					if((s=strrchr(urlstr,'/')))
						*(s+1)='\0';
					break;

				case UDM_FOLLOW_SITE:
					if(from.hostinfo[0]){
						/* Cut after hostinfo */
						len=snprintf(urlstr,sizeof(urlstr),"%s://%s/",from.schema,from.hostinfo);
						if((len<0)||((size_t)len>=sizeof(urlstr)))
							goto url_too_long;
					}else{
						/* Cut after first '/' */
						if((s=strchr(urlstr,'/')))
							*(s+1)='\0';
					}
					break;

				case UDM_FOLLOW_NO:
				case UDM_FOLLOW_WORLD:
				default:
					break;
			}
		}
		if (!strcmp(from.schema, "news")) {
			/* Cut after the server name to remove group names, */
			/* because group names are not present in message   */
			/* URLs. The search starts 7 characters in (the     */
			/* length of "news://"), so only do it when the     */
			/* string is at least that long.                    */
			if(strlen(urlstr)>=7){
				char * cc=strchr(urlstr+7,'/');
				if(cc)*(cc+1)='\0';
			}
		}
	}else
	if(UDM_SRV_TYPE(match_type)==UDM_SERVER_REGEX){
		int err;
		char regerrstr[ERRSTRSIZ]="";
		pexpr=UdmXmalloc(sizeof(regex_t));
		if(match_type&UDM_SERVER_CS){
			err=regcomp(pexpr,urlstr,REG_EXTENDED);
		}else{
			err=regcomp(pexpr,urlstr,REG_EXTENDED|REG_ICASE);
		}
		if(err){
			regerror(err, pexpr, regerrstr, ERRSTRSIZ);
			free(pexpr);
			Conf->errcode=1;
			/* NOTE(review): urlstr plus regerrstr may be longer than */
			/* Conf->errstr; its size is not visible here - confirm.  */
			sprintf(Conf->errstr,"Wrong regex in config file: %s: %s", urlstr,regerrstr);
			return(1);
		}
	}

	/* Grow the server table in steps of 16 entries */
	if(Conf->nservers>=Conf->mservers){
		if(Conf->mservers){
			Conf->mservers+=16;
			Conf->Server=(UDM_SERVER *)UdmXrealloc(Conf->Server,Conf->mservers*sizeof(UDM_SERVER));
		}else{
			Conf->mservers=16;
			Conf->Server=(UDM_SERVER *)UdmXmalloc(Conf->mservers*sizeof(UDM_SERVER));
		}
	}
	dst=&Conf->Server[Conf->nservers];

	dst->rec_id=Conf->nservers;
	dst->regexp=pexpr;
	dst->match_type=match_type;

	/* The new entry owns private copies of all strings */
	dst->url=strdup(urlstr);
	dst->alias=srv->alias?strdup(srv->alias):NULL;
	dst->charset=srv->charset?strdup(srv->charset):NULL;
	dst->basic_auth=srv->basic_auth?strdup(srv->basic_auth):NULL;
	dst->proxy_basic_auth=srv->proxy_basic_auth?strdup(srv->proxy_basic_auth):NULL;
	dst->htdb_list=srv->htdb_list?strdup(srv->htdb_list):NULL;
	dst->htdb_doc=srv->htdb_doc?strdup(srv->htdb_doc):NULL;
	dst->user=srv->user?strdup(srv->user):NULL;
	dst->passwd=srv->passwd?strdup(srv->passwd):NULL;
	dst->category=srv->category?strdup(srv->category):NULL;
	dst->tag=srv->tag?strdup(srv->tag):NULL;
	dst->mirror_root=srv->mirror_root?strdup(srv->mirror_root):NULL;
	dst->mirror_headers=srv->mirror_headers?strdup(srv->mirror_headers):NULL;

	/* strncpy() does not terminate when it copies two non-zero   */
	/* characters, so clear the field first (assumes lang holds   */
	/* at least 3 bytes - TODO confirm against UDM_SERVER).       */
	memset(dst->lang,0,sizeof(dst->lang));
	strncpy(dst->lang,srv->lang,2);

	if(srv->proxy){
		char * s;
		dst->proxy=strdup(srv->proxy);
		if((s=strchr(dst->proxy,':'))){
			/* "host:port" - split and fall back to the   */
			/* default when the port is missing or zero   */
			int port;
			*s=0;
			port=atoi(s+1);
			dst->proxy_port=port?port:DEFAULT_PROXY_PORT;
		}else{
			dst->proxy_port=DEFAULT_PROXY_PORT;
		}
	}else{
		dst->proxy=NULL;
		dst->proxy_port=0;
	}

	dst->period=srv->period;
	dst->net_errors=0;
	dst->maxhops=srv->maxhops;
	dst->index=srv->index;
	dst->follow=srv->follow;
	dst->deletebad=srv->deletebad;
	dst->use_robots=srv->use_robots;
	dst->bodyweight=srv->bodyweight;
	dst->crossweight=srv->crossweight;
	dst->titleweight=srv->titleweight;
	dst->urlweight=srv->urlweight;
	dst->urlhostweight=srv->urlhostweight;
	dst->urlpathweight=srv->urlpathweight;
	dst->urlfileweight=srv->urlfileweight;
	dst->descweight=srv->descweight;
	dst->keywordweight=srv->keywordweight;
	dst->max_net_errors=srv->max_net_errors;
	dst->read_timeout=srv->read_timeout;
	dst->doc_timeout=srv->doc_timeout;
	dst->delete_no_server=srv->delete_no_server;
	dst->correct_factor=srv->correct_factor;
	dst->incorrect_factor=srv->incorrect_factor;
	dst->number_factor=srv->number_factor;
	dst->alnum_factor=srv->alnum_factor;
	dst->use_mirror=srv->use_mirror;
	dst->use_clones=srv->use_clones;
	dst->check_mp3_tag=srv->check_mp3_tag;
	dst->check_only_mp3_tag=srv->check_only_mp3_tag;
	dst->net_error_delay_time=srv->net_error_delay_time;

	/* "from" is valid here: this condition implies the parse above ran */
	if((UDM_SRV_TYPE(match_type)==UDM_SERVER_SUBSTR)&&(urlstr[0])&&(load_flags&UDM_FLAG_ADD_SERV)){
		/* Add robots.txt for HTTP*/
		if((!strcmp(from.schema,"http"))&&(srv->use_robots)){
			char str[UDMSTRSIZ];
			len=snprintf(str,sizeof(str),"%s://%s/%s",from.schema,from.hostinfo,"robots.txt");
			/* Skip the robots href rather than queue a truncated URL */
			if((len>0)&&((size_t)len<sizeof(str)))
				UdmAddHref(Conf,str,0,0,0,dst->tag,dst->category);
		}
		UdmAddHref(Conf,srv->url,0,0,0,dst->tag,dst->category);
	}

	Conf->nservers++;
	return(0);

url_too_long:
	Conf->errcode=1;
	sprintf(Conf->errstr,"URL too long");
	return(1);
}



/*
 * Release every entry of Conf->Server and the array itself.
 *
 * All strings duplicated by UdmAddServer() are released, including
 * "user" and "passwd", and a compiled regex is passed to regfree()
 * before its storage is released. Afterwards nservers and mservers
 * are both 0.
 */
void UdmFreeServers(UDM_ENV * Conf){
	size_t i;

	for(i=0;i<Conf->nservers;i++){
		UDM_SERVER * srv=&Conf->Server[i];

		UDM_FREE(srv->url);
		UDM_FREE(srv->alias);
		UDM_FREE(srv->charset);
		UDM_FREE(srv->basic_auth);
		UDM_FREE(srv->proxy_basic_auth);
		UDM_FREE(srv->htdb_list);
		UDM_FREE(srv->htdb_doc);
		UDM_FREE(srv->proxy);
		/* user/passwd are strdup()'ed in UdmAddServer() as well */
		UDM_FREE(srv->user);
		UDM_FREE(srv->passwd);
		UDM_FREE(srv->category);
		UDM_FREE(srv->tag);
		UDM_FREE(srv->mirror_root);
		UDM_FREE(srv->mirror_headers);
		if(srv->regexp){
			/* regfree() releases what regcomp() allocated inside */
			/* the regex_t; the regex_t itself is released after  */
			regfree((regex_t*)(srv->regexp));
			UDM_FREE(srv->regexp);
		}
	}
	Conf->nservers=Conf->mservers=0;
	UDM_FREE(Conf->Server);
}

/* This function finds the Server entry for the given URL   */
/* and returns the alias in "aliastr" if both aliastr and   */
/* the entry's alias are not NULL.                           */
/* "aliastr" must be big enough to store the result:        */
/* not more than UDM_URLSIZE bytes are written to aliastr   */

/*
 * Look up the first entry of Conf->Server that applies to "url".
 *
 * Conf     environment holding the server table
 * url      URL to look up; it is first passed through UdmRemove2Dot()
 * aliastr  optional output buffer of at least UDM_URLSIZE bytes; when
 *          an entry with an alias compares equal, the aliased URL is
 *          written here. May be NULL.
 *
 * An entry is selected when the comparison succeeds and its
 * match_type has UDM_SERVER_MATCH set, or when the comparison fails
 * and UDM_SERVER_MATCH is not set.
 *
 * Returns the selected entry, or NULL when the table is empty or
 * nothing is selected.
 */
UDM_SERVER * UdmFindServer(UDM_ENV * Conf,char *url, char * aliastr){
#define NS 10

	size_t i;
	size_t urllen;
	char *robots=NULL;
	UDM_SERVER * Srv=NULL;

	if(!Conf->Server)return(NULL);
	url=UdmRemove2Dot(url);
	urllen=strlen(url);

	/* If it's a robots.txt, cut to hostinfo and find result */
	if((robots=strstr(url,"/robots.txt"))){
		if(!strcmp(robots,"/robots.txt")){
			/* Keep everything up to and including the '/' */
			robots=strdup(url);
			if(robots)robots[urllen-10]='\0';
		}else{
			robots=NULL;
		}
	}

	for(i=0;i<Conf->nservers;i++){
		UDM_SERVER * cur=&Conf->Server[i];
		int res;
		regmatch_t subs[NS];

		switch(UDM_SRV_TYPE(cur->match_type)){
			case UDM_SERVER_REGEX:
				res=regexec(cur->regexp,url,NS,subs,0);
				if((!res)&&(cur->alias)&&(aliastr)){
					/* Expand "$N" in the alias with sub-match N */
					char *dst=aliastr;
					char *src=cur->alias;

					while((*src)&&((dst-aliastr)<(UDM_URLSIZE-1))){
						if(*src=='$'){
							char digit[2];
							int sub;
							size_t len=0;
							/* Bytes left before the final terminator */
							size_t room=(size_t)((UDM_URLSIZE-1)-(dst-aliastr));

							digit[0]=src[1];
							digit[1]='\0';
							sub=atoi(digit);
							/* A group that did not take part in the */
							/* match has offsets of -1: copy nothing */
							if((subs[sub].rm_so>=0)&&(subs[sub].rm_eo>=subs[sub].rm_so)){
								len=(size_t)(subs[sub].rm_eo-subs[sub].rm_so);
								if(len>room)len=room;
								strncpy(dst,url+subs[sub].rm_so,len);
							}
							dst[len]='\0';
#ifdef DEBUG_ALIAS
							fprintf(stderr,"Match %d-%d '%s'\n",(int)subs[sub].rm_so,(int)subs[sub].rm_eo,dst);
#endif
							dst+=len;
							/* A '$' at the very end has no digit */
							/* behind it: do not step over the    */
							/* terminator                         */
							src+=src[1]?2:1;
						}else{
							*dst++=*src++;
						}
					}
					*dst='\0';
#ifdef DEBUG_ALIAS
					fprintf(stderr,"'%s' '%s' '%s' '%s'\n",url,cur->url,cur->alias,aliastr);
#endif
				}
				break;

			case UDM_SERVER_STRING:
				if(cur->match_type&UDM_SERVER_CS){
					res=UdmStrMatch(url,cur->url);
				}else{
					res=UdmStrCaseMatch(url,cur->url);
				}
				break;

			case UDM_SERVER_SUBSTR:
			default:
				/* NOTE(review): the UDM_STRNCMP comparisons below are */
				/* used regardless of UDM_SERVER_CS, as before; a      */
				/* case-insensitive variant may have been intended -   */
				/* confirm.                                            */
				if(robots){
					res=UDM_STRNCMP(cur->url,robots);
				}else if(cur->follow==UDM_FOLLOW_NO){
					if(cur->match_type&UDM_SERVER_CS){
						res=strcmp(url,cur->url);
					}else{
						res=strcasecmp(url,cur->url);
					}
				}else{
					res=UDM_STRNCMP(url,cur->url);
				}
				if((aliastr)&&(!res)&&(cur->alias)){
					size_t srvlen=strlen(cur->url);
					/* On the robots.txt path the server URL may be */
					/* longer than url; there is no tail to append  */
					/* then, so leave aliastr alone                 */
					if((urllen>=srvlen)&&((urllen-srvlen+strlen(cur->alias))<UDM_URLSIZE)){
						sprintf(aliastr,"%s%s",cur->alias,url+srvlen);
					}
				}
				break;
		}

		if(cur->match_type&UDM_SERVER_MATCH){
			if(!res){
				Srv=cur;
				break;
			}
		}else{
			if(res){
				Srv=cur;
				break;
			}
		}
	}
	UDM_FREE(robots);
	return(Srv);
}

static int cmpserver(const void *s1,const void *s2){
	int res;
	
	if(!(res=strlen(((const UDM_SERVER*)s2)->url)-strlen(((const UDM_SERVER*)s1)->url)))
		res=(((const UDM_SERVER*)s2)->rec_id)-(((const UDM_SERVER*)s1)->rec_id);
	return(res);
}
/*
 * Sort Conf->Server with cmpserver(), which puts entries with longer
 * urls first. That way a server and its subdirectories can carry
 * different options and the more specific entry is found first.
 */
void UdmSortServers(UDM_ENV * Conf){
	UDM_SERVER * table=Conf->Server;
	size_t count=Conf->nservers;

	qsort((void*)table,count,sizeof(*table),cmpserver);
}
/*
 * Fill *srv with default settings: pointer fields are set to NULL,
 * counters to 0 and the tunables to their UDM_* defaults or the
 * constants below. Always returns 0.
 */
__INDLIB__ int UdmInitServer(UDM_SERVER * srv){
	/* Identification and matching */
	srv->rec_id = 0;
	srv->match_type = UDM_SERVER_SUBSTR|UDM_SERVER_MATCH;
	srv->regexp = NULL;
	srv->url = NULL;
	srv->alias = NULL;

	/* String settings */
	srv->charset = UDM_CHARSET_USASCII;
	srv->basic_auth = NULL;
	srv->proxy_basic_auth = NULL;
	srv->htdb_list = NULL;
	srv->htdb_doc = NULL;
	srv->category = NULL;
	srv->tag = NULL;
	srv->user = NULL;
	srv->passwd = NULL;
	srv->lang[0] = '\0';

	/* Proxy */
	srv->proxy = NULL;
	srv->proxy_port = 0;

	/* Mirror */
	srv->mirror_root = NULL;
	srv->mirror_headers = NULL;
	srv->use_mirror = UDM_MIRROR_NO;

	/* Indexing switches */
	srv->period = UDM_DEFAULT_REINDEX_TIME;
	srv->maxhops = UDM_DEFAULT_MAX_HOPS;
	srv->index = 1;
	srv->follow = UDM_FOLLOW_PATH;
	srv->deletebad = 0;
	srv->use_robots = 1;
	srv->use_clones = 1;
	srv->delete_no_server = 1;

	/* Network errors and timeouts */
	srv->net_errors = 0;
	srv->max_net_errors = UDM_MAXNETERRORS;
	srv->read_timeout = UDM_READ_TIMEOUT;
	srv->doc_timeout = UDM_DOC_TIMEOUT;
	srv->net_error_delay_time = UDM_DEFAULT_NET_ERROR_DELAY_TIME;

	/* Weights */
	srv->bodyweight = 2;
	srv->titleweight = 4;
	srv->keywordweight = 8;
	srv->descweight = 16;
	srv->crossweight = 32;
	srv->urlweight = 0;
	srv->urlhostweight = 0;
	srv->urlpathweight = 0;
	srv->urlfileweight = 0;

	/* Words filtering */
	srv->correct_factor = 1;
	srv->incorrect_factor = 1;
	srv->number_factor = 1;
	srv->alnum_factor = 1;

	/* MP3 tag checks */
	srv->check_mp3_tag = 0;
	srv->check_only_mp3_tag = 0;

	return 0;
}
