#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <ctype.h>


#include "udm_config.h"
#include "udm_common.h"
#include "udm_charset.h"
#include "udm_filter.h"
#include "udm_utils.h"
#include "udm_parseurl.h"
#include "udm_log.h"
#include "udm_xmalloc.h"
#include "udm_robots.h"
#include "udm_server.h"
#include "udm_hrefs.h"
#include "udm_word.h"
#include "udm_crossword.h"
#include "udm_parsehtml.h"
#include "udm_agent.h"
#include "udm_spell.h"


/**************************** Built-in Parsers ***************************/

/*
 * Split `content` into words with UdmGetWord() and hand every word to
 * UdmAddWord() with the given `weight` and `check_stopwords` flag.
 *
 * Nothing is done when `weight` is zero or `content` is NULL.
 * NOTE(review): UdmGetWord() is called like a strtok_r-style tokenizer
 * (buffer first, then NULL with a saved position), so it presumably
 * modifies `content` in place -- confirm before passing constant data.
 *
 * Always returns 0.
 */
int UdmParseText(UDM_AGENT * Indexer,UDM_SERVER * Server,char *content,int weight,int check_stopwords){
	char *word;
	char *savepos;

	/* A zero weight or a missing buffer means there is nothing to index */
	if(!weight||!content)
		return(0);

	for(word=UdmGetWord(content,&savepos,Indexer->Conf->local_charset);
	    word!=NULL;
	    word=UdmGetWord(NULL,&savepos,Indexer->Conf->local_charset)){
		UdmAddWord(Indexer,Server,word,weight,check_stopwords);
	}
	return(0);
}

/*
 * Same word loop as UdmParseText(), but every word is additionally
 * registered as a cross-word: a UDM_CROSSWORD whose url is `referree`,
 * whose count is Server->crossweight and whose word is the current
 * token is passed to UdmAddCrossWord().
 *
 * Nothing is done when `weight` is zero or `content` is NULL.
 * Always returns 0.
 */
static int UdmParseCrossText(UDM_AGENT * Indexer,UDM_SERVER * Server,char * referree,char *content,int weight,int check_stopwords){
	char *word;
	char *savepos;

	if(!weight||!content)
		return(0);

	word=UdmGetWord(content,&savepos,Indexer->Conf->local_charset);
	for(;word!=NULL;word=UdmGetWord(NULL,&savepos,Indexer->Conf->local_charset)){
		UDM_CROSSWORD cross;

		/* Index the word for the current document first ... */
		UdmAddWord(Indexer,Server,word,weight,check_stopwords);

		/* ... then record it against the referenced URL */
		cross.word=word;
		cross.url=referree;
		cross.count=Server->crossweight;
		UdmAddCrossWord(Indexer,Server,&cross,check_stopwords);
	}
	return(0);
}


/* HTML parser states */
#define HTML_TAG	1
#define HTML_TXT	2
#define HTML_COM	3

int UdmParseHtml(UDM_AGENT *Indexer,UDM_SERVER *CurSrv,UDM_URL *CurURL,UDM_DOCUMENT *Doc){

	int inbody=0;
	int inscript=0;
	int instyle=0;
	int intitle=0;
	int inhref=0;
	int hrefonly=0;
	int comment=0;
	char *htok;
	char str[UDMSTRSIZ]="";
	char res[UDMSTRSIZ]="";
	char * lasthref=NULL;
	UDM_LANGSTACK HEAD, *TAIL = &HEAD, *tstk;

	int titleweight=CurSrv->titleweight;
	int keywordweight=CurSrv->keywordweight;
	int descweight=CurSrv->descweight;
	int do_index=CurSrv->index;
	int follow=CurSrv->follow;

	strncpy(HEAD.lang, Indexer->lang[Indexer->curlang].lang, 2);
	HEAD.prev = NULL;

	Doc->text=(char*)UdmXmalloc(2*UDM_MAXTEXTSIZE);
	Doc->keywords=(char*)UdmXmalloc(UDM_MAXKEYWORDSIZE);
	Doc->description=(char*)UdmXmalloc(UDM_MAXDESCSIZE);
	Doc->title=(char*)UdmXmalloc(UDM_MAXTITLESIZE);

	strcpy(str,Doc->url);
	hrefonly=(UDM_HREFONLY==UdmFindFilter(Indexer->Conf,str,res));

	htok=Doc->content;

	while(*htok){
		char * href=NULL;
		char * tmp=NULL;
		char * hend;
		char * s;
		int opening;
		UDM_TAG tag;
		int state=HTML_TXT;
		size_t len;

		if(!UDM_STRNCMP(htok,"<!--"))	state=HTML_COM;
		else	if(*htok=='<')		state=HTML_TAG;

		switch(state){
		case HTML_TAG: /* tag */

			href=NULL;
			for(hend=htok;(*hend!='>')&&(*hend);hend++);
			if(*hend=='>')hend++;
			tmp=(char*)UdmXmalloc((size_t)(hend-htok+1));
			strncpy(tmp,htok,(size_t)(hend-htok));tmp[hend-htok]=0;
			
			UdmParseTag(&tag,tmp);

			/* Convert to lower case */
			for(s=tag.tag;*s;*s=tolower(*s),s++);

			/* Detect whether opening or closing tag */
			if((opening=(tag.tag[0]!='/'))) { 
			  s = tag.tag;
			  tstk = (UDM_LANGSTACK*)UdmXmalloc(sizeof(UDM_LANGSTACK));
			  tstk->prev = TAIL;
			  tstk->tag = strdup(s);
			  strncpy(tstk->lang, Indexer->lang[Indexer->curlang].lang, 3);
			  if (tag.lang != NULL) {
			    UdmSelectLang(Indexer, tag.lang);
/*			  fprintf(stderr, "\nTAG: %s LANG: %s\n", tstk->tag, tag.lang);*/
			  }
			  TAIL = tstk;
			} else {				
			  s = tag.tag+1;
			  while((TAIL->prev != NULL) && (strncmp(TAIL->tag, s, 2) != 0) ) {
			    UdmSelectLang(Indexer, TAIL->lang);
			    tstk = TAIL;
			    TAIL = TAIL->prev;
			    UDM_FREE(tstk->tag);
			    UDM_FREE(tstk);
			  }
			  if (TAIL->prev != NULL) {
			    UdmSelectLang(Indexer, TAIL->lang);
			    tstk = TAIL;
			    TAIL = TAIL->prev;
			    UDM_FREE(tstk->tag);
			    UDM_FREE(tstk);
			  }
			}

			/* Let's find tag name in order of frequency */

			if(!strcmp(s,"a")){
				href=tag.href;/*117941*/
				inhref=opening;
				UDM_FREE(lasthref);
			}else	if(!strcmp(s,"title"))	intitle=opening;/*6192*/
			else	
			if(!strcmp(s,"body")){/*5146*/
				if((inbody=opening)){
					int DCIndex;
#ifdef USE_CHARSET_GUESSER					
					DCIndex = UdmGuessCharset(Doc->content, UdmGetCharset(CurSrv->charset));
#else
					if(Indexer->charset){
						DCIndex=Indexer->charset;
					}else{
						DCIndex=UdmGetCharset(CurSrv->charset);
					}
#endif
					UdmRecode(Doc->content, DCIndex,Indexer->Conf->local_charset);
					UdmRecode(Doc->title,   DCIndex,Indexer->Conf->local_charset);
					UdmRecode(Doc->description,DCIndex,Indexer->Conf->local_charset);
					UdmRecode(Doc->keywords,DCIndex,Indexer->Conf->local_charset);
				}
			}else
			if((!strcmp(tag.tag,"meta"))&&(tag.name)&&(tag.content)){
			
				if((!strcasecmp(tag.name,"Content-Type"))&&(!Indexer->charset)){
					char *p;
					if((p=strstr(tag.content,"charset="))){
						Indexer->charset = UdmGetCharset(p + 8);
					}
				}else
				if(!strcasecmp(tag.name,"refresh")){
					/* Format: "10; Url=http://something/" */
					/* URL can be written in different     */
					/* forms: URL, url, Url and so on      */
					
					if((href=strchr(tag.content,'='))){
						if((href>=tag.content+3)&&(!UDM_STRNCASECMP(href-3,"URL="))){
							href=href+1;
						}else{
							href=NULL;
						}
					}
				}else
				if(!strcasecmp(tag.name,"keywords")){
					UdmUnescapeSgmlStr(tag.content);
					strncpy(Doc->keywords,tag.content,UDM_MAXKEYWORDSIZE-1);
					Doc->keywords[UDM_MAXKEYWORDSIZE-1]=0;
					keywordweight*=comment?0:1;
				}else
				if(!strcasecmp(tag.name,"description")){
					UdmUnescapeSgmlStr(tag.content);
					strncpy(Doc->description,tag.content,UDM_MAXDESCSIZE-1);
					Doc->description[UDM_MAXDESCSIZE-1]=0;
					descweight*=comment?0:1;
				}else
				if(!strcasecmp(tag.name,"robots")&&CurSrv->use_robots&&tag.content){
					char * lt;
					char * rtok;
					rtok=UdmGetWord(tag.content,&lt,Indexer->Conf->local_charset);
					while(rtok){
						if(!strcasecmp(rtok,"ALL")){
							/* Set Server parameters */
							follow=CurSrv->follow;
							do_index=CurSrv->index;
						}else
						if(!strcasecmp(rtok,"NONE")){
							follow=UDM_FOLLOW_NO;
							do_index=0;
						}else
						if(!strcasecmp(rtok,"NOINDEX"))
							do_index=0;
						else
						if(!strcasecmp(rtok,"NOFOLLOW"))
							follow=UDM_FOLLOW_NO;
						else
						if(!strcasecmp(rtok,"INDEX"))
							do_index=1;
						else
						if(!strcasecmp(rtok,"FOLLOW"))
							follow=CurSrv->follow;
						rtok=UdmGetWord(NULL,&lt,Indexer->Conf->local_charset);
					}
				}else
				if(( (!strcasecmp(tag.name,"language")) || (!strcasecmp(tag.name, "content-language")) ) && (tag.content)) {
				  UdmSelectLang(Indexer, tag.content);
					if (TAIL!=NULL)
					{
					    tstk=TAIL;
			            	    while(tstk->prev != NULL) {
					      strncpy(tstk->lang, Indexer->lang[Indexer->curlang].lang, 3);
						tstk=tstk->prev;
					    }
					    strncpy(tstk->lang, Indexer->lang[Indexer->curlang].lang, 3);
					}					
				}
			}
			else	if(!strcmp(s,"img"))	href=tag.src;/*2786*/
			else	if(!strcmp(s,"link"))	href=tag.href;/*2241*/
			else	if(!strcmp(s,"frame"))	href=tag.src;
			else	if(!strcmp(s,"script"))	inscript=opening;
			else	if(!strcmp(s,"style"))	instyle=opening;
			else	if(!strcmp(s,"area"))	href=tag.href;
			else	if(!strcmp(s,"noindex"))comment=opening;
			else	if((!strcmp(s,"base"))&&(tag.href)){
				/* <BASE HREF="xxx"> stuff            */
				/* Check that URL is properly formed  */
				/* baseURL is just temporary variable */
				/* If parsing  fails we'll use old    */
				/* base href, passed via CurURL       */
				
				/* Note that we will not check BASE     */
				/* if delete_no_server is unset         */
				/* This is  actually dirty hack. We     */
				/* must check that hostname is the same */

				if((CurSrv->delete_no_server)||(CurSrv->follow==UDM_FOLLOW_WORLD)){
					UDM_URL baseURL;
					int parse_res;
					
					if(!(parse_res=UdmParseURL(&baseURL,tag.href))){
						UdmParseURL(CurURL,tag.href);
					}else{
						switch(parse_res){
						case UDM_PARSEURL_LONG:
							UdmLog(Indexer,UDM_LOG_ERROR,"BASE HREF too long: '%s'",tag.href);
							break;
						case UDM_PARSEURL_BAD:
						default:
							UdmLog(Indexer,UDM_LOG_ERROR,"Error in BASE HREF URL: '%s'",tag.href);
						}
					}
				}
			}

			if((href)&&(follow!=UDM_FOLLOW_NO)){
				UDM_URL newURL;
				UDM_SERVER * FServ;
				int parse_res;
				if(Doc->hops>=CurSrv->maxhops){
					UdmLog(Indexer,UDM_LOG_DEBUG,"Skip \"%s\" : too many hops: %d",href,Doc->hops);
				}else
				if(!(parse_res=UdmParseURL(&newURL,href))){
					char * newschema;
					int Method;
					int add=1;
					char reason[UDMSTRSIZ]="";
					
					if(newURL.schema[0])newschema=newURL.schema;
					else	newschema=CurURL->schema;
#if (WIN32|WINNT)
					if(!strcasecmp(newschema,"htdb")||(!strcasecmp(newschema,"file")&&!newURL.hostinfo[0])){
#else
					if(!strcasecmp(newschema,"file")||!strcasecmp(newschema,"htdb")){
#endif
						sprintf(str,"%s:%s%s",newschema,newURL.path[0]?newURL.path:CurURL->path,newURL.filename);
					}else{
						sprintf(str,"%s://%s%s%s",
						newURL.schema[0]?newURL.schema:CurURL->schema,
						newURL.hostinfo[0]?newURL.hostinfo:CurURL->hostinfo,
						newURL.path[0]?newURL.path:CurURL->path,newURL.filename);
					}
										
					if((newschema=strchr(str,':')))
						UdmRemove2Dot(newschema+1);
					
					if(!UDM_STRNCMP(str,"ftp://")&&(strstr(str,";type=")))
						*(strstr(str,";type"))=0;

					Method=UdmFindFilter(Indexer->Conf,str,reason);
					UdmLog(Indexer,UDM_LOG_DEBUG,"\"%s\": %s",href,reason);

					if((Method==UDM_DISALLOW)){
						add=0;
					}else
					if(!(FServ=UdmFindServer(Indexer->Conf,str,NULL))){
						UdmLog(Indexer,UDM_LOG_DEBUG,"Skip \"%s\" : no Server",href);
						add=0;
					}else
					if(CurSrv->use_robots){
						UDM_ROBOT * R;
						UDM_URL rURL;
						
						/* Parse it again for FindRobots */
						UdmParseURL(&rURL,str);
						if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_LOCK,UDM_LOCK_ROBOTS);
						R=UdmFindRobots(Indexer->Conf,&rURL);
						if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_UNLOCK,UDM_LOCK_ROBOTS);
						if(R){
							add=0;
							UdmLog(Indexer,UDM_LOG_DEBUG,"Skip '%s' : robots.txt '%s'",href,R->path);
							UdmLog(Indexer,UDM_LOG_DEBUG,"Full URL \"%s\"",str);
						}
					}
					if(add){
						
						/* compare hostinfo in some cases */
						if((!CurSrv->delete_no_server)&&(CurSrv->follow!=UDM_FOLLOW_WORLD)){
							if(newURL.hostinfo[0])
								add=!strcmp(CurURL->hostinfo,newURL.hostinfo);
							if((add)&&(newURL.schema[0]))
								add=!strcmp(CurURL->schema,newURL.schema);
						}
						if(add){
							/* FIXME: move to indexer.c */
							if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_LOCK,UDM_LOCK_TARGET);
							
							/* Add URL itself */
							if(UdmAddHref(Indexer->Conf,str,Doc->url_id,Doc->hops+1,0,FServ->tag,FServ->category))
								Indexer->Conf->have_targets=1;
							/* For ranking */
							UDM_FREE(lasthref);
							lasthref=strdup(str);
							
							/* Add robots.txt for HTTP schema */
							/* if robots.txt support enabled  */
							if((!strcasecmp(newURL.schema,"http"))&&(FServ->use_robots)){
								char str1[UDMSTRSIZ]="";
								sprintf(str1,"%s://%s/%s",newURL.schema,newURL.hostinfo,"robots.txt");
								if(UdmAddHref(Indexer->Conf,str1,0,0,0,FServ->tag,FServ->category))
									Indexer->Conf->have_targets=1;
							}
							if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
						}
					}
				}else{
					switch(parse_res){
					case UDM_PARSEURL_LONG:
						UdmLog(Indexer,UDM_LOG_DEBUG,"URL too long: '%s'",href);
						break;
					case UDM_PARSEURL_BAD:
					default:
						UdmLog(Indexer,UDM_LOG_DEBUG,"Error in URL: '%s'",href);
					}
				}
			}
			UdmFreeTag(&tag);
			free(tmp);
			htok=hend;
			break;
 
		case HTML_COM: /* comment */

			if(!UDM_STRNCASECMP(htok,"<!--UdmComment-->"))
				comment=1;
			else
			if(!UDM_STRNCASECMP(htok,"<!--/UdmComment-->"))
				comment=0;

			while(*htok){
				if(!UDM_STRNCMP(htok,"-->")){
					htok+=3;
					break;
				}
				htok++;
			}
			break;

		case HTML_TXT: /* text */

			if(inscript){
				/* Special case when script  */
				/* body is not commented:    */
				/* <script> x="<"; </script> */
				/* We should find </script>  */
				/* in this case:             */
				for(hend=htok;*hend;hend++){
					if(!UDM_STRNCASECMP(hend,"</script>"))
						break;
				}
			}else{
				for(hend=htok;(*hend)&&(*hend!='<');hend++);
			}
			tmp=(char*)UdmXmalloc((size_t)(2*(hend-htok)+6));
			strncpy(tmp,htok,(size_t)(hend-htok));
			tmp[hend-htok+1]='\0';
			UdmUnescapeSgmlStr(tmp);
			/* \240 is &nbsp; */
			UdmStrRemoveDoubleChars(tmp," \240\t\r\n");

			if((!comment&&inbody&&!inscript&&!instyle)&&((len=strlen(Doc->text))<UDM_MAXTEXTSIZE-2)){
				len=UDM_MAXTEXTSIZE-2-len;
				if(Doc->text[0])strcat(Doc->text," ");
				strncat(Doc->text,tmp,len);
				Doc->text[UDM_MAXTEXTSIZE-1]=0;
			}
			if((!comment)&&(intitle)&&((len=strlen(Doc->title))<UDM_MAXTITLESIZE-2)){
				titleweight*=comment?0:1;
				len=UDM_MAXTITLESIZE-2-len;
				if(Doc->title[0])strcat(Doc->title," ");
				strncat(Doc->title,tmp,len);
				Doc->title[UDM_MAXTITLESIZE-1]=0;
			}
			if(!intitle&&!comment&&(!hrefonly)&&do_index&&!inscript&&!instyle){
				if(inhref&&lasthref){
					UdmParseCrossText(Indexer,CurSrv,lasthref,tmp,CurSrv->bodyweight*inbody,1);
				}else{
					UdmParseText(Indexer,CurSrv,tmp,CurSrv->bodyweight*inbody,1);
				}
			}
			free(tmp);
			htok=hend;
			break;
		}
	}
	if(do_index&&titleweight&&!hrefonly){
		strcpy(str,Doc->title);
		UdmParseText(Indexer,CurSrv,str,titleweight,1);
	}
	if(do_index&&keywordweight&&!hrefonly){
		strcpy(str,Doc->keywords);
		UdmParseText(Indexer,CurSrv,str,keywordweight,1);
	}
	if(do_index&&descweight&&!hrefonly){
		strcpy(str,Doc->description);
		UdmParseText(Indexer,CurSrv,str,descweight,1);
	}
	while(TAIL->prev != NULL) {
		UdmSelectLang(Indexer, TAIL->lang);
		tstk = TAIL;
		TAIL = TAIL->prev;
		UDM_FREE(tstk->tag);
		UDM_FREE(tstk);
	}
	UDM_FREE(lasthref);
	return(0);
}
