
/**************************************************************************
 *                                                                        *
 *   Copyright (C) 2001 Grub, Inc.                                        *
 *                                                                        *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 1, or (at your option)  *
 *   any later version.                                                   *
 *                                                                        *
 *   This program is distributed in the hope that it will be useful,      *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        *
 *   GNU General Public License for more details.                         *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the Free Software          *
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.            *
 *                                                                        *
 *                                                                        *
 **************************************************************************/

/* Project: GRUB-CLIENT
 * <http://www.grub.org>
 * module: CRW (crawler)
 * Author: Kord Campbell (kord@grub.org)
 * Last revision: September, 2001
 * Files: NewCrawler.cpp NewCrawler.h
 *
 * Class Crawler retrieves the URLs prescribed by the server. It
 * performs this function using the cURL C++ API, available at
 * http://curl.haxx.se.  This version is a re-write of the version
 * authored by Kosta Damevski in Spring, 2001.
 * 
 * Each child process controls a single cURL session, pulling new URLs
 * from the shared memory segment controlled by the parent process.  The
 * older version of this portion of the client used GNU's wget and pipes
 * to communicate with the wget processes.  This evidently caused some
 * synchronization problems that should now be alleviated in the rewrite.
 */

#ifndef _Crawler_H_
#define _Crawler_H_

#include <stdio.h>
#include <ClientDB.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <fcntl.h>
#include <CrwStat.h>
#include <string>
#include <Gui.h>
#include <time.h>
#include <strip_url.h>
#include <pthread.h>
#include <StatusInterface.h>
#include <ConfigFileInfo.h>

// cURL includes
// see http://curl.haxx.se for more info
#include <curl/curl.h>
#include <curl/types.h>
#include <curl/easy.h>

#define MAX_URL_LENGTH 200
#define CRAWL_TIMEOUT 30 
#define CRAWL_LOW_SPEED_LIMIT 500
#define CRAWL_LOW_SPEED_TIME 15
#define MAX_NUMBER_CRAWLERS 200
#define MAX_REDIRECT_LEN 100
#define BANDWIDTH_SAMPLE_TIME 5 
#define CRITICAL_DOWN_AMOUNT 150	// number of DOWN URLs in a row allowed before exiting
#define SIZE_THRESHOLD 100		// size in bytes that needs to be different to notice a change
#define SMALL_PAGE_SIZE 1000		// cuttoff of what we consider a small sized webpage in bytes
#define START_BAND_LIMIT 100000		// we assume that we've used this much bandwidth on startup 
#define FILE_SIZE_LIMIT 1000000		// biggest web page that we will return 
#define BROWSER_VERSION "Mozilla/4.0 (compatible; grub-client-" 
#define LOG_PROPEGANDA "; Crawl your own stuff with http://grub.org)"
extern "C" {
  #include "file_data.h"
  #include <verbose.h>
}

// NOTE(review): not referenced anywhere in this header; presumably a switch
// that enables bandwidth throttling in NewCrawler.cpp -- confirm.
#define THROTTLE_ON 1

// #if defined(__GNU_LIBRARY__) && !defined(_SEM_SEMUN_UNDEFINED)
//     /* union semun is defined by including <sys/sem.h> */
// #else
//    /* according to X/OPEN we have to define it ourselves */
//    union semun {
//        int val;                    /* value for SETVAL */
//        struct semid_ds *buf;       /* buffer for IPC_STAT, IPC_SET */
//        unsigned short int *array;  /* array for GETALL, SETALL */
//        struct seminfo *__buf;      /* buffer for IPC_INFO */
//    };
// #endif


// Bookkeeping for bandwidth limiting while crawling.
// Crawler owns one instance (Bandwidth_Info) and every MemoryStruct carries
// a pointer to one (bandwidth_pointer).
// NOTE(review): units are not stated in this header; the meanings below are
// read off the field names only -- confirm against NewCrawler.cpp.
struct BandwidthInfo
{
	int bandwidth_limit;	// presumably the configured ceiling
	int bandwidth_usage;	// presumably the amount consumed so far
	int bandwidth_time;	// presumably the time the usage figure refers to
};

// struct to contain contents of cURL retrievals
// Crawler::initCurlOpts() takes two of these (chunk and thunk).
// NOTE(review): nothing in this header shows who allocates or frees
// `memory`; check the cURL write callback and its callers before assuming
// ownership.
struct MemoryStruct
{
	char *memory;	// start of the retrieved data
	size_t size;	// presumably the number of bytes currently held in memory -- confirm
	BandwidthInfo *bandwidth_pointer; // pointer to our bandwidth share 
};

// vehicle for passing URLs, content and header result addresses
// between the parent crawler process and a number of children
//
// Crawler keeps MAX_NUMBER_CRAWLERS of these in Crawler_Info[].
// NOTE(review): the comments here speak of processes, but crawl_pid is a
// pthread_t, so the children appear to be threads.
// NOTE(review): the five flags at the end are called semaphores but are
// plain bools; this header shows no lock or atomic protecting them.  If they
// are polled from more than one thread, plain bool gives no atomicity or
// visibility guarantee -- verify how NewCrawler.cpp synchronises them.
struct CrawlInfo
{
	pthread_t crawl_pid;		// thread identifier of the child using this slot
	int crawl_error_code;
	int crawl_result_length;	// presumably byte count behind crawl_result_address -- confirm
	int crawl_header_length;	// presumably byte count behind crawl_header_address -- confirm
	int crawl_download_speed;
	
	// addresses of the retrieved content and header data (ownership not
	// visible from this header)
	char *crawl_result_address;
	char *crawl_header_address;

	long crawl_http_code;

	// fixed-size buffers: at most MAX_URL_LENGTH - 1 characters plus terminator
	char crawl_url[MAX_URL_LENGTH];
	char crawl_host[MAX_URL_LENGTH];

	bool crawl_done; // semaphore for telling parent that url content is ready
	bool crawl_go;	 // semaphore for telling child that a url is ready to be crawled
	bool crawl_wait; // semaphore for telling child to wait to submit url results
	bool crawl_exit; // semaphore for telling a child to exit at next chance
	bool crawl_dead; // semaphore for telling parent that a child exited cleanly
};

// Crawler retrieves the URLs prescribed by the server using cURL (see the
// comment at the top of this file).  Only declarations live here; the file
// header names NewCrawler.cpp as the implementation file.
//
// One Crawler embeds MAX_NUMBER_CRAWLERS CrawlInfo slots, one per child
// crawler, plus a single BandwidthInfo.
//
// NOTE(review): the class has a destructor and raw pointer members but does
// not restrict copying.  Whether the destructor releases any of those
// pointers is not visible from this header -- if it does, copying a Crawler
// would be unsafe and should be disabled.
class Crawler
{
	public:
		// Standard class stuff
		// The constructor is explicit so that a bare ClientDB* can never be
		// converted into a Crawler implicitly; direct construction
		// (Crawler c(db); new Crawler(db)) is unaffected.
		explicit Crawler(ClientDB *clientdb);
		~Crawler();
		
		// Start and stop crawler	
		// Both return int; the meaning of the value is defined in the
		// implementation file, not in this header.
		int start();
		int end();
		 
		// Public flag, readable and writable by any code holding the object.
		// presumably set to request shutdown (see signaledEnd) -- confirm
		bool quit;
		void signaledEnd();

		// needs to be called from thread main func.
		// MOVE THIS TO PRIVATE!!!
		// (it stays public because the thread entry function has to reach it)
		// child_num: presumably the index of the CrawlInfo slot in
		// Crawler_Info[] that the child works on -- confirm
		void startChildCrawler(int child_num);

	private:
		// variables
		int num_of_children_crawlers;
		int maximum_bandwidth;
		int temp_down;
	
		// max threads stuff
		int max_threads_per_host;

		// bandwidth limiting variables
		time_t band_start_time;
		time_t band_end_time;
		int throughput_start;
		int throughput_end;

		// structs for passing around crawl info and results
		BandwidthInfo Bandwidth_Info;
		CrawlInfo Crawler_Info[MAX_NUMBER_CRAWLERS]; 

		// child number
		int my_child_num;

		// functions
		// NOTE(review): the parameter notes below are read off the signatures
		// only; check NewCrawler.cpp for the actual contracts.

		// child_num selects a crawler slot; flag's meaning is not visible here.
		void resetCrawlInfoValues(int child_num, bool flag);

		// more_urls and multi_hosts are pointers to bool, so presumably
		// written as outputs -- confirm.
		void manageChildInfo(int num_crawlers, bool *more_urls, bool *multi_hosts);
		// Takes a cURL easy handle and two MemoryStruct buffers (chunk, thunk).
		void initCurlOpts(CURL *curl_handle, struct MemoryStruct *chunk, struct MemoryStruct *thunk);
		// mime_url and redirect_url are non-const references, so presumably
		// filled in by the call -- confirm.
		void parseHeaderResults(int child_num, std::string &mime_url, std::string &redirect_url);
		void adjustBandwidth();

		bool checkForMultipleHosts(char *host, int num_crawlers, int max_num_hosts);

		int checkChildCrawlers(int num_crawlers, bool *more_urls);
		int countOfEngagedCrawlers(int num_crawlers);

		// Client database access
		// Same name as the constructor parameter.
		ClientDB *clientdb;

		// Client URL Database Handle
		URLHandler **dbhandle;

		// CRC calculation stuff			
		file_data_t *page_file_data;
		
		// Statistics module		
		CrwStat *Cstat;
};

#endif
