//
// $Id: thread.cc,v 1.8 2001/12/17 00:43:48 dredd Exp $
//
// $Source: /cvsroot/hammerhead/hammerhead/src/thread.cc,v $
// $Revision: 1.8 $
// $Date: 2001/12/17 00:43:48 $
// $State: Exp $
//
// Main Function for each separate process (thread).
// Author: Geoff Wong
//
#include <stdio.h>
#include <unistd.h>    // usleep()
#include <string.h> // strncmp()
#include "session.h"
#include "scenario.h"
#include "dictionary.h"
#include "statistics.h"

#include <netdb.h> //dns resolutions

#include "config.h"
#include "str.h"

// Tables of candidate scenarios and sequences, defined in another
// translation unit. select_scenario() picks random entries from them.
extern Scenario **scenario_arr;
extern Scenario **sequence_arr;
// Used by select_scenario() as the modulus when choosing a random index
// into scenario_arr and sequence_arr respectively.
extern int scn_size;
extern int seq_size;

extern void shutdown(int);
    // shutdown the thread/process (if called in parent - shuts down the lot)

// global
// Scratch scenario that do_line() re-points at every image request it
// issues.
Scenario simage;

// URLs that Crawl() has already reported; consulted and extended on every
// (recursive) call so that each URL is handled at most once.
list<String> beenCrawled;
// Current nesting depth of Crawl(); printed with each "Found" log line.
int crawlDepth = 0;

//
// Scan one fetched page for href= links and recursively fetch them.
//
//   scn  - scenario whose request produced the page (used for logging)
//   sess - session used to issue the follow-up requests
//   res  - result holding the page, one String per line in res->buffer;
//          must not be null
//
// Every URL not already in the global beenCrawled list is logged
// (subject to LogLevel) and recorded there. URLs that start with the
// configured CrawlRE prefix are requested once the whole page has been
// scanned, and each successfully fetched page is crawled in turn.
// crawlDepth tracks the recursion depth for the log output.
//
void Crawl(Scenario * scn, Session * sess, Result * res)
{
    if (LogLevel & CRAWL_CALLS)
        fprintf(Scenario::logStream, "Crawl( %s )\n", 
                scn->request->c_str());

    String hrefRE = "[Hh][Rr][Ee][Ff]=";    // what we're looking for
    String httpRE = "[Hh][Tt][Tt][Pp]:";    // Protocol (to strip)
    String pinRE = "&pin=[0-9a-zA-Z]*";        // PIN info (to strip)
    String crawlRE = CrawlRE;

    // list of urls to crawl to when current page is completely analysed
    //
    list<String> toCrawl;

    list<String>::iterator i;

    // non-0 if we have already told the user what the source url for the
    // current page is
    //
    int doneSource = 0; 

    // title of the current page (i.e. <title> line)
    //
    String title = "Unknown";

    // go through every line in the current page, looking for hrefs...
    //
    for (i = res->buffer.begin(); i != res->buffer.end(); i++)
    {
        // start of the current line, and where we are up to in it
        //
        const char *lineStart = (*i).c_str();
        const char *cp = lineStart;

        // if this is the title line, just grab it, and skip to next
        //
        if ((LogLevel & CRAWL_SOURCE) && 
            strncmp(cp, "<title>", 7) == 0)
        {
            title = cp;
            continue;
        }

        int startIdx;    // offset of the next href, relative to cp

        // step through every href in the current line
        //
        while ((startIdx = regfind(cp, hrefRE)) != -1)
        {
            // move to the equals sign that ends "href="
            // (e.g. href="/r?a=b&c=d"); never run off the end of the line
            //
            const char *startP = cp + startIdx;
            while (*startP != '=' && *startP != '\0')
                startP++;
            if (*startP == '\0')
                break;
            startP++;    // skip over the equal sign

            // check for quotes around url, and skip if found
            //
            char quote = '\0';
            if (*startP == '"' || *startP == '\'')
            {
                quote = *startP;
                startP++;
            }

            // step through href, looking for its end; endP stops ON the
            // terminating character (closing quote, space, '>' or the
            // end of the line).
            // NOTE: Can't use the regex match length, since it matches
            // as much as possible
            //
            const char *endP = startP;
            while (*endP != quote && *endP != ' ' && *endP != '>' &&
                   *endP != '\0')
                endP++;

            // skip cp on to the next possible href position, taking care
            // not to step past the end of the line
            //
            cp = (*endP == '\0') ? endP : endP + 1;

            // length of matched url; nothing to do for an empty one
            //
            int urlLen = endP - startP;
            if (urlLen == 0)
                continue;

            // matched url string (finally !)
            //
            String urlStr = (*i).substr(startP - lineStart, urlLen);

            reggsub(urlStr,httpRE, "");        // remove the protocol
            reggsub(urlStr,pinRE, "");        // remove any pin

            // check to see if we've already found this url
            //
            list<String>::iterator d;
            for (d = beenCrawled.begin(); d != beenCrawled.end(); d++)
                if ((*d) == urlStr)
                    break;

            if (d != beenCrawled.end())
                continue;

            // didn't find this url, so tell user we've found it, and
            // add it to appropriate lists
            //
            if (LogLevel & CRAWL_FOUND)
            {
                if (!doneSource && (LogLevel & CRAWL_SOURCE))
                {
                    doneSource = 1;
                    fprintf(Scenario::logStream, 
                            "Crawled To %s\n\t%s\n",
                            scn->request->c_str(), title.c_str());
                }
                fprintf(Scenario::logStream, 
                        "\tFound (%4d) %s\n",
                        crawlDepth, urlStr.c_str());
            }
            beenCrawled.push_back(urlStr);

            // check to see whether we should crawl to this page (the url
            // must START with the crawl prefix). If so, add it to the
            // list...
            int matchIdx = urlStr.find(crawlRE, 0);
            if (matchIdx == 0)
            {
                toCrawl.push_back(urlStr);
            }
            else if (LogLevel & CRAWL_IGNORED)
            {
                fprintf(Scenario::logStream, "\tIgnoring %s\n", 
                        urlStr.c_str());
            }
        }
    }

    // having completely analysed the page, do the crawl to the next
    // level... 
    //
    crawlDepth++;
    for (i = toCrawl.begin(); i != toCrawl.end(); i++)
    {
        String req = "GET " + (*i);    // build a valid http request

        Scenario scrawl;
        scrawl.request = &req;

        // try and get the page
        //
        Result *page = scrawl.Request(sess);
        if (LogLevel & CRAWL_REQUEST)
            fprintf(Scenario::logStream, "\tRequested %s", req.c_str());

        if (page == 0 || page->returnCode >= 400)
        {
            // there may be no result at all, so only read the return
            // code when there is one (reported as 0 otherwise)
            //
            int code = (page != 0) ? page->returnCode : 0;
            fprintf(Scenario::logStream,
                    "FAILED: %3d %s\n\tFrom %s\n",
                    code, (*i).c_str(), scn->request->c_str());
            sess->noResultCnt++;
            inc_failures(1);
        }
        else
        {
            // tell the user about any non-trivial ok results
            //
            if (page->returnCode != 200 && (LogLevel & CRAWL_NON_200))
            {
                fprintf(Scenario::logStream,
                        "OK: %3d %s\n\tfrom %s\n",
                        page->returnCode, (*i).c_str(), 
                        scn->request->c_str());
            }
            inc_requests(1, page->timetaken);
            inc_responses(1, page->response);

            Crawl(&scrawl, sess, page);
        }

        // release the result on the failure path too (deleting a null
        // pointer is harmless)
        //
        delete page;
    }
    crawlDepth--;
}

//
// Locate the value of the next image source attribute in a line.
//
//   str - line of text to search
//   x   - offset at which to start searching (negative is treated as 0)
//
// Looks, case-insensitively, for the first "img" at or after offset x
// and then for the first "src=" after it. Returns the index of the
// character immediately following that "src=", or -1 if there is no such
// text (including when x is at or beyond the end of the line).
//
int find_img_src(const String& str, int x) 
{
    const char * orig = str.c_str();

    if (x < 0) x = 0;
    if (x >= (signed)str.length()) return -1;

    for (const char * ind = orig + x; ind[0] != '\0'; ind++)
    {
        // the && chain stops at the terminating '\0', so ind[1] and
        // ind[2] are never read past the end of the string
        if (!((ind[0] == 'i' || ind[0] == 'I') 
                && (ind[1] == 'm' || ind[1] == 'M') 
                && (ind[2] == 'g' || ind[2] == 'G')))
            continue;

        for (const char * src = ind + 3; src[0] != '\0'; src++)
        {
            if ((src[0] == 's' || src[0] == 'S') 
                && (src[1] == 'r' || src[1] == 'R') 
                && (src[2] == 'c' || src[2] == 'C') 
                && (src[3] == '='))
                return (int)(src - orig) + 4;
        }

        // no "src=" follows the first "img", so none can follow a
        // later one either
        return -1;
    }

    return -1;
}

//
// Request every image referenced by one line of a fetched page.
//
//   sess - session used to issue the image requests; sess->scenario
//          supplies the base path for relative urls
//   str  - one line of the page
//
// Each image url found via find_img_src() is requested through the
// global simage scenario, and the outcome is counted in the statistics
// (and in sess->noResultCnt on failure).
//
// NOTE(review): simage.request is left pointing at the local request
// string when this function returns - confirm nothing reads it later.
//
void do_line(Session * sess, String str)
{
    String req;
    const int len = (signed)str.length();
    int x = 0;
    int lastMatch = -1;    // where the previous image url started

    while ((x = find_img_src(str, x)) != -1)
    {
        // never process the same (or an earlier) match twice, otherwise
        // we would request one image for ever
        if (x <= lastMatch) break;
        lastMatch = x;

        if (x >= len) break;

        // skip an opening quote, remembering which kind has to close it
        char quote = '"';
        if (str[x] == '"' || str[x] == '\'')
        {
            quote = str[x];
            x = x + 1;
        }

        // the url ends at the nearest quote, space or '>'
        const char terminators[3] = { quote, ' ', '>' };
        int e = -1;
        for (int t = 0; t < 3; t++)
        {
            int pos = (signed)str.find(terminators[t], x);
            if (pos != -1 && (e == -1 || pos < e)) e = pos;
        }
        if (e == -1) break;    // unterminated url - give up on the line

        // nothing to fetch for an empty url
        if (e == x) continue;

        String url = str.substr(x, e - x);

//        This check need to be better - fails if the img src is
//        a fully qualified URI. But should we be pulling images
//        from somewhere else anyway 

        if (url.c_str()[0] != '/')
        {
            // FIX: it's a relative fetch should add the rest of the path?
            req = "GET " + sess->scenario->request_base + url;
        }
        else
        {
            req = "GET " + url;
        }

        simage.request = &req;
        Result * res = simage.Request(sess); 
        if (res == 0 || res->returnCode >= 400)
        {
            sess->noResultCnt++;
            inc_failures(1);
        }
        else
        {
            inc_requests(1, res->timetaken);
            inc_responses(1, res->response);
        }

        // release the result on the failure path too (deleting a null
        // pointer is harmless)
        delete res;

        x = e;    // carry on searching after this url
    }
}

//
// Fetch the images referenced by a page that has just been retrieved.
//
//   scn  - scenario that produced the page
//   sess - session handed on to do_line() for the image requests
//   res  - result whose buffer holds the page, one line per entry
//
// Nothing is done when the scenario name contains ".gif", ".jpg" or
// ".png", since the result is then taken to be an image itself.
//
void load_images(Scenario * scn, Session * sess, Result * res)
{
    static const char * const imageSuffixes[] = { ".gif", ".jpg", ".png" };
    const unsigned suffixCount = sizeof(imageSuffixes) / sizeof(imageSuffixes[0]);

    // an image has no embedded images of its own
    for (unsigned k = 0; k < suffixCount; k++)
    {
        if ((int)scn->name.find(imageSuffixes[k]) != -1)
            return;
    }

    // hand every line of the page to do_line()
    list<String>::iterator line = res->buffer.begin();
    while (line != res->buffer.end())
    {
        do_line(sess, (*line));
        line++;
    }
}

// Maximum number of random picks attempted before giving up.
#define MAX_LOOP 20

//
// Pick a random entry from the table selected by SelectOn
// (scenario_arr for SO_SCN, sequence_arr for SO_SEQ).
//
//   pin     - id of the calling session, used in error messages
//   context - short description of the caller's situation, used in
//             error messages
//
// Null entries are reported on stderr and the pick is retried, at most
// MAX_LOOP times. Returns the chosen scenario after counting a new
// sequence via inc_sequences(), or 0 if the table is empty or every
// attempt hit a null entry. An unknown SelectOn value terminates the
// program.
//
static Scenario * pick_random_scenario(int pin, const char * context)
{
    for (int attempt = 0; attempt < MAX_LOOP; attempt++)
    {
        Scenario ** pool = 0;
        int poolSize = 0;

        switch(SelectOn)
        {
            case SO_SCN:
                /* select from all the scenarios */
                pool = scenario_arr;
                poolSize = scn_size;
                break;
            case SO_SEQ:
                pool = sequence_arr;
                poolSize = seq_size;
                break;
            default:
                fprintf(stderr, "Unknown selection strategy, aborting\n");
                exit(1);
        }

        // an empty table cannot be indexed (and random() % 0 is undefined)
        if (pool == 0 || poolSize <= 0)
        {
            fprintf(stderr, "%d : ERROR: %s_arr is empty in %s\n", 
                    pin, SO_Names[SelectOn], context);
            return 0;
        }

        int x = random() % poolSize;
        if (pool[x] != 0)
        {
            /* we're starting a new sequence now */
            inc_sequences(1);
            return pool[x];
        }

        fprintf(stderr, "%d : ERROR: %s_arr[%d] = 0 in %s\n", 
                pin, SO_Names[SelectOn], x, context);
    }

    return 0;
}

//
// Choose the scenario to run next.
//
//   scenario - scenario that has just been run, or NULL when starting
//   pin      - id of the calling session, used in log messages
//
// With probability SequenceProbability percent, and only if the current
// scenario has successors, the next scenario of the current sequence is
// taken (scenario->SelectNext()). Otherwise, or when SelectNext() finds
// nothing, a random scenario/sequence is picked instead.
//
// Returns the selected scenario, or NULL (after logging the failure) if
// none could be selected; callers must check for NULL.
//
Scenario * select_scenario(Scenario * scenario, int pin)
{
    if (scenario && scenario->next.size() != 0 &&
        (random() % 100 < SequenceProbability))
    {
        /* follow the next scenario down the current sequence */
        Scenario * prev = scenario;

        scenario = prev->SelectNext();
        if (scenario == 0)
        {
            /* select a new scenario/sequence  - should 
               this really be an error? */
            scenario = pick_random_scenario(pin, "failed sequence");

            // only log (and touch the names) once we really have a
            // replacement; the missing successor itself has no name
            if (scenario != 0) 
                scenario->Log(itoa(pin) +
                          ": Unable to find next scenario after " +
                          prev->name.c_str() + " - randomly chose " +
                          scenario->name.c_str());
        }
    }
    else
    {
        /* select a new sequence/scenario */
        scenario = pick_random_scenario(pin, "random select");
    }

    if (scenario == NULL)
    {
        fprintf(Scenario::logStream, "\tFAILED to select a scenario\n");
    }

    return scenario;
}

//
// Main thread function
//
//   arg - pointer to the ThreadConfig for this thread; it is copied
//         before anything else is done
//
// Creates a Session for the configured target and then, until Shutdown
// is set or no scenario can be selected, repeatedly: (re)resolves the
// target host when DNS lookups are enabled and the TTL has expired,
// issues the current scenario's request, updates the statistics,
// verifies the result, optionally loads images and crawls, sleeps for
// the scenario's wait/think time and selects the next scenario.
//
// When DoCrawl is set the process writes its statistics and exits after
// the first page. Always returns NULL.
//
// Note: this function is way too big and complex.
// Needs to be broken up a bit
//
void * thread_main(void * arg)
{
    Session * sess;
    Scenario * scenario = NULL;
    Result * res;
    long altif;
    struct in_addr address;
    ThreadConfig tc;

    // take our copy of the configuration before sleeping, so we do not
    // depend on *arg staying untouched during the start-up delay
    tc = *(ThreadConfig *)arg;

    // initial random sleep
    // Also reflect that we may be starting many threads.
    // All threads should be going in StartLag * Sessions
    // microseconds
    // (the range must be positive: random() % 0 is undefined)
    if (StartLag > 0 && Sessions > 0)
        usleep(random() % (StartLag * Sessions));

    altif = tc.altif;
    address.s_addr = tc.altif;

    sess = new Session();

    if (altif != 0) 
    {
        fprintf(stderr,"Creating alternate session interface %s\n", inet_ntoa(address));
        sess->AltInterface(altif);
    }

    sess->ssl_port = tc.target.SSLPort;
    sess->hammer_port = tc.target.HammerPort;

    if (UseSSLLayer == 1) // we're using SSL always.
        sess->port = tc.target.SSLPort;
    else
        sess->port = tc.target.HammerPort;

    sess->ip_addr = tc.target.HammerMachine;
    sess->machine = tc.target.HammerIP;

    // select a random scenario to start
    scenario = select_scenario(scenario, sess->pin);
    sess->scenario = scenario;

    // select_scenario() may fail, in which case there is nothing to log
    // and the loop below is skipped
    if (scenario != NULL)
        scenario->Log(" Pid: " + itoa(sess->pin) + ": Starting with " + scenario->name);

    while (Shutdown == 0 && scenario != NULL)
    {
        if (tc.target.DoDNSLookups == 1 && DNSTTLExpired == 1) {
            /* we need to resolv the machine IP */
            struct hostent *host;
            struct in_addr addr;

            host = gethostbyname(tc.target.HammerName.c_str());
            if (host == NULL)
            {
                // keep using the address we already have
                fprintf(stderr, "Unable to resolve Machine_Name Entry. (%s)\n", tc.target.HammerName.c_str());
            }
            else
            {
                memcpy(&addr, host->h_addr, sizeof(addr));

                sess->ip_addr = addr.s_addr;
                sess->machine = inet_ntoa(addr);
            }

            // either way, wait for the next TTL expiry before trying
            // again rather than resolving on every request
            DNSTTLExpired = 0;
            alarm(DNSTimeToLive);
        }

        // make the request
        res = scenario->Request(sess);

        if (res == 0 || res->returnCode >= 400)
        {
            sess->noResultCnt++;
            inc_failures(1);
        }
        else
        {
            inc_requests(1, res->timetaken);
            inc_responses(1, res->response);
            inc_claimed(res->contentLength);
            inc_read(res->readLength);
        }

        // verify - take action - log
        if (!scenario->Verify(sess, res))
        {
            sess->noVerifyCnt++;
            inc_noverify(1);
        }
        else 
        {
            inc_scenarios(1);
            if (LoadImages == 1 && res != 0)
            {
                load_images(scenario,sess,res);
            }
        }

        if (DoCrawl == 1 && res != 0)
        {
            Crawl(scenario, sess, res);
        }

        // print a summary
        sess->printSummary(scenario, res);

        delete res;

        // if we're crawling, bail out after the first page, cos
        // everything will have been done  in Crawl()
        //
        // NOTE(review): this exits with status 1 even though the crawl
        // completed - confirm whether callers rely on that.
        //
        if (DoCrawl == 1) 
        {
            printf("outputting results\n");
            output_statistics(Scenario::reportStream);
            fclose(Scenario::reportStream);
            fclose(Scenario::logStream);
            exit(1);
        }

        if (scenario->wait_until != 0)
        {
            // wait until a known time offset in the future; work in a
            // signed type so that a deadline which has already passed
            // gives a negative value (no sleep) instead of wrapping
            // round to a huge unsigned delay
            long long remaining = 
                ((long long)scenario->wait_until - (long long)gethrtime()) / 1000000;

            if (remaining > 0) usleep((unsigned int)remaining);
        }
        else if (SleepTime || scenario->think)
        {
            // should be random around the given average time
            unsigned int zzz = scenario->think;

            if (SleepTime > 0)
                zzz += random() % (SleepTime * 2);
            usleep(zzz);
        }

        // possibly do a report
        //Report(Scenario::reportStream);

        //
        // check for thread termination
        if (scenario->terminate == true)
        {
            // shutdown the thread/process (if called in parent - shuts
            // down the lot)
            shutdown(0);
        }
        else
        {
            // select the next scenario
            scenario = select_scenario(scenario, sess->pin);
            sess->scenario = scenario;
        }
    }

    // the session was created here and nothing in this function hands
    // it on, so release it here
    delete sess;

    return NULL;
}
