/*
 * scamper_do_trace.c
 *
 * $Id: scamper_do_trace.c,v 1.135.2.15 2008/04/08 19:03:42 mjl Exp $
 *
 * Copyright (C) 2005-2008 The University of Waikato
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 * 
 */

#include <sys/param.h>

#if !defined(__sun__)
#include <sys/sysctl.h>
#endif

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>

#include <net/if.h>

#if defined(__linux__)
#define __FAVOR_BSD
#else
#include <net/if_dl.h>
#endif

#if defined(__sun__)
#define BSD_COMP
#endif

#include <sys/ioctl.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/udp.h>
#include <netinet/icmp6.h>
#include <netinet/tcp.h>

#if defined(__APPLE__)
#include <stdint.h>
#endif

#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>

#if defined(DMALLOC)
#include <dmalloc.h>
#endif

#include "scamper.h"
#include "scamper_addr.h"
#include "scamper_list.h"
#include "scamper_tlv.h"
#include "scamper_trace.h"
#include "scamper_task.h"
#include "scamper_queue.h"
#include "scamper_icmp_resp.h"
#include "scamper_fds.h"
#include "scamper_dl.h"
#include "scamper_probe.h"
#include "scamper_rtsock.h"
#include "scamper_privsep.h"
#include "scamper_getsrc.h"
#include "scamper_file.h"
#include "scamper_outfiles.h"
#include "scamper_addresslist.h"
#include "scamper_debug.h"
#include "scamper_do_trace.h"
#include "scamper_addr2mac.h"
#include "scamper_options.h"
#include "scamper_icmp4.h"
#include "scamper_icmp6.h"
#include "scamper_tcp4.h"
#include "scamper_udp4.h"
#include "scamper_udp6.h"
#include "utils.h"

/*
 * limits and defaults for the numeric trace options.
 *
 * each option has a _MIN / _MAX pair and, where one is defined, a _DEF
 * default value.  presumably these bound the values accepted for the
 * options listed in trace_opts_in -- the code that applies them is not
 * in this part of the file.
 */

/* number of attempts per hop */
#define SCAMPER_DO_TRACE_ATTEMPTS_MIN  1
#define SCAMPER_DO_TRACE_ATTEMPTS_DEF  2
#define SCAMPER_DO_TRACE_ATTEMPTS_MAX  9

/* destination port */
#define SCAMPER_DO_TRACE_DPORT_MIN     1
#define SCAMPER_DO_TRACE_DPORT_DEF     (32768+777)
#define SCAMPER_DO_TRACE_DPORT_MAX     65535

/* first hop (TTL) to probe */
#define SCAMPER_DO_TRACE_FIRSTHOP_MIN  1
#define SCAMPER_DO_TRACE_FIRSTHOP_DEF  1
#define SCAMPER_DO_TRACE_FIRSTHOP_MAX  255

/* gap limit */
#define SCAMPER_DO_TRACE_GAPLIMIT_MIN  1
#define SCAMPER_DO_TRACE_GAPLIMIT_DEF  5
#define SCAMPER_DO_TRACE_GAPLIMIT_MAX  255

/* gap action; the default is defined in terms of the trace library value */
#define SCAMPER_DO_TRACE_GAPACTION_MIN 1
#define SCAMPER_DO_TRACE_GAPACTION_DEF SCAMPER_TRACE_GAPACTION_STOP
#define SCAMPER_DO_TRACE_GAPACTION_MAX 2

/* hold time */
#define SCAMPER_DO_TRACE_HOLDTIME_MIN  0
#define SCAMPER_DO_TRACE_HOLDTIME_DEF  0
#define SCAMPER_DO_TRACE_HOLDTIME_MAX  255

/* hop limit */
#define SCAMPER_DO_TRACE_HOPLIMIT_MIN  0
#define SCAMPER_DO_TRACE_HOPLIMIT_DEF  0
#define SCAMPER_DO_TRACE_HOPLIMIT_MAX  255

/* number of loops; compared against the running loop count in trace_isloop */
#define SCAMPER_DO_TRACE_LOOPS_MIN     0
#define SCAMPER_DO_TRACE_LOOPS_DEF     1 /* stop on the first loop found */
#define SCAMPER_DO_TRACE_LOOPS_MAX     255

/* loop action; see trace_isloop for how the trace's loopaction is used */
#define SCAMPER_DO_TRACE_LOOPACTION_MIN 0
#define SCAMPER_DO_TRACE_LOOPACTION_DEF 0
#define SCAMPER_DO_TRACE_LOOPACTION_MAX 1

/* packets per second (note: _MAX is listed before _DEF for this option) */
#define SCAMPER_DO_TRACE_PPS_MIN       1
#define SCAMPER_DO_TRACE_PPS_MAX       1000
#define SCAMPER_DO_TRACE_PPS_DEF       20

/* source port; no default is defined here */
#define SCAMPER_DO_TRACE_SPORT_MIN     1
#define SCAMPER_DO_TRACE_SPORT_MAX     65535

/* type of service byte */
#define SCAMPER_DO_TRACE_TOS_MIN 0
#define SCAMPER_DO_TRACE_TOS_DEF 0
#define SCAMPER_DO_TRACE_TOS_MAX 255

/* wait time; the unit is not established in this part of the file */
#define SCAMPER_DO_TRACE_WAIT_MIN   1
#define SCAMPER_DO_TRACE_WAIT_DEF   5
#define SCAMPER_DO_TRACE_WAIT_MAX   10

/*
 * pmtud_L2_state
 *
 * this struct records state when inferring the MTU of the underlying media.
 *
 * when scamper has to discover the MTU of the link itself, it uses the L2
 * table (defined later in this file) to choose a suitable initial guess.
 * it records the index into the L2 table in the idx member.
 *
 * pmtud_L2_init starts a search with lower and out set to -1, and with
 * upper and in set to the size of the probe that went unanswered.
 */
typedef struct pmtud_L2_state
{
  int                  idx;   /* index into the L2 table */
  int                  lower; /* lower bounds of the L2 search space */
  int                  upper; /* upper bounds of the L2 search space */
  int                  in;    /* probe size not to get a suitable response */
  int                  out;   /* size of probe to infer the underlying MTU */
  scamper_trace_hop_t *hop;   /* the last probe to obtain a response */
} pmtud_L2_state_t;

/*
 * pmtud_TTL_state
 *
 * this struct records state when inferring the TTL range of hops that
 * are responsible for not sending a fragmentation required message where
 * one is required.
 *
 * the struct is allocated zeroed by pmtud_TTL_init; lower and upper are
 * then filled in by pmtud_TTL_set_probettl each time it picks a TTL.
 */
typedef struct pmtud_TTL_state
{
  int                  lower; /* lower bounds of the TTL search space */
  int                  upper; /* upper bounds of the TTL search space */
  scamper_trace_hop_t *hop;   /* the last TTL probe to obtain a response */
} pmtud_TTL_state_t;

/*
 * pmtud_L2
 *
 * this struct associates a known MTU with an index into an array.
 * one instance exists per row of the file-scope L2 table; idx repeats the
 * row's own position so that a pointer to a row (e.g. L2_1500) can be
 * turned back into a table index.
 */
typedef struct pmtud_L2
{
  int   idx;            /* index into the L2 array where this node resides */
  int   mtu;            /* the MTU of the link */
  char *descr;          /* some description of the L2 media */
} pmtud_L2_t;

/*
 * trace_probe
 *
 * this struct keeps state of each probe sent with the trace
 *
 * trace_hop copies ttl, size and id (plus one) from this record into each
 * hop record it creates for a response to the probe.
 */
typedef struct trace_probe
{
  struct timeval  tx_tv;  /* the time we transmitted the probe */
  struct timeval  rx_tv;  /* the time we received the first answer */
  scamper_addr_t *rx_mac; /* the mac addr where we received the first answer */
  uint16_t        rx;     /* how many responses scamper got to the probe */
  uint16_t        size;   /* the size of the probe sent */
  uint8_t         ttl;    /* the TTL that was set for the probe */
  uint8_t         id;     /* the attempt number made with ttl/size params */
  uint8_t         mode;   /* the mode scamper was in when probe was sent */
  uint8_t         flags;  /* the probe's flags */
} trace_probe_t;

/* trace_probe.flags: a datalink transmit timestamp was recorded */
#define TRACE_PROBE_FLAG_DL_TX   0x01
/* trace_probe.flags: a datalink receive timestamp was recorded */
#define TRACE_PROBE_FLAG_DL_RX   0x02
/*
 * presumably the number of hop slots by which trace->hops is grown (see
 * trace_state.alloc_hops); its use is not in this part of the file
 */
#define TRACE_ALLOC_HOPS         16

/*
 * trace_state
 *
 * this is a fairly large struct that keeps state for the traceroute
 * process.  it also deals with state in the PMTUD phase, if used.
 *
 * the L2 and TTL members are only non-NULL while the corresponding PMTUD
 * search is in progress: they are allocated by pmtud_L2_init and
 * pmtud_TTL_init, and released by pmtud_L2_search_end.
 */
typedef struct trace_state
{
  uint8_t              mode;          /* current trace mode scamper is in */

  uint8_t              ttl;           /* ttl to set in the probe packet */
  uint8_t              attempt;       /* attempt number at the current probe */
  uint8_t              loopc;         /* count of loops so far */
  uint8_t              iloopc;        /* count of ignored loops */
  uint16_t             alloc_hops;    /* number of trace->hops allocated */
  uint16_t             payload_size;  /* how much payload to include */
  uint16_t             header_size;   /* size of headers */

  scamper_fd_t        *route;         /* fd to query route socket with */
  scamper_fd_t        *icmp;          /* fd to listen to icmp packets with */
  scamper_fd_t        *probe;         /* fd to probe with */
  scamper_fd_t        *dl;            /* struct to use with datalink access */

  uint8_t             *dl_hdr;        /* header to use with datalink */
  uint16_t             dl_size;       /* how large the header is */

  trace_probe_t      **probes;        /* probes sent so far */
  uint16_t             id_next;       /* next id to use in probes */
  uint16_t             id_max;        /* maximum id available */

  pmtud_L2_state_t    *L2;           /* state kept when doing L2 MTU search */
  pmtud_TTL_state_t   *TTL;          /* state kept when doing a TTL search */
  scamper_trace_hop_t *last_fragmsg; /* last fragmentation msg stored */
} trace_state_t;

/*
 * the modes a trace can be in.  the current mode is kept in
 * trace_state.mode, and the mode in effect when a probe was sent is kept
 * in trace_probe.mode.
 *
 * NOTE(review): these are not declared static, so they have external
 * linkage -- confirm whether any other file depends on that.
 */
const uint8_t MODE_TRACE            = 0;
const uint8_t MODE_LASTDITCH        = 1;
const uint8_t MODE_PMTUD_DEFAULT    = 2;
const uint8_t MODE_PMTUD_SILENT_L2  = 3;
const uint8_t MODE_PMTUD_SILENT_TTL = 4;
const uint8_t MODE_PMTUD_BADSUGG    = 5;
const uint8_t MODE_RTSOCK           = 6;

/* the lowest and highest mode values defined above */
#define MODE_MIN             MODE_TRACE
#define MODE_MAX             MODE_RTSOCK

/* the callback functions registered with the trace task */
static scamper_task_funcs_t trace_funcs;

/*
 * address cache used to avoid reallocating the same address multiple times.
 * defined in another file; trace_hop obtains hop addresses from it.
 */
extern scamper_addrcache_t *addrcache;

/* temporary buffer shared amongst traceroutes, and its allocated length */
static uint8_t *pktbuf     = NULL;
static size_t   pktbuf_len = 0;

/*
 * socket used for interface ioctls: if_getmtu uses it to obtain the MTU of
 * an interface, and the linux / solaris if_getmac uses it to obtain the
 * hardware address.  -1 until it has been opened.
 */
static int   if_sock = -1;

/*
 * these MTUs were largely taken from the NetBSD version of traceroute, and
 * are used to choose a packet size to probe with in the absence of a
 * Fragmentation Needed message.
 *
 * they have been annotated with their corresponding Layer 2 type, largely
 * taken from RFC 1191
 *
 * the rows are in ascending MTU order and the first member of each row is
 * the row's own index; pmtud_L2_init and pmtud_L2_set_probesize compare
 * neighbouring rows when walking the table.
 */
static const pmtud_L2_t L2[] =
{
  { 0,    68, "RFC791 MTU"},    /* Official RFC 791 minimum MTU */
  { 1,   296, "P2P low delay"}, /* Point-to-Point links, (low delay) */
  { 2,   508, ""},
  { 3,   512, "NetBIOS"},       /* NetBIOS */
  { 4,   544, "DEC Portal"},    /* DEC IP Portal */
  { 5,   552, ""},
  { 6,   576, "v4 min MTU"},    /* X25 MTU, IPv4 Minimum MTU */
  { 7,  1006, "SLIP"},          /* SLIP */
  { 8,  1280, "v6 min MTU"},    /* IPv6 Minimum MTU */
  { 9,  1454, "PPPoE ADSL"},    /* an optimally sized PPPoE frame in DSL */
  {10,  1480, "v4tun Ether"},   /* Ethernet MTU with tunnel over IPv4 */
  {11,  1492, "IEEE 802.3"},    /* IEEE 802.3 */
  {12,  1500, "Ethernet"},      /* Ethernet MTU */
  {13,  1514, "Ethernet Max"},  /* Ethernet Max MTU */
  {14,  1536, "Exp. Ether"},    /* Exp. Ethernet Nets */
  {15,  2002, "IEEE 802.5"},    /* IEEE 802.5, Recommended MTU */
  {16,  2048, "Wideband"},      /* Wideband Network */
  {17,  4352, "FDDI"},          /* FDDI */
  {18,  4464, "IEEE 802.5"},    /* IEEE 802.5, Maximum MTU */
  {19,  4470, "IP over ATM"},   /* ATM / T3 / SONET SDH */
  {20,  8166, "IEEE 802.4"},    /* IEEE 802.4 */
  {21,  9000, "Broadcom GigE"}, /* Broadcom GigE MTU */
  {22,  9192, "OC-192"},        /* OC-192 and other really fast media */
  {23, 16110, "Intel GigE"},    /* Intel Pro 1000 MTU */
  {24, 17914, "Token Ring"},    /* 16Mb IBM Token Ring */
  {25, 65535, "IPv[46] MTU"}    /* The IPv[46] Maximum MTU */
};

/* shortcuts to the 1454 and 1500 byte rows, used by pmtud_L2_init */
static const pmtud_L2_t *L2_1454 = &L2[9];
static const pmtud_L2_t *L2_1500 = &L2[12];
/* number of rows in the L2 table */
static const int         L2_cnt  = sizeof(L2) / sizeof(pmtud_L2_t);

/* identifiers for each command line option the trace code understands */
#define TRACE_OPT_DPORT       1
#define TRACE_OPT_FIRSTHOP    2
#define TRACE_OPT_GAPLIMIT    3
#define TRACE_OPT_GAPACTION   4
#define TRACE_OPT_LOOPS       5
#define TRACE_OPT_LOOPACTION  6
#define TRACE_OPT_MAXTTL      7
#define TRACE_OPT_PMTUD       8
#define TRACE_OPT_PROTOCOL    9
#define TRACE_OPT_ATTEMPTS    10
#define TRACE_OPT_ALLATTEMPTS 11
#define TRACE_OPT_SPORT       12
#define TRACE_OPT_TOS         13
#define TRACE_OPT_WAIT        14

/*
 * table associating each single-character option with its TRACE_OPT_*
 * identifier and the type of argument it takes (number, string, or none).
 * the second member is NULL for every entry; presumably it is a long
 * option name -- confirm against scamper_options.h.
 */
static const scamper_option_in_t trace_opts_in[] = {
  {'d', NULL, TRACE_OPT_DPORT,       SCAMPER_OPTION_TYPE_NUM},
  {'f', NULL, TRACE_OPT_FIRSTHOP,    SCAMPER_OPTION_TYPE_NUM},
  {'g', NULL, TRACE_OPT_GAPLIMIT,    SCAMPER_OPTION_TYPE_NUM},
  {'G', NULL, TRACE_OPT_GAPACTION,   SCAMPER_OPTION_TYPE_NUM},
  {'l', NULL, TRACE_OPT_LOOPS,       SCAMPER_OPTION_TYPE_NUM},
  {'L', NULL, TRACE_OPT_LOOPACTION,  SCAMPER_OPTION_TYPE_NUM},
  {'m', NULL, TRACE_OPT_MAXTTL,      SCAMPER_OPTION_TYPE_NUM},
  {'M', NULL, TRACE_OPT_PMTUD,       SCAMPER_OPTION_TYPE_NULL},
  {'P', NULL, TRACE_OPT_PROTOCOL,    SCAMPER_OPTION_TYPE_STR},
  {'q', NULL, TRACE_OPT_ATTEMPTS,    SCAMPER_OPTION_TYPE_NUM},
  {'Q', NULL, TRACE_OPT_ALLATTEMPTS, SCAMPER_OPTION_TYPE_NULL},
  {'s', NULL, TRACE_OPT_SPORT,       SCAMPER_OPTION_TYPE_NUM},
  {'t', NULL, TRACE_OPT_TOS,         SCAMPER_OPTION_TYPE_NUM},
  {'w', NULL, TRACE_OPT_WAIT,        SCAMPER_OPTION_TYPE_NUM},
};
/* number of entries in trace_opts_in */
static const int trace_opts_cnt = SCAMPER_OPTION_COUNT(trace_opts_in);

/*
 * if_getmtu
 *
 * given an interface index, return the MTU of it.  return zero if
 * we can't get the interface's MTU.
 */
static int if_getmtu(const int ifindex, uint16_t *ifmtu)
{
  struct ifreq ifr;
  int mtu;

  assert(ifindex >= 0);

  /* given the index, return the interface name to query */
  if(if_indextoname((unsigned int)ifindex, ifr.ifr_name) == NULL)
    {
      printerror(errno, strerror, __func__, "could not if_indextoname");
      return -1;
    }

  if(ioctl(if_sock, SIOCGIFMTU, &ifr) == -1)
    {
      printerror(errno, strerror, __func__, "could not SIOCGIFMTU");
      return -1;
    }

#if defined(__sun__)
  mtu = ifr.ifr_metric;
#else
  mtu = ifr.ifr_mtu;
#endif

  if(mtu >= 0 && mtu <= 65535)
    {
      *ifmtu = mtu;
      return 0;
    }

  return -1;
}

/*
 * if_getmac
 *
 * copy the 6-byte link-layer (MAC) address of the interface identified by
 * ifindex into the buffer at mac, which must have room for 6 bytes.
 *
 * returns 0 on success, and -1 if the address could not be obtained.
 *
 * linux and solaris query the address with an ioctl on if_sock; all other
 * systems fetch the interface's record with the NET_RT_IFLIST sysctl and
 * read the address out of the sockaddr_dl that follows the message header.
 */
#if defined(__linux__) || defined(__sun__)
static int if_getmac(const int ifindex, uint8_t *mac)
{
  struct ifreq ifr;

  /* the ioctls are keyed on the interface's name, so resolve that first */
  if(if_indextoname((unsigned int)ifindex, ifr.ifr_name) == NULL)
    {
      printerror(errno, strerror, __func__, "could not if_indextoname");
      return -1;
    }

#if defined(__linux__)
  if(ioctl(if_sock, SIOCGIFHWADDR, &ifr) == -1)
    {
      printerror(errno, strerror, __func__, "could not SIOCGIFHWADDR");
      return -1;
    }
  memcpy(mac, ifr.ifr_hwaddr.sa_data, 6);
#elif defined(__sun__)
  if(ioctl(if_sock, SIOCGENADDR, &ifr) == -1)
    {
      printerror(errno, strerror, __func__, "could not SIOCGENADDR");
      return -1;
    }
  memcpy(mac, ifr.ifr_enaddr, 6);
#endif

  return 0;
}
#else
static int if_getmac(const int ifindex, uint8_t *mac)
{
  struct sockaddr_dl *sdl;
  int                 mib[6];
  size_t              len;
  uint8_t            *buf;

  mib[0] = CTL_NET;
  mib[1] = AF_ROUTE;
  mib[2] = 0;
  mib[3] = AF_LINK;
  mib[4] = NET_RT_IFLIST;
  mib[5] = ifindex;

  /* first call: learn how large the reply is going to be */
  if(sysctl(mib, 6, NULL, &len, NULL, 0) == -1)
    {
      printerror(errno, strerror, __func__, "could not sysctl buflen");
      return -1;
    }

  if((buf = malloc(len)) == NULL)
    {
      printerror(errno, strerror, __func__, "could not malloc buf");
      return -1;
    }

  /* second call: fetch the reply; len is updated to the amount returned */
  if(sysctl(mib, 6, buf, &len, NULL, 0) < 0)
    {
      printerror(errno, strerror, __func__, "could not sysctl data");
      free(buf);
      return -1;
    }

  /*
   * the reply has to hold at least the message header and the sockaddr_dl
   * that follows it before either can be looked at; an unknown ifindex
   * can produce a short (or empty) reply.
   */
  if(len < sizeof(struct if_msghdr) + sizeof(struct sockaddr_dl))
    {
      scamper_debug(__func__, "sysctl reply too short for ifindex %d",
		    ifindex);
      free(buf);
      return -1;
    }

  sdl = (struct sockaddr_dl *)(buf+sizeof(struct if_msghdr));

  /* the 6 address bytes follow the interface name; keep the copy in bounds */
  if((uint8_t *)LLADDR(sdl) + 6 > buf + len)
    {
      scamper_debug(__func__, "link-layer address outside sysctl reply");
      free(buf);
      return -1;
    }

  memcpy(mac, LLADDR(sdl), 6);

  free(buf);
  return 0;
}
#endif

/*
 * pmtud_L2_hop
 *
 * swap the hop record cached in the L2 MTU search state for the one
 * supplied.  any hop record previously cached is freed first; the supplied
 * hop (which may be NULL) is stored in its place.
 */
static void pmtud_L2_hop(trace_state_t *state, scamper_trace_hop_t *hop)
{
  pmtud_L2_state_t *search = state->L2;

  /* an L2 search must be in progress */
  assert(search != NULL);

  if(search->hop != NULL)
    {
      scamper_trace_hop_free(search->hop);
    }

  search->hop = hop;
}

/*
 * pmtud_L2_set_probesize
 *
 * given the lower and upper values of the PMTU search, suggest a packet
 * size to probe next.  apply a few heuristics to the search to try and
 * find the PMTU to the next node faster.
 *
 * on return the attempt counter is reset, state->payload_size is set so
 * that header plus payload equals the chosen size, and the L2 search state
 * records the new table index and the lower / upper bounds passed in.
 *
 * the neighbouring table entries L2[idx+1] and L2[idx-1] are only examined
 * when they exist; at either end of the table the corresponding heuristic
 * is skipped and the search falls through to the next rule.
 */
static void pmtud_L2_set_probesize(trace_state_t *state,
				   const int lower, const int upper)
{
  int idx = state->L2->idx;
  int size;

  /* callers should detect end of L2 search before calling this function */
  assert(lower + 1 != upper);

  /* make sure the L2->idx parameter has been set (to something reasonable) */
  assert(idx >= 0);
  assert(idx < L2_cnt);

  /* make sure the suggested window size is within the current window */
  assert(state->L2->lower == -1 || lower >= state->L2->lower);
  assert(state->L2->upper == -1 || upper <= state->L2->upper);

  /*
   * if we've narrowed it down to between two entries in the L2 table,
   * then try one byte higher than the lower, as there's a fair chance
   * the underlying mtu will be L2[idx].mtu.
   *
   * we make an exception if the lower bounds is Ethernet: there exists
   * a strong possibility the underlying MTU is Ethernet, and the cost
   * of guessing wrong [i.e. getting an unexpected response] is small.
   */
  if(lower == 1500 ||
     (lower == L2[idx].mtu && idx + 1 < L2_cnt && upper <= L2[idx+1].mtu))
    {
      size = lower + 1;
    }
  /*
   * if there is a media MTU higher than the current lower bounds that
   * is smaller than the upper bounds, then try it
   */
  else if(lower >= L2[idx].mtu && idx + 1 < L2_cnt && L2[idx+1].mtu < upper)
    {
      size = L2[++idx].mtu;
    }
  /*
   * if we did not get a response to the last media MTU probe, and there
   * is a smaller known media MTU to try, then try it now
   */
  else if(upper == L2[idx].mtu && idx > 0 && lower < L2[idx-1].mtu)
    {
      size = L2[--idx].mtu;
    }
  /*
   * scamper is operating between two known MTU types, do a binary chop
   */
  else
    {
      size = (lower + upper) / 2;
    }

  state->attempt = 0;
  state->payload_size = size - state->header_size;
  state->L2->idx = idx;
  state->L2->lower = lower;
  state->L2->upper = upper;

  return;
}

/*
 * pmtud_L2_init
 *
 * begin an L2 MTU search.  the size of the probe that went unanswered
 * (headers plus payload) is used to pick a starting row in the L2 table,
 * a fresh search state is allocated and attached to state->L2, and the
 * payload size is set so the next probe matches the chosen row's MTU.
 *
 * returns 0 on success, -1 if the search state could not be allocated.
 *
 * NOTE(review): if the unanswered size is not larger than L2[0].mtu the
 * table scan below matches nothing and finishes on the last row, so the
 * next probe would be sized from the largest MTU in the table.  confirm
 * that callers never get here with a size that small.
 */
static int pmtud_L2_init(trace_state_t *state)
{
  const int unanswered = state->header_size + state->payload_size;
  pmtud_L2_state_t *search;
  int row;

  if(unanswered > 1500)
    {
      /*
       * nothing of 1500 bytes or more has been answered yet; the media
       * is most likely plain ethernet, so try that first
       */
      row = L2_1500->idx;
    }
  else if(unanswered > 1454)
    {
      /* try the low end of the X-over-ethernet sizes */
      row = L2_1454->idx;
    }
  else
    {
      /* find the pair of neighbouring rows that bracket the size */
      row = 0;
      while(row < L2_cnt-1 &&
	    !(unanswered > L2[row].mtu && unanswered <= L2[row+1].mtu))
	{
	  row++;
	}
    }

  search = malloc(sizeof(pmtud_L2_state_t));
  if(search == NULL)
    {
      return -1;
    }

  /* nothing has been answered yet, so only the upper bound is known */
  search->idx   = row;
  search->lower = -1;
  search->upper = unanswered;
  search->in    = unanswered;
  search->out   = -1;
  search->hop   = NULL;

  state->L2           = search;
  state->attempt      = 0;
  state->payload_size = L2[row].mtu - state->header_size;

  return 0;
}

/*
 * pmtud_TTL_hop
 *
 * swap the hop record cached in the TTL search state for the one
 * supplied.  any hop record previously cached is freed first.  the caller
 * must have a TTL search in progress (state->TTL is dereferenced).
 */
static void pmtud_TTL_hop(trace_state_t *state, scamper_trace_hop_t *hop)
{
  pmtud_TTL_state_t *search = state->TTL;

  if(search->hop != NULL)
    scamper_trace_hop_free(search->hop);

  search->hop = hop;
}

/*
 * pmtud_TTL_set_probettl
 *
 * choose the next TTL to probe in the search window (lower, upper).
 *
 * the window is repeatedly halved.  starting at the midpoint, TTLs for
 * which the trace holds no hop record are passed over; if a TTL below
 * upper with a hop record is found, it becomes the next probe TTL, the
 * window is saved in the TTL search state, and the attempt counter is
 * reset.  if none is found, the upper half is discarded and the search
 * repeats on the lower half.
 *
 * return: 0 if there are no more TTLs to probe, 1 if probing should continue
 */
static int pmtud_TTL_set_probettl(scamper_task_t *task,
				  const int lower, int upper)
{
  trace_state_t *state = task->state;
  scamper_trace_t *trace = task->data;
  int mid, ttl;

  /* each pass that finds nothing shrinks the window to its lower half */
  for(; lower + 1 < upper; upper = mid)
    {
      mid = (lower + upper) / 2;

      /* skip TTLs that did not respond during the trace */
      for(ttl = mid; ttl < upper; ttl++)
	{
	  if(trace->hops[ttl-1] != NULL)
	    break;
	}

      /* nothing usable at or above the midpoint; look lower */
      if(ttl == upper)
	continue;

      state->TTL->lower = lower;
      state->TTL->upper = upper;
      state->ttl        = ttl;
      state->attempt    = 0;
      return 1;
    }

  return 0;
}

/*
 * hop_find
 *
 * search every hop record held in the trace for one whose address
 * compares equal to addr.  returns the first match found, walking the
 * hops in ascending index order and each hop's list from its head, or
 * NULL if no record has that address.
 */
static scamper_trace_hop_t *hop_find(const scamper_trace_t *trace,
				     const scamper_addr_t *addr)
{
  scamper_trace_hop_t *cand;
  uint16_t slot;

  for(slot = 0; slot < trace->hop_count; slot++)
    {
      cand = trace->hops[slot];
      while(cand != NULL)
	{
	  if(scamper_addr_cmp(cand->hop_addr, addr) == 0)
	    return cand;
	  cand = cand->hop_next;
	}
    }

  return NULL;
}

/*
 * pmtud_TTL_init
 *
 * allocate the TTL search state and work out the initial bounds of the
 * search, then ask pmtud_TTL_set_probettl for the first TTL to probe.
 *
 * an L2 search must be in progress with a cached hop: both state->L2 and
 * state->L2->hop are dereferenced.
 *
 * returns -1 if the search state could not be allocated, 0 if there are
 * no TTLs to probe, and 1 if a TTL to probe was selected.
 */
static int pmtud_TTL_init(scamper_task_t *task)
{
  trace_state_t *state = task->state;
  scamper_trace_t *trace = task->data;
  scamper_trace_hop_t *known;
  uint8_t turn_ttl;
  int lower = 0;
  int upper;

  state->TTL = malloc_zero(sizeof(pmtud_TTL_state_t));
  if(state->TTL == NULL)
    {
      return -1;
    }

  /* the TTL search is done with the probe size that was silently dropped */
  state->payload_size = state->L2->in - state->header_size;

  /*
   * the lower bound comes from the last fragmentation required message
   * recorded during PMTUD, if there is one; otherwise it stays at zero
   */
  if(state->last_fragmsg != NULL)
    {
      SCAMPER_TRACE_HOP_GET_TURN_TTL(state->last_fragmsg, turn_ttl);
      lower = state->last_fragmsg->hop_probe_ttl - turn_ttl;
      if(lower < 1)
	{
	  lower = 0;
	}
    }

  /*
   * the upper bound is the closest response past the silent hop: the TTL
   * at which the trace saw the address of the cached L2 hop, or failing
   * that a value derived from the cached hop itself
   */
  known = hop_find(trace, state->L2->hop->hop_addr);
  if(known != NULL)
    {
      upper = known->hop_probe_ttl;
    }
  else
    {
      SCAMPER_TRACE_HOP_GET_TURN_TTL(state->L2->hop, turn_ttl);
      upper = state->L2->hop->hop_probe_ttl - turn_ttl + 1;
    }

  /* report whether there is anything to probe */
  return pmtud_TTL_set_probettl(task, lower, upper) != 0;
}

/*
 * pmtud_hopins
 *
 * append the hop record to the end of the trace's list of PMTUD hops.
 *
 * when the list is empty the hop simply becomes the head and its hop_next
 * member is left as the caller supplied it; otherwise the hop is linked
 * after the current tail and its hop_next is set to NULL.
 */
static void pmtud_hopins(scamper_trace_t *trace, scamper_trace_hop_t *hop)
{
  scamper_trace_hop_t *tail;

  assert(hop != NULL);

  tail = trace->pmtud->hops;
  if(tail == NULL)
    {
      trace->pmtud->hops = hop;
      return;
    }

  /* walk to the last record currently in the list */
  while(tail->hop_next != NULL)
    {
      tail = tail->hop_next;
    }

  tail->hop_next = hop;
  hop->hop_next  = NULL;
}

/*
 * pmtud_L2_search_end
 *
 * scamper has had to infer the underlying next-hop MTU due to a pmtud
 * fault.  record what was learned in the trace's pmtud structure, release
 * the TTL and L2 search state, and either finish the task or go back to
 * ordinary PMTUD probing using the inferred MTU.
 *
 * returns 1 if the task was marked done, because the hop that answered the
 * largest probe was not a packet too big message; returns 0 if probing
 * continues in MODE_PMTUD_DEFAULT.
 */
static int pmtud_L2_search_end(scamper_task_t *task)
{
  trace_state_t *state = task->state;
  scamper_trace_t *trace = task->data;
  scamper_trace_hop_t *l2_hop;
  uint16_t nhmtu;

  assert(state->L2->out >= 0);
  assert(state->L2->out <= 65535);

  nhmtu = state->L2->out;

  /* deal with the outcome of the TTL search, if one was done */
  if(state->TTL != NULL)
    {
      if(state->TTL->hop != NULL)
	{
	  /* keep the TTL-expired message obtained with the large packet */
	  pmtud_hopins(trace, state->TTL->hop);
	}
      else if(state->TTL->lower == 0)
	{
	  /*
	   * no TTL response at all and the lowest TTL tried was zero: infer
	   * that the host itself has an MTU mismatch with the router it is
	   * using for the destination, and record the inferred MTU
	   */
	  scamper_tlv_set(&trace->pmtud->tlvs,
			  SCAMPER_TRACE_PMTUD_TLV_OUTMTU, 2, &nhmtu);
	}

      free(state->TTL);
      state->TTL = NULL;
    }

  /*
   * move the hop that terminated the largest probe into the pmtud hop
   * list, and remember it as the most recent fragmentation message
   */
  l2_hop = state->L2->hop;
  if(l2_hop != NULL)
    {
      pmtud_hopins(trace, l2_hop);
      state->last_fragmsg = l2_hop;
    }

  /* the L2 search is over, whatever happens next */
  free(state->L2);
  state->L2 = NULL;

  /*
   * a response other than packet too big would ordinarily have ended
   * PMTU discovery, so end it now
   */
  if(l2_hop != NULL && !SCAMPER_TRACE_HOP_IS_ICMP_PACKET_TOO_BIG(l2_hop))
    {
      trace->pmtud->pmtu = l2_hop->hop_probe_size;
      scamper_queue_done(task->queue, scamper_holdtime_get()*1000);
      return 1;
    }

  /* resume ordinary PMTUD probing with the inferred MTU */
  state->payload_size = nhmtu - state->header_size;
  state->mode         = MODE_PMTUD_DEFAULT;
  state->attempt      = 0;
  state->ttl          = 255;

  return 0;
}

/*
 * trace_ipid_fudge
 *
 * map the IP ID found in a quoted packet back to the id of the probe that
 * was sent, allowing for ways the IP ID may have been altered in transit.
 * this code was inspired by information from David Malone.
 *
 * the IPID transmitted is assigned from a counter (state->id_next) which
 * starts from one -- *not* zero.  this is so systems that zero the IPID
 * will not confuse this algorithm.
 *
 * the IPID is transmitted by scamper in network byte order.
 *
 * the checks are tried in this order: the value as received, the value
 * being exactly one greater than the counter, and the value with its two
 * bytes swapped.
 *
 * returns 0 with the probe's id in *id if a match was made, -1 otherwise
 * (including when the IP ID is zero).
 */
static int trace_ipid_fudge(const trace_state_t *state,
			    const uint16_t ipid, uint16_t *id)
{
  int rc = 0;

  if(ipid == 0)
    {
      /* a zeroed IP ID can not be matched to a probe */
      rc = -1;
    }
  else if(ipid <= state->id_next)
    {
      /* within the range of ids handed out so far */
      *id = ipid - 1;
    }
  else if(ipid == state->id_next + 1)
    {
      /* the IP ID appears to have been incremented */
      scamper_debug(__func__, "ip id one greater than sent");
      *id = ipid - 2;
    }
  else if(byteswap16(ipid) <= state->id_next)
    {
      /* the IP ID appears to have been byte swapped. XXX: is this correct? */
      scamper_debug(__func__, "ip id byte swapped");
      *id = byteswap16(ipid) - 1;
    }
  else
    {
      rc = -1;
    }

  return rc;
}

/*
 * trace_stop
 *
 * record why the trace stopped.  only the first reason recorded is kept:
 * if a stop reason has already been set, the new one is logged and then
 * discarded.
 */
static void trace_stop(scamper_trace_t *trace,
		       const uint8_t reason, const uint8_t data)
{
  if(trace->stop_reason == SCAMPER_TRACE_STOP_NONE)
    {
      trace->stop_reason = reason;
      trace->stop_data   = data;
      return;
    }

  /* an earlier stop reason takes precedence; do not clobber it */
  scamper_debug(__func__, "reason %d/%d preceeds %d/%d",
		trace->stop_reason, trace->stop_data, reason, data);
}

/* record that the trace stopped because it completed */
static void trace_stop_completed(scamper_trace_t *trace)
{
  const uint8_t no_data = 0;

  trace_stop(trace, SCAMPER_TRACE_STOP_COMPLETED, no_data);
}

/* record that the trace stopped because the gap limit was reached */
static void trace_stop_gaplimit(scamper_trace_t *trace)
{
  const uint8_t no_data = 0;

  trace_stop(trace, SCAMPER_TRACE_STOP_GAPLIMIT, no_data);
}

/*
 * record that the trace stopped because of an error.  the error code is
 * stored as the stop data, which is a single byte; larger values are
 * truncated by the conversion.
 */
static void trace_stop_error(scamper_trace_t *trace, int error)
{
  const uint8_t data = (uint8_t)error;

  trace_stop(trace, SCAMPER_TRACE_STOP_ERROR, data);
}

/* record that the trace stopped because the hop limit was reached */
static void trace_stop_hoplimit(scamper_trace_t *trace)
{
  const uint8_t no_data = 0;

  trace_stop(trace, SCAMPER_TRACE_STOP_HOPLIMIT, no_data);
}

/*
 * trace_isloop
 *
 * given a trace and a hop record that is already linked into the trace at
 * its probe TTL, decide whether the hop's address means the path has
 * looped enough times for the loop condition to be met.
 *
 * state->iloopc and state->loopc are updated as loops are seen.
 *
 * returns 1 if the loop count has reached trace->loops, 0 otherwise.
 */
static int trace_isloop(const scamper_trace_t *trace,
			const scamper_trace_hop_t *hop,
			trace_state_t *state)
{
  scamper_trace_hop_t *other;
  int slot;

  /* hops at or before the first hop have nothing earlier to loop with */
  if(hop->hop_probe_ttl <= trace->firsthop)
    return 0;

  /*
   * if an earlier record at this same TTL carries the same address, the
   * address has been checked for loops before and need not be checked
   * again.  the walk stops when it reaches the hop itself.
   */
  other = trace->hops[hop->hop_probe_ttl-1];
  while(other != hop)
    {
      if(scamper_addr_cmp(hop->hop_addr, other->hop_addr) == 0)
	return 0;
      other = other->hop_next;
    }

  /* work backwards from the previous TTL to the first hop probed */
  for(slot = hop->hop_probe_ttl-2; slot >= trace->firsthop-1; slot--)
    {
      for(other = trace->hops[slot]; other != NULL; other = other->hop_next)
	{
	  assert(slot+1 == other->hop_probe_ttl);

	  /* different address: no loop with this record */
	  if(scamper_addr_cmp(hop->hop_addr, other->hop_addr) != 0)
	    continue;

	  /*
	   * a loop between adjacent hops is ignored while the count of
	   * such loops does not exceed the trace's loopaction parameter
	   */
	  if(other->hop_probe_ttl + 1 == hop->hop_probe_ttl &&
	     ++state->iloopc <= trace->loopaction)
	    return 0;

	  /* count the loop, and check if the loop condition is met */
	  if(++state->loopc >= trace->loops)
	    return 1;

	  /* count the loop just once for this earlier TTL */
	  break;
	}
    }

  return 0;
}

/*
 * trace_hopins
 *
 * insert the hop record into the list headed by *hops, keeping the list
 * ordered by ascending hop_probe_id.  a new record is placed after any
 * existing records with the same probe id.
 *
 * always returns 0.
 */
static int trace_hopins(scamper_trace_hop_t **hops, scamper_trace_hop_t *hop)
{
  scamper_trace_hop_t **link = hops;

  /* advance to the first link whose record has a larger probe id */
  while(*link != NULL && (*link)->hop_probe_id <= hop->hop_probe_id)
    {
      link = &(*link)->hop_next;
    }

  /* splice the new record in at that link */
  hop->hop_next = *link;
  *link = hop;

  return 0;
}

/*
 * trace_handleerror
 *
 * the code encountered some error when doing the traceroute: record the
 * error as the trace's stop reason and mark the task done on its queue.
 *
 * always returns 0.
 */
static int trace_handleerror(scamper_task_t *task, const int error)
{
  scamper_trace_t *trace = (scamper_trace_t *)task->data;

  trace_stop_error(trace, error);
  scamper_queue_done(task->queue, scamper_holdtime_get()*1000);

  return 0;
}

/*
 * trace_hop
 *
 * allocate a hop record and fill in the details common to every kind of
 * response: the responder's address (taken from the address cache), and
 * the TTL, id and size of the probe that solicited it.  the probe id is
 * stored in the hop record as probe->id plus one.
 *
 * af must be AF_INET or AF_INET6, and addr must point to an address of
 * that family.
 *
 * returns the new hop record, or NULL if the address family is not
 * supported or an allocation failed.
 */
static scamper_trace_hop_t *trace_hop(const trace_probe_t *probe,
				      const int af, const void *addr)
{
  scamper_trace_hop_t *rec;
  int addr_type;

  /* translate the address family into scamper's address type */
  switch(af)
    {
    case AF_INET:
      addr_type = SCAMPER_ADDR_TYPE_IPV4;
      break;

    case AF_INET6:
      addr_type = SCAMPER_ADDR_TYPE_IPV6;
      break;

    default:
      return NULL;
    }

  rec = scamper_trace_hop_alloc();
  if(rec == NULL)
    return NULL;

  rec->hop_addr = scamper_addrcache_get(addrcache, addr_type, addr);
  if(rec->hop_addr == NULL)
    {
      scamper_trace_hop_free(rec);
      return NULL;
    }

  rec->hop_probe_ttl  = probe->ttl;
  rec->hop_probe_id   = probe->id + 1;
  rec->hop_probe_size = probe->size;

  /* pass on the fact that a datalink tx timestamp was recorded */
  if((probe->flags & TRACE_PROBE_FLAG_DL_TX) != 0)
    rec->hop_flags |= SCAMPER_TRACE_HOP_FLAG_TS_DL_TX;

  return rec;
}

/*
 * trace_icmp_hop
 *
 * given a trace probe and an ICMP response, allocate and initialise a
 * scamper_trace_hop record.
 *
 * the hop is filled out with the reply size, the ICMP type and code, the
 * reply TTL (when it is available), the RTT, and a set of TLVs and ICMP
 * extensions taken from the response.  on success the probe's rx counter
 * is incremented (saturating at 65535) and the hop is returned; it is up
 * to the caller to link the hop into a list.
 *
 * returns NULL if the source address of the response cannot be obtained
 * or if something could not be added to the hop; the partially built hop
 * is freed in that case.
 *
 * the trace parameter is not used by this function.
 */
static scamper_trace_hop_t *trace_icmp_hop(scamper_trace_t *trace,
					   trace_probe_t *probe,
					   scamper_icmp_resp_t *ir)
{
  scamper_trace_hop_t *hop;
  scamper_addr_t addr;
  uint16_t dl;
  uint16_t cn, ct;
  int off;

  /* get a pointer to the source address of the ICMP response */
  if(scamper_icmp_resp_src(ir, &addr) != 0)
    {
      return NULL;
    }

  /* create a generic hop record without any special bits filled out */
  if((hop = trace_hop(probe, ir->ir_af, addr.addr)) == NULL)
    {
      return NULL;
    }

  /* fill out the basic bits of the hop structure */
  hop->hop_reply_size = ir->ir_ip_size;
  hop->hop_icmp_type  = ir->ir_icmp_type;
  hop->hop_icmp_code  = ir->ir_icmp_code;

  /*
   * we cannot depend on the TTL field of the IP packet being made available,
   * so we signal explicitly when the reply ttl is valid
   */
  if(ir->ir_ip_ttl != -1)
    {
      hop->hop_reply_ttl = ir->ir_ip_ttl;
      hop->hop_flags |= SCAMPER_TRACE_HOP_FLAG_REPLY_TTL;
    }

  /*
   * if the probe's datalink rx timestamp flag is set, scamper has a rx
   * timestamp recorded.  otherwise the rx time recorded with the ICMP
   * response is used.
   */
  if(probe->flags & TRACE_PROBE_FLAG_DL_RX)
    {
      hop->hop_flags |= SCAMPER_TRACE_HOP_FLAG_TS_DL_RX;
      timeval_rtt(&hop->hop_rtt, &probe->tx_tv, &probe->rx_tv);
    }
  else
    {
      timeval_rtt(&hop->hop_rtt, &probe->tx_tv, &ir->ir_rx);
      if(ir->ir_flags & SCAMPER_ICMP_RESP_FLAG_KERNRX)
	{
	  hop->hop_flags |= SCAMPER_TRACE_HOP_FLAG_TS_SOCK_RX;
	}
    }

  /* record the next-hop MTU supplied with a packet too big message */
  if(SCAMPER_ICMP_RESP_IS_PACKET_TOO_BIG(ir))
    {
      if(scamper_tlv_set(&hop->hop_tlvs, SCAMPER_TRACE_HOP_TLV_NHMTU,
			 2, &ir->ir_icmp_nhmtu) == NULL)
	{
	  goto err;
	}
    }

  /* the IP-ID and ToS of the reply are only recorded for IPv4 */
  if(ir->ir_af == AF_INET)
    {
      if(scamper_tlv_set(&hop->hop_tlvs, SCAMPER_TRACE_HOP_TLV_REPLY_IPID,
			 2, &ir->ir_ip_id) == NULL ||
	 scamper_tlv_set(&hop->hop_tlvs, SCAMPER_TRACE_HOP_TLV_REPLY_IPTOS,
			 1, &ir->ir_ip_tos) == NULL)
	{
	  goto err;
	}
    }

  if(SCAMPER_ICMP_RESP_INNER_IS_SET(ir))
    {
      /*
       * IPv4: record ToS byte
       * IPv6: might pay to record traffic class byte here.
       */
      if(ir->ir_af == AF_INET &&
	 scamper_tlv_set(&hop->hop_tlvs, SCAMPER_TRACE_HOP_TLV_INNER_IPTOS,
			 1, &ir->ir_inner_ip_tos) == NULL)
	{
	  goto err;
	}

      /* the quoted TTL is only recorded when it is something other than 1 */
      if(ir->ir_inner_ip_ttl != 1)
	{
	  if(scamper_tlv_set(&hop->hop_tlvs, SCAMPER_TRACE_HOP_TLV_INNER_IPTTL,
			     1, &ir->ir_inner_ip_ttl) == NULL)
	    {
	      goto err;
	    }
	}

      /* the quoted size is only recorded when it differs from the probe */
      if(ir->ir_inner_ip_size != probe->size)
	{
	  if(scamper_tlv_set(&hop->hop_tlvs, SCAMPER_TRACE_HOP_TLV_INNER_IPLEN,
			     2, &ir->ir_inner_ip_size) == NULL)
	    {
	      goto err;
	    }
	}
    }

  /*
   * if ICMP extensions are included, then parse and include them.
   * note that the icmp extension header is included in ir_ext, 
   * but the hop structure only includes the extensions themselves.
   */
  if(ir->ir_ext != NULL)
    {
      /* start at offset 4 so the extension header is skipped */
      for(off = 4; off + 4 < ir->ir_extlen; off += dl)
	{
	  /* extract the length field */
	  memcpy(&dl, ir->ir_ext+off, 2);
	  dl = ntohs(dl);

	  /*
	   * the length covers the object's own 4 byte header.  a value
	   * smaller than that is malformed, and a value of zero would
	   * stop the loop from advancing, so give up on the extensions.
	   */
	  if(dl < 4)
	    break;

	  /* make sure there is enough in the packet left */
	  if(off + dl > ir->ir_extlen)
	    break;

	  cn = ir->ir_ext[off+2];
	  ct = ir->ir_ext[off+3];

	  /* skip objects that carry less than four bytes of data */
	  if(dl < 8)
	    {
	      scamper_debug(__func__, "icmp ext %d/%d/%d ignored", cn, ct, dl);
	      continue;
	    }

	  if(scamper_trace_hop_icmpext_add(hop, cn, ct, dl-4,
					   ir->ir_ext+off+4) != 0)
	    {
	      printerror(errno, strerror, __func__,
			 "could not add icmp ext %d/%d/%d", cn, ct, dl);
	      goto err;
	    }
	}
    }

  /* record the fact that we have a hop record thanks to this probe */
  if(probe->rx != 65535) probe->rx++;

  return hop;

 err:
  scamper_trace_hop_free(hop);
  return NULL;
}

/*
 * trace_tcp_hop
 *
 * given a trace probe and a datalink record holding a TCP response,
 * allocate and initialise a scamper_trace_hop record.  the RTT is
 * computed from the probe's tx timestamp and the datalink timestamp.
 * returns NULL if the hop could not be built.
 */
static scamper_trace_hop_t *trace_tcp_hop(trace_probe_t *probe,
					  scamper_dl_rec_t *dl)
{
  scamper_trace_hop_t *hop = trace_hop(probe, dl->dl_af, dl->dl_ip_src);

  /* start from a generic hop record */
  if(hop == NULL)
    return NULL;

  /* details taken directly from the datalink record */
  hop->hop_reply_size = dl->dl_ip_size;
  hop->hop_reply_ttl  = dl->dl_ip_ttl;
  hop->hop_tcp_flags  = dl->dl_tcp_flags;
  timeval_rtt(&hop->hop_rtt, &probe->tx_tv, &dl->dl_tv);

  /* flags that always apply to a hop built from a TCP response */
  hop->hop_flags |= SCAMPER_TRACE_HOP_FLAG_REPLY_TTL;
  hop->hop_flags |= SCAMPER_TRACE_HOP_FLAG_TCP;
  hop->hop_flags |= SCAMPER_TRACE_HOP_FLAG_TS_DL_RX;

  /* the IP-ID and ToS TLVs are only recorded for IPv4 */
  if(dl->dl_af != AF_INET)
    return hop;

  if(scamper_tlv_set(&hop->hop_tlvs, SCAMPER_TRACE_HOP_TLV_REPLY_IPID,
                     2, &dl->dl_ip_id) == NULL)
    {
      scamper_trace_hop_free(hop);
      return NULL;
    }

  if(scamper_tlv_set(&hop->hop_tlvs, SCAMPER_TRACE_HOP_TLV_REPLY_IPTOS,
                     1, &dl->dl_ip_tos) == NULL)
    {
      scamper_trace_hop_free(hop);
      return NULL;
    }

  return hop;
}

/*
 * trace_next_mode
 *
 * the traceroute portion has stopped.  decide whether the task should
 * move into PMTUD probing, and if so set up the state for it; otherwise
 * signal the queue that the task is done.
 */
static void trace_next_mode(scamper_task_t *task)
{
  scamper_trace_t *trace = task->data;
  trace_state_t *state = task->state;
  uint16_t ifmtu;
  int ifindex;

  /* PMTUD has to have been asked for */
  if((trace->flags & SCAMPER_TRACE_FLAG_PMTUD) == 0)
    goto done;

  /* these stop reasons do not lead into PMTUD */
  if(trace->stop_reason == SCAMPER_TRACE_STOP_HOPLIMIT ||
     trace->stop_reason == SCAMPER_TRACE_STOP_GAPLIMIT ||
     trace->stop_reason == SCAMPER_TRACE_STOP_LOOP ||
     trace->stop_reason == SCAMPER_TRACE_STOP_NONE)
    goto done;

  /* if the interface's MTU is useless, then we can't do PMTUD */
  scamper_fd_ifindex(state->dl, &ifindex);
  if(if_getmtu(ifindex, &ifmtu) == -1)
    goto done;
  if(ifmtu <= state->header_size)
    goto done;

  /* somewhere to record the PMTUD results */
  trace->pmtud = malloc_zero(sizeof(struct scamper_trace_pmtud));
  if(trace->pmtud == NULL)
    goto done;
  trace->pmtud->ifmtu = ifmtu;

  /* the first PMTUD probe is sized to the interface MTU, with ttl 255 */
  state->attempt      = 0;
  state->mode         = MODE_PMTUD_DEFAULT;
  state->payload_size = ifmtu - state->header_size;
  state->ttl          = 255;

  scamper_queue_probe(task->queue);
  return;

 done:
  scamper_queue_done(task->queue, scamper_holdtime_get()*1000);
  return;
}

/*
 * trace_stop_reason
 *
 * examine a hop record and work out if it gives the trace a reason to
 * stop.  the reason and any accompanying data are written to stop_reason
 * and stop_data; SCAMPER_TRACE_STOP_NONE / 0 are written when the hop
 * does not give a reason to stop.
 */
static void trace_stop_reason(scamper_trace_t *trace, scamper_trace_hop_t *hop,
			      trace_state_t *state,
			      uint8_t *stop_reason, uint8_t *stop_data)
{
  uint8_t reason = SCAMPER_TRACE_STOP_NONE;
  uint8_t data   = 0;

  if(SCAMPER_TRACE_HOP_IS_ICMP_UNREACH_PORT(hop))
    {
      reason = SCAMPER_TRACE_STOP_COMPLETED;
    }
  else if(SCAMPER_TRACE_HOP_IS_ICMP_UNREACH(hop))
    {
      /* any other unreachable: keep the code that came with it */
      reason = SCAMPER_TRACE_STOP_UNREACH;
      data   = hop->hop_icmp_code;
    }
  else if(SCAMPER_TRACE_HOP_IS_ICMP_ECHO_REPLY(hop))
    {
      reason = SCAMPER_TRACE_STOP_COMPLETED;
    }
  else if(trace->dst->type == SCAMPER_ADDR_TYPE_IPV6 &&
          hop->hop_icmp_type == ICMP6_PACKET_TOO_BIG)
    {
      reason = SCAMPER_TRACE_STOP_ICMP;
      data   = hop->hop_icmp_type;
    }
  else if(trace->loops != 0 && trace_isloop(trace, hop, state) != 0)
    {
      reason = SCAMPER_TRACE_STOP_LOOP;
    }
  else if(scamper_addr_cmp(trace->dst, hop->hop_addr) == 0)
    {
      /*
       * the response came from the destination address.  in IPv6,
       * anonymous interfaces on the path can send ICMP errors that
       * 'spoof' the destination we probed as their source address, so
       * for IPv6 only a destination unreachable from the destination
       * address is taken to mean the trace completed.
       */
      if(trace->dst->type != SCAMPER_ADDR_TYPE_IPV6 ||
         hop->hop_icmp_type == ICMP6_DST_UNREACH)
        {
          reason = SCAMPER_TRACE_STOP_COMPLETED;
        }
    }

  *stop_reason = reason;
  *stop_data   = data;

  return;
}

/*
 * handleicmp_trace
 *
 * an ICMP response arrived while the trace is in the traceroute state.
 * record the hop, advance the trace if the response is for the hop
 * currently being probed, and then check if the trace should stop.
 *
 * returns -1 if a hop record could not be created, zero otherwise.
 */
static int handleicmp_trace(scamper_task_t *task,
			    scamper_icmp_resp_t *ir,
			    trace_probe_t *probe)
{
  scamper_trace_t *trace = task->data;
  trace_state_t *state = task->state;
  scamper_trace_hop_t *hop;
  uint8_t reason;
  uint8_t data;
  int current;

  /* only probes sent while in the trace state are of interest */
  if(probe->mode != MODE_TRACE)
    return 0;

  /* build the hop record and put it in the list for the probe's ttl */
  hop = trace_icmp_hop(trace, probe, ir);
  if(hop == NULL)
    return -1;
  trace_hopins(&trace->hops[hop->hop_probe_ttl-1], hop);

  /* is this a response for the hop that is currently being worked on? */
  current = (hop->hop_probe_ttl - 1 == trace->hop_count);
  if(current)
    {
      if((trace->flags & SCAMPER_TRACE_FLAG_ALLATTEMPTS) != 0)
        {
          /* not the most recent probe: keep waiting for that one */
          if(probe->id+1 != state->attempt)
            return 0;

          /* more attempts are owed to this hop, so send the next */
          if(state->attempt < trace->attempts)
            {
              scamper_queue_probe(task->queue);
              return 0;
            }
        }

      /* finished with this hop; move on to the next ttl */
      trace->hop_count++;
      state->attempt = 0;
      state->ttl++;
    }

  /* does the hop record supply a reason to stop? */
  trace_stop_reason(trace, hop, state, &reason, &data);
  if(reason != SCAMPER_TRACE_STOP_NONE)
    {
      trace_stop(trace, reason, data);
      trace_next_mode(task);
      return 0;
    }

  /* has the hop limit been reached? */
  if(trace->hop_count == 255 || trace->hop_count == trace->hoplimit)
    {
      trace_stop_hoplimit(trace);
      trace_next_mode(task);
      return 0;
    }

  /* keep probing if the trace advanced */
  if(current)
    scamper_queue_probe(task->queue);

  return 0;
}

/*
 * handleicmp_lastditch
 *
 * an ICMP response arrived while the trace is checking if the end-host
 * is responsive.  a response to a probe sent in the trace state is just
 * recorded; a response to a last-ditch probe is recorded and finishes
 * the task.  responses to probes sent in any other mode are ignored.
 *
 * returns -1 if a hop record could not be created, zero otherwise.
 */
static int handleicmp_lastditch(scamper_task_t *task,
				scamper_icmp_resp_t *ir,
				trace_probe_t *probe)
{
  scamper_trace_t *trace = task->data;
  scamper_trace_hop_t *hop;

  if(probe->mode != MODE_TRACE && probe->mode != MODE_LASTDITCH)
    return 0;

  hop = trace_icmp_hop(trace, probe, ir);
  if(hop == NULL)
    return -1;

  /* a late response to a trace probe goes in with the other hops */
  if(probe->mode == MODE_TRACE)
    {
      trace_hopins(&trace->hops[hop->hop_probe_ttl-1], hop);
      return 0;
    }

  /* a response to a last-ditch probe ends the task */
  trace_hopins(&trace->lastditch, hop);
  trace_stop_gaplimit(trace);
  scamper_queue_done(task->queue, scamper_holdtime_get()*1000);

  return 0;
}

/*
 * handleicmp_pmtud_default
 *
 * an ICMP response arrived while the trace is in the default PMTUD state.
 * only a response to a PMTUD probe of the size currently being probed is
 * acted on.
 *
 * returns -1 if a hop record could not be created, zero otherwise.
 */
static int handleicmp_pmtud_default(scamper_task_t *task,
				    scamper_icmp_resp_t *ir,
				    trace_probe_t *probe)
{
  scamper_trace_t *trace = task->data;
  trace_state_t   *state = task->state;
  scamper_trace_hop_t *hop;

  /* ignore responses that do not fit with the current probing details */
  if(probe->mode != MODE_PMTUD_DEFAULT ||
     probe->size != state->header_size + state->payload_size)
    return 0;

  hop = trace_icmp_hop(trace, probe, ir);
  if(hop == NULL)
    return -1;
  pmtud_hopins(trace, hop);
  state->last_fragmsg = hop;

  if(SCAMPER_ICMP_RESP_IS_PACKET_TOO_BIG(ir) == 0)
    {
      /*
       * one of these responses to the probe means the current probe
       * size is recorded as the path MTU and the task is done
       */
      if(SCAMPER_ICMP_RESP_IS_TTL_EXP(ir) ||
         SCAMPER_ICMP_RESP_IS_UNREACH(ir) ||
         SCAMPER_ICMP_RESP_IS_ECHO_REPLY(ir))
        {
          trace->pmtud->pmtu = probe->size;
          scamper_queue_done(task->queue, scamper_holdtime_get()*1000);
        }
      return 0;
    }

  /*
   * the packet too big message either did not suggest a next-hop MTU,
   * or suggested one that is no smaller than the probe that triggered
   * it.  switch to the bad-suggestion state to search for a size.
   */
  if(ir->ir_icmp_nhmtu == 0 || ir->ir_icmp_nhmtu >= probe->size)
    {
      state->mode = MODE_PMTUD_BADSUGG;
      pmtud_L2_init(state);
      scamper_queue_probe(task->queue);
      return 0;
    }

  /* the suggested MTU is too small to be probed, so stop here */
  if(ir->ir_icmp_nhmtu < state->header_size)
    {
      scamper_queue_done(task->queue, scamper_holdtime_get()*1000);
      return 0;
    }

  /* probe again using the suggested MTU */
  state->attempt = 0;
  state->payload_size = ir->ir_icmp_nhmtu - state->header_size;
  scamper_queue_probe(task->queue);

  return 0;
}

/*
 * handleicmp_pmtud_silent_L2
 *
 * an ICMP response arrived while searching for the largest packet size
 * that gets a response (the L2 search).  a response raises the lower
 * bound of the search; when the bounds meet, the TTL-limited search is
 * started if pmtud_TTL_init allows it.
 *
 * returns -1 if a hop record could not be created, zero otherwise.
 */
static int handleicmp_pmtud_silent_L2(scamper_task_t *task,
				      scamper_icmp_resp_t *ir,
				      trace_probe_t *probe)
{
  scamper_trace_t *trace = task->data;
  trace_state_t *state = task->state;
  scamper_trace_hop_t *hop;

  assert(state->L2 != NULL);

  /*
   * a response for a size outside the bounds being searched could be a
   * delayed message.  at the moment it is just ignored.
   */
  if(probe->size < state->L2->lower || state->L2->upper <= probe->size)
    {
      scamper_debug(__func__, "L2 search %d < %d || %d <= %d",
                    probe->size, state->L2->lower,
                    state->L2->upper, probe->size);
      return 0;
    }

  /* record the hop details */
  hop = trace_icmp_hop(trace, probe, ir);
  if(hop == NULL)
    return -1;
  pmtud_L2_hop(state, hop);

  if(probe->size + 1 == state->L2->upper)
    {
      /* no space left to search: the probe size is the answer */
      state->L2->out   = probe->size;
      state->L2->lower = state->L2->out;

      if(pmtud_TTL_init(task) != 1)
        {
          scamper_queue_done(task->queue, scamper_holdtime_get()*1000);
          return 0;
        }
      state->mode = MODE_PMTUD_SILENT_TTL;
    }
  else
    {
      /* this size got a response, so raise the lower bound to it */
      pmtud_L2_set_probesize(state, probe->size, state->L2->upper);
    }

  scamper_queue_probe(task->queue);
  return 0;
}

/*
 * handleicmp_pmtud_silent_TTL
 *
 * an ICMP response arrived during the TTL-limited part of the PMTUD
 * search.  a TTL expired message narrows the TTL search; a packet too
 * big message suggesting the MTU already inferred puts the trace back
 * into the default PMTUD state.  any other response just causes the
 * trace to be put back on the probe queue.
 *
 * returns -1 if a hop record could not be created, zero otherwise.
 */
static int handleicmp_pmtud_silent_TTL(scamper_task_t *task,
				       scamper_icmp_resp_t *ir,
				       trace_probe_t *probe)
{
  scamper_trace_t *trace = task->data;
  trace_state_t *state = task->state;
  scamper_trace_hop_t *hop;

  if(SCAMPER_ICMP_RESP_IS_TTL_EXP(ir))
    {
      /* record the hop details */
      hop = trace_icmp_hop(trace, probe, ir);
      if(hop == NULL)
        return -1;
      pmtud_TTL_hop(state, hop);

      /*
       * when there is no more TTL space to search, the search is ended.
       * pmtud_L2_search_end returning 1 means there is nothing more to
       * queue from here.
       */
      if(pmtud_TTL_set_probettl(task, probe->ttl, state->TTL->upper) == 0 &&
         pmtud_L2_search_end(task) == 1)
        return 0;
    }
  else if(SCAMPER_ICMP_RESP_IS_PACKET_TOO_BIG(ir) &&
          ir->ir_icmp_nhmtu == state->L2->out)
    {
      /*
       * a fragmentation required message for the MTU that was inferred:
       * record it and abandon the TTL-limited search
       */
      hop = trace_icmp_hop(trace, probe, ir);
      if(hop == NULL)
        return -1;
      pmtud_hopins(trace, hop);

      /* back to the default state, probing with the suggested MTU */
      state->attempt      = 0;
      state->payload_size = ir->ir_icmp_nhmtu - state->header_size;
      state->ttl          = 255;
      state->mode         = MODE_PMTUD_DEFAULT;

      /* the search state is finished with */
      free(state->L2);
      state->L2 = NULL;
      free(state->TTL);
      state->TTL = NULL;
    }

  /* put the trace back into the probe queue */
  scamper_queue_probe(task->queue);
  return 0;
}

/*
 * handleicmp_pmtud_badsugg
 *
 * the trace is in the badsugg state, which is used to infer a 'correct'
 * next-hop mtu size when the suggested packet size is no help.  the
 * search window is narrowed depending on whether the response came from
 * the same address as the last fragmentation message or not.
 *
 * returns -1 if the source address of the response or a hop record could
 * not be obtained, zero otherwise.
 */
static int handleicmp_pmtud_badsugg(scamper_task_t *task,
				    scamper_icmp_resp_t *ir,
				    trace_probe_t *probe)
{
  scamper_trace_t *trace = task->data;
  trace_state_t *state = task->state;
  scamper_trace_hop_t *hop;
  scamper_addr_t addr;
  int lower, upper;

  if(scamper_icmp_resp_src(ir, &addr) != 0)
    return -1;

  if(scamper_addr_cmp(state->last_fragmsg->hop_addr, &addr) != 0)
    {
      /*
       * the response came from somewhere other than the source of the
       * last fragmentation message: the probe size becomes the lower
       * bound of the window
       */
      lower = probe->size;
      upper = state->L2->upper;

      /* replace the layer-2 hop we get a response for with this hop */
      hop = trace_icmp_hop(trace, probe, ir);
      if(hop == NULL)
        return -1;
      pmtud_L2_hop(state, hop);
    }
  else
    {
      /* same source again: the probe size becomes the upper bound */
      lower = state->L2->lower;
      upper = probe->size;
    }

  if(lower + 1 == upper)
    {
      /* terminate the search now */
      state->L2->out   = lower;
      state->L2->lower = state->L2->out;
      state->L2->upper = upper;

      /* if the pmtud is completed, then move on */
      if(pmtud_L2_search_end(task) == 1)
        return 0;
    }
  else
    {
      pmtud_L2_set_probesize(state, lower, upper);
    }

  /* put the trace back into the probe queue */
  scamper_queue_probe(task->queue);

  return 0;
}

/*
 * do_trace_handle_icmp
 *
 * work out which probe an ICMP response corresponds to, and pass the
 * response and that probe to the handler for the mode the trace is
 * currently in.
 *
 * how the probe id is recovered depends on the probe method of the trace
 * (UDP, UDP-paris, ICMP echo, ICMP echo-paris, TCP).  responses that do
 * not fit with the probes sent by this trace, or which resolve to an id
 * that is not below state->id_next, are silently ignored.
 *
 * always returns zero; the return value of the mode handler is not
 * passed back to the caller.
 */
static int do_trace_handle_icmp(scamper_task_t *task, scamper_icmp_resp_t *ir)
{
  /* handlers for ICMP responses, indexed by the current mode of the trace */
  static int (*const func[])(scamper_task_t *, scamper_icmp_resp_t *,
			     trace_probe_t *) = {
    handleicmp_trace,            /* MODE_TRACE            == 0x00 */
    handleicmp_lastditch,        /* MODE_LASTDITCH        == 0x01 */
    handleicmp_pmtud_default,    /* MODE_PMTUD_DEFAULT    == 0x02 */
    handleicmp_pmtud_silent_L2,  /* MODE_PMTUD_SILENT_L2  == 0x03 */
    handleicmp_pmtud_silent_TTL, /* MODE_PMTUD_SILENT_TTL == 0x04 */
    handleicmp_pmtud_badsugg,    /* MODE_PMTUD_BADSUGG    == 0x05 */
    NULL,                        /* MODE_RTSOCK           == 0x06 */
  };

  scamper_trace_t   *trace = task->data;
  trace_state_t     *state = task->state;
  uint16_t           id;
  uint8_t            proto;

  assert(state->mode <= MODE_MAX);

  /*
   * if the trace is in a mode that does not handle ICMP responses, then
   * stop now
   */
  if(func[state->mode] == NULL)
    {
      return 0;
    }

  if(trace->type == SCAMPER_TRACE_TYPE_UDP ||
     trace->type == SCAMPER_TRACE_TYPE_UDP_PARIS)
    {
      /* if the ICMP type is not something that we care for, then drop it */
      if(SCAMPER_ICMP_RESP_IS_TTL_EXP(ir) == 0 &&
	 SCAMPER_ICMP_RESP_IS_UNREACH(ir) == 0 &&
	 SCAMPER_ICMP_RESP_IS_PACKET_TOO_BIG(ir) == 0)
	{
	  return 0;
	}

      /*
       * if the ICMP response does not reference a UDP probe sent from our
       * source port to a destination port we're likely to have probed, then
       * ignore the packet
       */
      if(SCAMPER_ICMP_RESP_INNER_IS_SET(ir) == 0 ||
	 ir->ir_inner_ip_proto  != IPPROTO_UDP ||
	 ir->ir_inner_udp_sport != trace->sport)
	{
	  return 0;
	}

      if(trace->type == SCAMPER_TRACE_TYPE_UDP)
	{
	  /*
	   * the probe id is the offset of the quoted destination port
	   * from the base destination port of the trace
	   */
	  if(ir->ir_inner_udp_dport <  trace->dport ||
	     ir->ir_inner_udp_dport >= trace->dport+state->id_next)
	    {
	      return 0;
	    }

	  /* XXX: handle wrap-around */
	  id = ir->ir_inner_udp_dport - trace->dport;
	}
      else if(trace->type == SCAMPER_TRACE_TYPE_UDP_PARIS)
	{
	  /* the destination port must be the one the trace uses */
	  if(ir->ir_inner_udp_dport != trace->dport)
	    {
	      return 0;
	    }

	  if(ir->ir_af == AF_INET)
	    {
	      /*
	       * when the quoted UDP checksum is non-zero and equals the
	       * quoted IP-ID, the id is taken from the checksum (less one);
	       * otherwise trace_ipid_fudge is asked to map the quoted
	       * IP-ID to a probe id.
	       */
	      if(ir->ir_inner_udp_sum == ir->ir_inner_ip_id &&
		 ir->ir_inner_udp_sum != 0)
		{
		  id = ntohs(ir->ir_inner_udp_sum) - 1;
		}
	      else if(trace_ipid_fudge(state, ir->ir_inner_ip_id, &id) != 0)
		{
		  return 0;
		}
	    }
	  else
	    {
	      /* for other address families the quoted checksum is needed */
	      if(ir->ir_inner_udp_sum == 0)
		{
		  return 0;
		}
	      id = ntohs(ir->ir_inner_udp_sum) - 1;
	    }
	}
      else return 0;
    }
  else if(trace->type == SCAMPER_TRACE_TYPE_ICMP_ECHO ||
	  trace->type == SCAMPER_TRACE_TYPE_ICMP_ECHO_PARIS)
    {
     /* if the ICMP type is not something that we care for, then drop it */
      if(SCAMPER_ICMP_RESP_IS_ECHO_REPLY(ir) == 0)
	{
	  if(SCAMPER_ICMP_RESP_IS_TTL_EXP(ir) == 0 &&
	     SCAMPER_ICMP_RESP_IS_UNREACH(ir) == 0 &&
	     SCAMPER_ICMP_RESP_IS_PACKET_TOO_BIG(ir) == 0)
	    {
	      return 0;
	    }

	  /* the quoted packet has to be an ICMP packet of the same family */
	  if(ir->ir_af == AF_INET) proto = IPPROTO_ICMP;
	  else if(ir->ir_af == AF_INET6) proto = IPPROTO_ICMPV6;
	  else return 0;

	  /*
	   * the quoted ICMP id has to match the trace's sport field, and
	   * the quoted sequence number is the probe id
	   */
	  if(SCAMPER_ICMP_RESP_INNER_IS_SET(ir) == 0 ||
	     ir->ir_inner_ip_proto != proto          ||
	     ir->ir_inner_icmp_id  != trace->sport   ||
	     ir->ir_inner_icmp_seq >= state->id_next)
	    {
	      return 0;
	    }

	  id = ir->ir_inner_icmp_seq;
	}
      else
	{
	  /*
	   * an echo reply: the id and sequence number come from the ICMP
	   * header of the response itself rather than from a quoted packet
	   */
	  if(ir->ir_icmp_id  != trace->sport ||
	     ir->ir_icmp_seq >= state->id_next)
	    {
	      return 0;
	    }

	  id = ir->ir_icmp_seq;
	}
    }
  else if(trace->type == SCAMPER_TRACE_TYPE_TCP)
    {
      /* if the ICMP type is not something that we care for, then drop it */
      if(SCAMPER_ICMP_RESP_IS_TTL_EXP(ir) == 0 &&
	 SCAMPER_ICMP_RESP_IS_UNREACH(ir) == 0 &&
	 SCAMPER_ICMP_RESP_IS_PACKET_TOO_BIG(ir) == 0)
	{
	  return 0;
	}

      /*
       * if the ICMP response does not reference a TCP probe sent from our
       * source port to the destination port specified then ignore the
       * ICMP packet
       */
      if(SCAMPER_ICMP_RESP_INNER_IS_SET(ir) == 0 ||
	 ir->ir_inner_ip_proto  != IPPROTO_TCP ||
	 ir->ir_inner_tcp_sport != trace->sport ||
	 ir->ir_inner_tcp_dport != trace->dport)
	{
	  return 0;
	}

      if(ir->ir_af == AF_INET)
	{
	  /* determine which probe id the ip id corresponds to */
	  if(trace_ipid_fudge(state, ir->ir_inner_ip_id, &id) != 0)
	    {
	      return 0;
	    }
	}
      else
	{
	  /* the id is taken from the quoted flow field (less one) */
	  if(ir->ir_inner_ip_flow == 0)
	    {
	      return 0;
	    }
	  id = ir->ir_inner_ip_flow - 1;
	}
    }
  else
    {
      /* a probe method that is not handled here */
      return 0;
    }

  /* only index the probes array with an id below id_next */
  if(id < state->id_next)
    {
      func[state->mode](task, ir, state->probes[id]);
    }

  return 0;
}

/*
 * timeout_trace
 *
 * the trace timed out on the wait queue while in the traceroute state,
 * and all allotted attempts for the hop have been sent.  move on to the
 * next hop, then check each of the conditions that could stop the trace:
 * a stop reason from a recorded hop, the hop limit, and the gap limit.
 */
static int timeout_trace(scamper_task_t *task)
{
  scamper_trace_t     *trace = task->data;
  trace_state_t       *state = task->state;
  scamper_trace_hop_t *hop;
  uint8_t              reason, data;
  int                  i;

  /* we tried this hop, so move onto the next */
  trace->hop_count++;
  state->ttl++;

  /*
   * when all attempts are sent regardless of responses, the hop just
   * finished may have responses recorded against it.  check each of
   * them for a reason to stop probing.
   */
  if((trace->flags & SCAMPER_TRACE_FLAG_ALLATTEMPTS) != 0)
    {
      hop = trace->hops[trace->hop_count-1];
      while(hop != NULL)
        {
          trace_stop_reason(trace, hop, state, &reason, &data);
          if(reason != SCAMPER_TRACE_STOP_NONE)
            {
              trace_stop(trace, reason, data);
              trace_next_mode(task);
              return 0;
            }
          hop = hop->hop_next;
        }
    }

  /* has the hop limit been reached? */
  if(trace->hop_count == 255 || trace->hop_count == trace->hoplimit)
    {
      trace_stop_hoplimit(trace);
      trace_next_mode(task);
      return 0;
    }

  /*
   * a dead path is defined as a path that has an unresponsive target
   * host, which we stop tracing after the gaplimit is reached.  there
   * is nothing to check until at least gaplimit hops have been probed.
   */
  if(trace->hop_count - (trace->firsthop - 1) < trace->gaplimit)
    return 0;

  /* a response at any of the last gaplimit hops means the path is alive */
  for(i=0; i<trace->gaplimit; i++)
    {
      if(trace->hops[trace->hop_count-1-i] != NULL)
        return 0;
    }

  /* the path is dead: either try a last-ditch probe or stop */
  if(trace->gapaction == SCAMPER_TRACE_GAPACTION_LASTDITCH)
    {
      state->mode = MODE_LASTDITCH;
      state->ttl = 255;
    }
  else
    {
      trace_stop_gaplimit(trace);
      trace_next_mode(task);
    }

  return 0;
}

/*
 * timeout_lastditch
 *
 * none of the last-ditch probes got a response.  stop the trace with the
 * gaplimit reason and signal the queue that the task is done.
 */
static int timeout_lastditch(scamper_task_t *task)
{
  scamper_trace_t *trace = task->data;

  trace_stop_gaplimit(trace);
  scamper_queue_done(task->queue, scamper_holdtime_get()*1000);

  return 0;
}

/*
 * timeout_pmtud_default
 *
 * no response was received to the probes sent in the default PMTUD state.
 * initialise the L2 search and switch to the silent L2 state.
 */
static int timeout_pmtud_default(scamper_task_t *task)
{
  trace_state_t *st = task->state;

  /* the L2 search state is set up before the mode is changed */
  pmtud_L2_init(st);
  st->mode = MODE_PMTUD_SILENT_L2;

  return 0;
}

/*
 * timeout_pmtud_silent_L2
 *
 * no response was received for the packet size being probed in the L2
 * search.  the size becomes the upper bound of the search; when the
 * bounds meet, the TTL-limited search is started if pmtud_TTL_init
 * allows it.
 */
static int timeout_pmtud_silent_L2(scamper_task_t *task)
{
  trace_state_t *state = task->state;
  int probe_size = state->header_size + state->payload_size;

  /*
   * have we scanned the L2 table to the official minimum MTU?
   * if we have, then PMTU fails and we abort.
   */
  if(state->L2->idx == 0)
    {
      scamper_queue_done(task->queue, scamper_holdtime_get()*1000);
      return 0;
    }

  if(state->L2->lower + 1 == probe_size)
    {
      /* nothing left between the bounds: settle on the lower bound */
      state->L2->out = state->L2->lower;

      /* set the bounds of the TTL search */
      if(pmtud_TTL_init(task) == 1)
        state->mode = MODE_PMTUD_SILENT_TTL;
      else
        scamper_queue_done(task->queue, scamper_holdtime_get()*1000);

      return 0;
    }

  /* still space to search below the size that got no response */
  pmtud_L2_set_probesize(state, state->L2->lower, probe_size);

  return 0;
}

/*
 * timeout_pmtud_silent_TTL
 *
 * no response was received for the TTL being probed in the TTL-limited
 * search.  try and select another TTL to probe with, using the current
 * TTL as the upper bound; if that is not possible, the search is ended.
 */
static int timeout_pmtud_silent_TTL(scamper_task_t *task)
{
  trace_state_t *state = task->state;
  int more;

  more = pmtud_TTL_set_probettl(task, state->TTL->lower, state->ttl);

  /* no other TTL to try: finish this portion of the search */
  if(more == 0)
    pmtud_L2_search_end(task);

  return 0;
}

/*
 * timeout_pmtud_badsugg
 *
 * if we timeout while trying to determine the underlying MTU on a path
 * where a router gives a bad suggestion, chances are that an ICMP blackhole
 * exists later in the path.  the size that timed out becomes the lower
 * bound of the search, so that a larger packet is tried if possible.
 */
static int timeout_pmtud_badsugg(scamper_task_t *task)
{
  trace_state_t *state = task->state;
  int lower = state->header_size + state->payload_size;
  int upper = state->L2->upper;

  /* there is no hop to record for this size */
  pmtud_L2_hop(state, NULL);

  if(lower + 1 == upper)
    {
      /* terminate the search now */
      state->L2->out   = lower;
      state->L2->lower = state->L2->out;
      state->L2->upper = upper;

      /* the result makes no difference to what is returned here */
      (void)pmtud_L2_search_end(task);
    }
  else
    {
      pmtud_L2_set_probesize(state, lower, upper);
    }

  return 0;
}

/*
 * timeout_rtsock
 *
 * the trace timed out while in the route socket state.  if we can't get
 * sense out of the route socket, then do the trace anyway, but without
 * pmtud or datalink support.
 */
static int timeout_rtsock(scamper_task_t *task)
{
  scamper_trace_t *trace = task->data;
  trace_state_t *state = task->state;

  /* clear the two flags separately; the outcome is the same */
  trace->flags &= ~(SCAMPER_TRACE_FLAG_PMTUD | SCAMPER_TRACE_FLAG_DL);

  /* carry on with a plain traceroute */
  state->mode = MODE_TRACE;

  return 0;
}

/*
 * do_trace_handle_timeout
 *
 * the trace has expired while sitting on the wait queue.  if all of the
 * attempts allotted have been sent, the attempt counter is reset and the
 * timeout handler for the current mode is called.  always returns zero.
 */
static int do_trace_handle_timeout(scamper_task_t *task)
{
  /* timeout handlers, indexed by the current mode of the trace */
  static int (* const handlers[])(scamper_task_t *) = {
    timeout_trace,             /* MODE_TRACE            == 0x00 */
    timeout_lastditch,         /* MODE_LASTDITCH        == 0x01 */
    timeout_pmtud_default,     /* MODE_PMTUD_DEFAULT    == 0x02 */
    timeout_pmtud_silent_L2,   /* MODE_PMTUD_SILENT_L2  == 0x03 */
    timeout_pmtud_silent_TTL,  /* MODE_PMTUD_SILENT_TTL == 0x04 */
    timeout_pmtud_badsugg,     /* MODE_PMTUD_BADSUGG    == 0x05 */
    timeout_rtsock,            /* MODE_RTSOCK           == 0x06 */
  };

  scamper_trace_t *trace = task->data;
  trace_state_t   *state = task->state;

  assert(state->mode <= MODE_MAX);

  /* there are still attempts to be sent for this probe type */
  if(state->attempt != trace->attempts)
    return 0;

  /* we're probably going to send another probe, so reset the attempt # */
  state->attempt = 0;

  /* deal with the probe failing in whatever mode the trace is in */
  handlers[state->mode](task);

  return 0;
}

/*
 * handletcp_trace
 *
 * a TCP response was seen on the datalink while the trace is in the
 * traceroute state.  record a hop for it, and then either keep sending
 * the remaining attempts (when all attempts are to be sent) or stop the
 * trace as completed.
 *
 * returns -1 if a hop record could not be created, zero otherwise.
 */
static int handletcp_trace(scamper_task_t *task, scamper_dl_rec_t *dl,
			   trace_probe_t *probe)
{
  scamper_trace_t *trace = task->data;
  trace_state_t   *state = task->state;
  scamper_trace_hop_t *hop;

  /* we should only have to deal with probes sent while in the trace state */
  if(probe->mode != MODE_TRACE)
    {
      return 0;
    }

  /* create a hop record based off the TCP data */
  if((hop = trace_tcp_hop(probe, dl)) == NULL)
    {
      return -1;
    }
  trace_hopins(&trace->hops[hop->hop_probe_ttl-1], hop);

  /*
   * record the receive timestamp with the probe structure, if one has
   * not been recorded already and the datalink record carries one
   */
  if((probe->flags & TRACE_PROBE_FLAG_DL_RX) == 0 &&
     (dl->dl_flags & SCAMPER_DL_FLAG_TIMESTAMP) != 0)
    {
      timeval_cpy(&probe->rx_tv, &dl->dl_tv);
      probe->flags |= TRACE_PROBE_FLAG_DL_RX;
    }

  /* count the response, saturating in the same way trace_icmp_hop does */
  if(probe->rx != 65535) probe->rx++;

  /* if the reply is for the current working hop */
  if(hop->hop_probe_ttl - 1 == trace->hop_count)
    {
      /* if we are sending all allotted probes to the target */
      if(trace->flags & SCAMPER_TRACE_FLAG_ALLATTEMPTS)
	{
	  /*
	   * if we get an out of order reply, then we go back to waiting for
	   * the one we just probed for
	   */
	  if(probe->id + 1 != state->attempt)
	    {
	      return 0;
	    }

	  /* send the next attempt for this hop, if there is one to send */
	  if(state->attempt < trace->attempts)
	    {
	      scamper_queue_probe(task->queue);
	    }
	}
      else
	{
	  trace->hop_count++;
	  trace_stop_completed(trace);
	  scamper_queue_done(task->queue, scamper_holdtime_get()*1000);
	}
    }
  else
    {
      /* a reply for some other hop still means the trace is complete */
      trace_stop_completed(trace);
      scamper_queue_done(task->queue, scamper_holdtime_get()*1000);
    }

  return 0;
}

/*
 * handletcp_lastditch
 *
 * a TCP response was seen on the datalink while the trace is checking if
 * the end-host is responsive.  a response to a last-ditch probe is
 * recorded and finishes the task; a response to a probe sent in the trace
 * state is just recorded with the other hops.
 *
 * returns -1 if a hop record could not be created, zero otherwise.
 */
static int handletcp_lastditch(scamper_task_t *task, scamper_dl_rec_t *dl,
			       trace_probe_t *probe)
{
  scamper_trace_t *trace = task->data;
  scamper_trace_hop_t *hop;

  /* only handle TCP responses in these two states */
  if(probe->mode != MODE_TRACE && probe->mode != MODE_LASTDITCH)
    {
      return 0;
    }

  /* count the response, saturating in the same way trace_icmp_hop does */
  if(probe->rx != 65535) probe->rx++;

  /* create a hop record based off the TCP data */
  if((hop = trace_tcp_hop(probe, dl)) == NULL)
    {
      return -1;
    }

  if(probe->mode == MODE_LASTDITCH)
    {
      trace_hopins(&trace->lastditch, hop);
      trace_stop_gaplimit(trace);
      scamper_queue_done(task->queue, scamper_holdtime_get()*1000);
    }
  else
    {
      /*
       * XXX: after inserting the hop, probably should go back into the trace
       * mode
       */
      trace_hopins(&trace->hops[hop->hop_probe_ttl-1], hop);
    }

  return 0;
}

/*
 * dlin_trace
 *
 * handle a datalink record for an inbound packet which was sent
 * for a probe in the trace state.
 *
 * the RTT of every hop record that was created for the probe is replaced
 * with one computed from the probe's tx and rx timestamps, and the hop is
 * flagged as having a datalink rx timestamp.  the dl parameter itself is
 * not used here.
 *
 * trace_hopins keeps each hop list in ascending hop_probe_id order, so
 * records with a smaller id are skipped and the scan stops at the first
 * record with a larger id.
 */
static void dlin_trace(scamper_trace_t *trace,
		       scamper_dl_rec_t *dl, trace_probe_t *probe)
{
  scamper_trace_hop_t *hop;
  struct timeval tv;

  /* adjust the rtt based on the timestamp included in the datalink record */
  timeval_rtt(&tv, &probe->tx_tv, &probe->rx_tv);

  for(hop=trace->hops[probe->ttl-1]; hop != NULL; hop = hop->hop_next)
    {
      /* hops for earlier probes come first in the list; skip over them */
      if(hop->hop_probe_id < probe->id + 1) continue;

      /* hops for later probes follow; nothing further can match */
      if(hop->hop_probe_id > probe->id + 1) break;

      /* the casts make the arguments agree with the format specifiers */
      scamper_debug(__func__,
		    "hop %d.%06d dl_rec %d.%06d diff %lld",
		    (int)hop->hop_rtt.tv_sec, (int)hop->hop_rtt.tv_usec,
		    (int)tv.tv_sec, (int)tv.tv_usec,
		    (long long)timeval_diff_usec(&hop->hop_rtt, &tv));

      hop->hop_flags &= ~(SCAMPER_TRACE_HOP_FLAG_TS_SOCK_RX);
      hop->hop_flags |= SCAMPER_TRACE_HOP_FLAG_TS_DL_RX;
      timeval_cpy(&hop->hop_rtt, &tv);
    }

  return;
}

/*
 * dlout_apply
 *
 * walk a list of hop records, starting at hop, and add diff to the RTT of
 * every record whose hop_probe_id is probe->id + 1.  each adjusted record
 * is flagged as having a datalink transmit timestamp.  the walk ends at
 * the end of the list, or at the first record whose hop_probe_id is less
 * than probe->id + 1.
 */
static void dlout_apply(scamper_trace_hop_t *hop,
                        trace_probe_t *probe, struct timeval *diff)
{
  for(; hop != NULL; hop = hop->hop_next)
    {
      /* nothing further along the list can match; stop looking */
      if(hop->hop_probe_id < probe->id + 1)
        break;

      /* a record for some other probe; leave it alone */
      if(hop->hop_probe_id != probe->id + 1)
        continue;

      hop->hop_flags |= SCAMPER_TRACE_HOP_FLAG_TS_DL_TX;
      timeval_add_tv(&hop->hop_rtt, diff);
    }
}

/*
 * dlout_trace
 *
 * a more accurate transmit timestamp has been obtained for a probe sent
 * in the trace state (the time the packet was queued at the network
 * interface).  apply the difference to the RTT of the hop records stored
 * for the TTL that the probe was sent with.
 */
static void dlout_trace(scamper_trace_t *trace,
                        trace_probe_t *probe, struct timeval *diff)
{
  scamper_trace_hop_t *head = trace->hops[probe->ttl-1];
  dlout_apply(head, probe, diff);
}

/*
 * dlout_lastditch
 *
 * as for dlout_trace, but the adjustment is applied to the hop records
 * kept in the trace's lastditch list.
 */
static void dlout_lastditch(scamper_trace_t *trace,
                            trace_probe_t *probe, struct timeval *diff)
{
  scamper_trace_hop_t *head = trace->lastditch;
  dlout_apply(head, probe, diff);
}

/*
 * do_trace_handle_dl
 *
 * handle a datalink record that may have something useful for the
 * traceroute, such as a more accurate timestamp.
 *
 * the record is first matched against the trace: depending on the probe
 * method, the ports / ICMP id / IP ID / flow label / checksum carried in
 * the record are used to decide if the packet is an outbound probe
 * (direction == 1) or an inbound response (direction == 0), and which
 * probe id it belongs to.  records which do not match are ignored.
 *
 * for an outbound record, the probe's transmit timestamp is replaced with
 * the datalink timestamp, and any hop records already stored for the probe
 * have their RTT adjusted by the mode-specific dlout handler.
 *
 * for an inbound record, the probe's receive timestamp is set from the
 * datalink timestamp the first time one is seen.  inbound TCP packets are
 * then passed to the mode-specific handletcp handler (which creates hop
 * records); other inbound packets are passed to the mode-specific dlin
 * handler if at least one response has already been counted for the probe.
 *
 * always returns 0.
 */
static int do_trace_handle_dl(scamper_task_t *task, scamper_dl_rec_t *dl)
{
  /* handlers, indexed by probe mode, for outbound datalink records */
  static void (* const dlout_func[])(scamper_trace_t *, trace_probe_t *,
                                     struct timeval *) =
  {
    dlout_trace,     /* MODE_TRACE            == 0x00 */
    dlout_lastditch, /* MODE_LASTDITCH        == 0x01 */
    NULL,            /* MODE_PMTUD_DEFAULT    == 0x02 */
    NULL,            /* MODE_PMTUD_SILENT_L2  == 0x03 */
    NULL,            /* MODE_PMTUD_SILENT_TTL == 0x04 */
    NULL,            /* MODE_PMTUD_BADSUGG    == 0x05 */
    NULL,            /* MODE_RTSOCK           == 0x06 */
  };

  /* handlers, indexed by probe mode, for inbound non-TCP datalink records */
  static void (* const dlin_func[])(scamper_trace_t *, scamper_dl_rec_t *,
                                    trace_probe_t *) =
  {
    dlin_trace,      /* MODE_TRACE            == 0x00 */
    NULL,            /* MODE_LASTDITCH        == 0x01 */
    NULL,            /* MODE_PMTUD_DEFAULT    == 0x02 */
    NULL,            /* MODE_PMTUD_SILENT_L2  == 0x03 */
    NULL,            /* MODE_PMTUD_SILENT_TTL == 0x04 */
    NULL,            /* MODE_PMTUD_BADSUGG    == 0x05 */
    NULL,            /* MODE_RTSOCK           == 0x06 */
  };

  /* handlers, indexed by probe mode, for inbound TCP datalink records */
  static int (* const handletcp_func[])(scamper_task_t *, scamper_dl_rec_t *,
                                        trace_probe_t *) =
  {
    handletcp_trace,     /* MODE_TRACE            == 0x00 */
    handletcp_lastditch, /* MODE_LASTDITCH        == 0x01 */
    NULL,                /* MODE_PMTUD_DEFAULT    == 0x02 */
    NULL,                /* MODE_PMTUD_SILENT_L2  == 0x03 */
    NULL,                /* MODE_PMTUD_SILENT_TTL == 0x04 */
    NULL,                /* MODE_PMTUD_BADSUGG    == 0x05 */
    NULL,                /* MODE_RTSOCK           == 0x06 */
  };

  scamper_trace_t *trace = task->data;
  trace_state_t   *state = task->state;
  trace_probe_t   *probe;
  uint16_t         probe_id;
  int              direction;
  struct timeval   diff;

  /* if this record has no timestamp, go no further */
  if((dl->dl_flags & SCAMPER_DL_FLAG_TIMESTAMP) == 0)
    {
      return 0;
    }

  /*
   * try and determine the direction of the packet and the associated probe
   * for this datalink record
   */
  if(trace->type == SCAMPER_TRACE_TYPE_UDP ||
     trace->type == SCAMPER_TRACE_TYPE_UDP_PARIS)
    {
      if(dl->dl_ip_proto == IPPROTO_UDP)
        {
          /* a UDP packet using our source port is an outbound probe */
          if(dl->dl_udp_sport != trace->sport) return 0;

          direction = 1;

          if(trace->type == SCAMPER_TRACE_TYPE_UDP)
            {
              /* the probe id is the offset from the base destination port */
              probe_id = dl->dl_udp_dport - trace->dport;
            }
          else
            {
              /* paris probes carry id + 1 in the UDP checksum */
              probe_id = ntohs(dl->dl_udp_sum) - 1;
            }
        }
      else if(SCAMPER_DL_IS_ICMP(dl))
        {
          /* only ICMP errors quoting one of our UDP probes are of interest */
          if(SCAMPER_DL_IS_ICMP_TTL_EXP(dl) == 0 &&
             SCAMPER_DL_IS_ICMP_UNREACH(dl) == 0 &&
             SCAMPER_DL_IS_ICMP_PACKET_TOO_BIG(dl) == 0)
            {
              return 0;
            }
          if(dl->dl_icmp_ip_proto  != IPPROTO_UDP) return 0;
          if(dl->dl_icmp_udp_sport != trace->sport) return 0;

          direction = 0;

          if(trace->type == SCAMPER_TRACE_TYPE_UDP)
            {
              probe_id = dl->dl_icmp_udp_dport - trace->dport;
            }
          else
            {
              /* paris probes all use the same destination port */
              if(dl->dl_icmp_udp_dport != trace->dport)
                {
                  return 0;
                }

              if(dl->dl_af == AF_INET)
                {
                  /*
                   * trust the quoted UDP checksum if it agrees with the
                   * quoted IP ID; otherwise fall back to the IP ID alone
                   */
                  if(dl->dl_icmp_udp_sum == dl->dl_icmp_ip_id &&
                     dl->dl_icmp_udp_sum != 0)
                    {
                      probe_id = ntohs(dl->dl_icmp_udp_sum) - 1;
                    }
                  else if(trace_ipid_fudge(state,dl->dl_icmp_ip_id,
                                           &probe_id) != 0)
                    {
                      return 0;
                    }
                }
              else
                {
                  assert(dl->dl_af == AF_INET6);
                  if(dl->dl_icmp_udp_sum == 0)
                    {
                      return 0;
                    }
                  probe_id = ntohs(dl->dl_icmp_udp_sum) - 1;
                }
            }
        }
      else return 0;
    }
  else if(trace->type == SCAMPER_TRACE_TYPE_ICMP_ECHO ||
          trace->type == SCAMPER_TRACE_TYPE_ICMP_ECHO_PARIS)
    {
      if(SCAMPER_DL_IS_ICMP(dl) == 0) return 0;

      if(SCAMPER_DL_IS_ICMP_ECHO_REQUEST(dl))
        {
          /* an echo request with our id is an outbound probe */
          if(dl->dl_icmp_id != trace->sport) return 0;
          probe_id = dl->dl_icmp_seq;
          direction = 1;
        }
      else if(SCAMPER_DL_IS_ICMP_ECHO_REPLY(dl))
        {
          if(dl->dl_icmp_id != trace->sport) return 0;
          probe_id = dl->dl_icmp_seq;
          direction = 0;
        }
      else if((SCAMPER_DL_IS_ICMP_TTL_EXP(dl) ||
               SCAMPER_DL_IS_ICMP_UNREACH(dl) ||
               SCAMPER_DL_IS_ICMP_PACKET_TOO_BIG(dl)) &&
              SCAMPER_DL_IS_ICMP_PROTO_ICMP_ECHO_REQ(dl))
        {
          /* an ICMP error quoting one of our echo requests */
          if(dl->dl_icmp_icmp_id != trace->sport) return 0;
          probe_id = dl->dl_icmp_icmp_seq;
          direction = 0;
        }
      else return 0;
    }
  else if(trace->type == SCAMPER_TRACE_TYPE_TCP)
    {
      if(dl->dl_ip_proto == IPPROTO_TCP)
        {
          /*
           * if the syn flag (and only the syn flag is set) and the sport
           * and dport match what we probe with, then the probe is probably
           * an outgoing one.
           */
          if((dl->dl_tcp_flags & TH_SYN)  == TH_SYN &&
             (dl->dl_tcp_flags & ~TH_SYN) == 0 &&
             dl->dl_tcp_sport == trace->sport)
            {
              if(dl->dl_af == AF_INET)
                probe_id = dl->dl_ip_id - 1;
              else
                probe_id = dl->dl_ip_flow - 1;

              direction = 1;
            }
          else if(dl->dl_tcp_sport == trace->dport &&
                  dl->dl_tcp_dport == trace->sport)
            {
              /* a TCP response is attributed to the last probe sent */
              probe_id = state->id_next - 1;
              direction = 0;
            }
          else return 0;
        }
      else if(SCAMPER_DL_IS_ICMP(dl))
        {
          /* only ICMP errors quoting one of our TCP probes are of interest */
          if(SCAMPER_DL_IS_ICMP_TTL_EXP(dl) == 0 &&
             SCAMPER_DL_IS_ICMP_UNREACH(dl) == 0 &&
             SCAMPER_DL_IS_ICMP_PACKET_TOO_BIG(dl) == 0)
            {
              return 0;
            }
          if(dl->dl_icmp_ip_proto  != IPPROTO_TCP) return 0;
          if(dl->dl_icmp_tcp_sport != trace->sport) return 0;
          if(dl->dl_icmp_tcp_dport != trace->dport) return 0;

          /* determine which probe the ICMP response corresponds to */
          if(dl->dl_af == AF_INET)
            {
              if(trace_ipid_fudge(state, dl->dl_icmp_ip_id, &probe_id) != 0)
                {
                  return 0;
                }
            }
          else
            {
              if(dl->dl_icmp_ip_flow == 0) return 0;
              probe_id = dl->dl_icmp_ip_flow - 1;
            }

          direction = 0;
        }
      else return 0;
    }
  else return 0;

  /*
   * find the probe that corresponds to this datalink record.  ids at or
   * beyond id_next have not been sent (this also rejects the wrapped
   * value produced above when an id of zero had one subtracted from it)
   */
  if(probe_id >= state->id_next)
    {
      return 0;
    }
  probe = state->probes[probe_id];

  /* make sure the probe structure makes sense */
  assert(probe->mode <= MODE_MAX);

  /* if this is an inbound packet with a timestamp attached */
  if(direction == 0)
    {
      /* inbound TCP packets result in a hop record being created */
      if(dl->dl_ip_proto == IPPROTO_TCP)
        {
          /*
           * record the receive timestamp with the probe structure if it hasn't
           * been previously recorded.  the flag must be clear for the
           * timestamp to be stored; testing for the flag being set would
           * mean the datalink timestamp is never recorded.
           */
          if((probe->flags & TRACE_PROBE_FLAG_DL_RX) == 0)
            {
              timeval_cpy(&probe->rx_tv, &dl->dl_tv);
              probe->flags |= TRACE_PROBE_FLAG_DL_RX;
            }

          if(handletcp_func[probe->mode] != NULL)
            {
              handletcp_func[probe->mode](task, dl, probe);
            }
        }
      /* other datalink records result in timestamps being adjusted */
      else if((probe->flags & TRACE_PROBE_FLAG_DL_RX) == 0)
        {
          /* update the receive timestamp stored with the probe */
          probe->flags |= TRACE_PROBE_FLAG_DL_RX;
          timeval_cpy(&probe->rx_tv, &dl->dl_tv);

          /* if at least one hop record is present then adjust */
          if(probe->rx > 0 && dlin_func[probe->mode] != NULL)
            {
              dlin_func[probe->mode](trace, dl, probe);
            }
        }
    }
  else
    {
      /*
       * the timeval members are not of type int on all platforms, so
       * cast every argument to the type its conversion specifier expects
       */
      scamper_debug(__func__, "probe %d.%06d dl_rec %d.%06d diff %lld",
                    (int)probe->tx_tv.tv_sec, (int)probe->tx_tv.tv_usec,
                    (int)dl->dl_tv.tv_sec, (int)dl->dl_tv.tv_usec,
                    (long long)timeval_diff_usec(&probe->tx_tv, &dl->dl_tv));

      /* if at least one hop record is present then adjust */
      if(probe->rx > 0 && dlout_func[probe->mode] != NULL)
        {
          timeval_rtt(&diff, &probe->tx_tv, &dl->dl_tv);
          dlout_func[probe->mode](trace, probe, &diff);
        }

      /* update the TX timestamp of the probe */
      probe->flags |= TRACE_PROBE_FLAG_DL_TX;
      timeval_cpy(&probe->tx_tv, &dl->dl_tv);
    }

  return 0;
}

/*
 * rt_framing
 *
 * work out the datalink header that has to be prepended to each probe
 * that is transmitted using the datalink obtained for the trace, and
 * store the header and its size in the trace's state (dl_hdr, dl_size).
 *
 * returns 0 if the framing was determined, and -1 if the datalink cannot
 * be used to transmit, memory could not be allocated, a mac address could
 * not be obtained, or the destination is not an IPv4 / IPv6 address.
 */
static int rt_framing(scamper_task_t *task, scamper_rt_rec_t *rt)
{
  scamper_trace_t *trace = task->data;
  trace_state_t *state = task->state;
  scamper_dl_t *dl = scamper_fd_write_state(state->dl);
  int tx_type, af;

  /*
   * ask the datalink code what sort of header it wants.  if it cannot
   * send packets, then we can't do PMTUD
   */
  tx_type = scamper_dl_tx_type(dl);

  /* a raw datalink takes no header at all, so there is nothing to build */
  if(tx_type == SCAMPER_DL_TX_RAW)
    {
      state->dl_size = 0;
      return 0;
    }

  /* determine how large the header is for the remaining types */
  if(tx_type == SCAMPER_DL_TX_ETHERNET || tx_type == SCAMPER_DL_TX_ETHLOOP)
    {
      state->dl_size = 14;
    }
  else if(tx_type == SCAMPER_DL_TX_NULL)
    {
      state->dl_size = sizeof(int);
    }
  else
    {
      if(tx_type != SCAMPER_DL_TX_UNSUPPORTED)
        {
          scamper_debug(__func__, "unhandled tx_type %d", tx_type);
        }
      return -1;
    }

  /* get memory for the header */
  state->dl_hdr = malloc(state->dl_size);
  if(state->dl_hdr == NULL)
    {
      printerror(errno, strerror, __func__, "could not malloc dl_hdr");
      return -1;
    }

  /* the null header is just the address family of the packet */
  if(tx_type == SCAMPER_DL_TX_NULL)
    {
      if(trace->dst->type == SCAMPER_ADDR_TYPE_IPV4)
        af = AF_INET;
      else if(trace->dst->type == SCAMPER_ADDR_TYPE_IPV6)
        af = AF_INET6;
      else
        return -1;

      memcpy(state->dl_hdr, &af, sizeof(int));
      return 0;
    }

  /*
   * what remains is an ethernet-style header: six bytes of destination
   * mac, six bytes of source mac, and two bytes of ethertype
   */
  if(tx_type == SCAMPER_DL_TX_ETHLOOP)
    {
      /* both mac addresses are zeroed for the loopback variant */
      memset(state->dl_hdr, 0, 12);
    }
  else
    {
      /* the source mac address is that of the outgoing interface */
      if(if_getmac(rt->ifindex, state->dl_hdr+6) == -1)
        {
          scamper_debug(__func__, "could not get source mac");
          return -1;
        }

      /* now determine the destination mac address */
      if(rt->gwaddr == NULL)
        {
          /* without a gateway, the target itself is on the local network */
          memcpy(state->dl_hdr, state->dl_hdr+6, 6);
          if(scamper_addr2mac_whohas(rt->ifindex, trace->src, trace->dst,
                                     state->dl_hdr) != 1)
            {
              scamper_debug(__func__, "could not get destination mac");
              return -1;
            }
        }
      else if(rt->gwaddr->type != SCAMPER_ADDR_TYPE_ETHERNET)
        {
          /* the gateway is known by IP address; look up its mac */
          memcpy(state->dl_hdr, state->dl_hdr+6, 6);
          if(scamper_addr2mac_whohas(rt->ifindex, trace->src, rt->gwaddr,
                                     state->dl_hdr) != 1)
            {
              scamper_debug(__func__, "could not get gateway mac");
              return -1;
            }
        }
      else
        {
          /* the route socket supplied the gateway's mac address directly */
          memcpy(state->dl_hdr, rt->gwaddr->addr, 6);
        }
    }

  /* finish with the ethertype of the packets that will be sent */
  if(trace->dst->type == SCAMPER_ADDR_TYPE_IPV4)
    {
      state->dl_hdr[12] = 0x08;
      state->dl_hdr[13] = 0x00;
    }
  else if(trace->dst->type == SCAMPER_ADDR_TYPE_IPV6)
    {
      state->dl_hdr[12] = 0x86;
      state->dl_hdr[13] = 0xDD;
    }
  else return -1;

  return 0;
}

/*
 * do_trace_handle_rt
 *
 * handle the route record that arrives in response to the route lookup
 * made while the trace is in the rtsock mode.  the record's ifindex is
 * used to obtain a datalink, and, where probes will be transmitted using
 * the datalink, the framing to use is determined.
 *
 * if any of that fails, TCP and UDP-paris traces are aborted (-1 is
 * returned); other traces carry on without the PMTUD and DL flags.
 * in all other cases the trace moves into the trace mode, a probe is
 * queued, and 0 is returned.  records that arrive when the trace is not
 * in the rtsock mode are ignored.
 */
static int do_trace_handle_rt(scamper_task_t *task, scamper_rt_rec_t *rt)
{
  scamper_trace_t *trace = task->data;
  trace_state_t *state = task->state;
  int failed = 0;

  if(state->mode != MODE_RTSOCK)
    return 0;

  if(rt->error != 0 || rt->ifindex < 0)
    {
      /* there was a problem getting the ifindex */
      printerror(errno, strerror, __func__, "could not get ifindex");
      failed = 1;
    }
  else if((state->dl = scamper_fd_dl(rt->ifindex)) == NULL)
    {
      /*
       * a datalink is needed for tx timestamps and/or to transmit
       * packets, but one could not be obtained for the ifindex
       */
      scamper_debug(__func__, "could not get dl for %d", rt->ifindex);
      failed = 1;
    }
  else if(((trace->flags & SCAMPER_TRACE_FLAG_PMTUD) != 0 ||
           trace->type == SCAMPER_TRACE_TYPE_TCP ||
           trace->type == SCAMPER_TRACE_TYPE_UDP_PARIS) &&
          rt_framing(task, rt) != 0)
    {
      /*
       * path MTU discovery, tcp traceroute, and udp paris traceroute all
       * need to know the framing to use with each probe sent on the
       * datalink, and it could not be determined
       */
      failed = 1;
    }

  if(failed != 0)
    {
      /* these methods cannot proceed without the datalink */
      if(trace->type == SCAMPER_TRACE_TYPE_TCP ||
         trace->type == SCAMPER_TRACE_TYPE_UDP_PARIS)
        {
          trace_handleerror(task, errno);
          return -1;
        }

      /* other methods can continue, minus the datalink features */
      trace->flags &= ~(SCAMPER_TRACE_FLAG_PMTUD | SCAMPER_TRACE_FLAG_DL);
    }

  /* begin the traceroute proper */
  state->mode = MODE_TRACE;
  state->attempt = 0;
  scamper_queue_probe(task->queue);
  return 0;
}

/*
 * do_trace_write
 *
 * write the trace held by the task to the output file associated with
 * the task's source.  always returns 0.
 */
static int do_trace_write(scamper_task_t *task)
{
  scamper_trace_t *trace = task->data;
  scamper_file_t *sf;

  sf = scamper_outfile_getfile(scamper_source_getoutfile(task->source));
  scamper_file_write_trace(sf, trace);

  return 0;
}

/*
 * trace_state_free
 *
 * release everything held by the state structure: the probe records (and
 * any mac address attached to them), the file descriptor references, the
 * datalink header, and finally the structure itself.
 */
static void trace_state_free(trace_state_t *state)
{
  trace_probe_t *tp;
  int idx;

  if(state->probes != NULL)
    {
      /* only the first id_next slots hold probe records */
      idx = 0;
      while(idx < state->id_next)
        {
          tp = state->probes[idx];
          if(tp->rx_mac != NULL)
            scamper_addr_free(tp->rx_mac);
          free(tp);
          idx++;
        }
      free(state->probes);
    }

  /* drop the references held on each of the file descriptors */
  if(state->dl != NULL)
    scamper_fd_free(state->dl);
  if(state->icmp != NULL)
    scamper_fd_free(state->icmp);
  if(state->probe != NULL)
    scamper_fd_free(state->probe);
  if(state->route != NULL)
    scamper_fd_free(state->route);

  if(state->dl_hdr != NULL)
    free(state->dl_hdr);

  free(state);
}

/*
 * trace_state_alloc
 *
 * allocate and initialise the state kept while the trace is in progress:
 * space in the trace for hop records, the array of probe records, and the
 * file descriptors needed to send probes and receive responses.
 *
 * on success the state is stored with the task and 0 is returned.
 * on failure anything allocated for the state is released and -1 is
 * returned.
 */
static int trace_state_alloc(scamper_task_t *task)
{
  scamper_trace_t *trace = task->data;
  trace_state_t *state;
  int id_max, is_v4;

  assert(trace != NULL);

  /* the structure that holds all of the state */
  state = malloc_zero(sizeof(trace_state_t));
  if(state == NULL)
    goto err;

  /*
   * decide how many hops to allocate in the trace.  if probing starts
   * beyond the default allocation, allow for the first hop as well, but
   * never allocate more than 256 hops.
   */
  state->alloc_hops = TRACE_ALLOC_HOPS;
  if(trace->firsthop >= state->alloc_hops)
    {
      if(state->alloc_hops + (uint16_t)trace->firsthop <= 256)
        state->alloc_hops += trace->firsthop;
      else
        state->alloc_hops = 256;
    }

  if(scamper_trace_hops_alloc(trace, state->alloc_hops) == -1)
    goto err;

  /* enough ids to probe each hop with the maximum number of attempts */
  id_max = (state->alloc_hops - trace->firsthop + 2) * trace->attempts;

  /* one pointer per probe that might be sent */
  state->probes = malloc(sizeof(trace_probe_t *) * id_max);
  if(state->probes == NULL)
    goto err;

  /*
   * the trace begins in the rtsock mode if the ifindex has to be
   * determined before any probes are sent
   */
  if((trace->flags & (SCAMPER_TRACE_FLAG_PMTUD|SCAMPER_TRACE_FLAG_DL)) == 0 &&
     trace->type != SCAMPER_TRACE_TYPE_TCP &&
     trace->type != SCAMPER_TRACE_TYPE_UDP_PARIS)
    {
      state->mode = MODE_TRACE;
    }
  else
    {
      state->mode = MODE_RTSOCK;
      state->route = scamper_fd_rtsock();
      if(state->route == NULL)
        goto err;
    }

  /* initial values for the remainder of the state */
  state->id_max       = id_max;
  state->id_next      = 0;
  state->attempt      = 0;
  state->ttl          = trace->firsthop;
  state->dl           = NULL;
  state->dl_hdr       = NULL;
  state->dl_size      = 0;
  state->header_size  = scamper_trace_probe_headerlen(trace);
  state->payload_size = trace->probe_size - state->header_size;

  /* the socket that ICMP responses will arrive on */
  if(trace->dst->type == SCAMPER_ADDR_TYPE_IPV4)
    {
      is_v4 = 1;
      state->icmp = scamper_fd_icmp4();
    }
  else if(trace->dst->type == SCAMPER_ADDR_TYPE_IPV6)
    {
      is_v4 = 0;
      state->icmp = scamper_fd_icmp6();
    }
  else goto err;

  /* the socket that probes will be sent with */
  switch(trace->type)
    {
    case SCAMPER_TRACE_TYPE_UDP:
    case SCAMPER_TRACE_TYPE_UDP_PARIS:
      state->probe = is_v4 ? scamper_fd_udp4(scamper_sport_get()) :
                             scamper_fd_udp6(scamper_sport_get());
      break;

    case SCAMPER_TRACE_TYPE_ICMP_ECHO:
    case SCAMPER_TRACE_TYPE_ICMP_ECHO_PARIS:
      state->probe = is_v4 ? scamper_fd_icmp4() : scamper_fd_icmp6();
      break;

    case SCAMPER_TRACE_TYPE_TCP:
      state->probe = is_v4 ? scamper_fd_tcp4(scamper_sport_get()) :
                             scamper_fd_tcp6(scamper_sport_get());
      break;
    }

  /* both sockets are required */
  if(state->icmp == NULL || state->probe == NULL)
    goto err;

  task->state = state;
  return 0;

 err:
  if(state != NULL)
    trace_state_free(state);
  return -1;
}

/*
 * do_trace_free
 *
 * release the state and the trace data held by the task, if present.
 */
static void do_trace_free(scamper_task_t *task)
{
  trace_state_t *state = task->state;
  scamper_trace_t *trace;

  /* the state kept while the trace was in progress */
  if(state != NULL)
    trace_state_free(state);

  /* the trace data that was collected */
  trace = task->data;
  if(trace != NULL)
    scamper_trace_free(trace);
}

/*
 * do_trace_probe
 *
 * time to probe, so send the packet.
 *
 * on the first call for a task (no state yet) the start time is recorded,
 * the source address is determined, and the state is allocated.  space
 * for hop records and probe records is grown as required.  if the trace
 * is in the rtsock mode, a route lookup is issued instead of a probe.
 * otherwise a probe is assembled according to the trace type, sent, and
 * recorded in the state, and the task is put on the wait queue.
 *
 * returns 0 on success and -1 on failure.  except for an unknown trace
 * type, failures are reported through trace_handleerror before returning.
 */
static int do_trace_probe(scamper_task_t *task)
{
  scamper_trace_t *trace = task->data;
  trace_state_t   *state = task->state;
  trace_probe_t   *tp = NULL;
  scamper_probe_t  probe;
  uint16_t         u16, i;
  trace_probe_t  **probes;
  uint8_t         *buf;

  assert(trace != NULL);

  if(state != NULL)
    {
      assert(state->attempt < trace->attempts);
      assert(state->id_next <= state->id_max);
      assert(state->alloc_hops > 0);
      assert(state->alloc_hops <= 256);
      assert(state->ttl != 0);
    }
  else
    {
      /* timestamp when the trace began */
      gettimeofday_wrap(&trace->start);

      /* determine the source address used for sending probes */
      if((trace->src = scamper_getsrc(trace->dst)) == NULL)
        {
          trace_handleerror(task, errno);
          return -1;
        }

      /* allocate state and store it with the task */
      if(trace_state_alloc(task) != 0)
        {
          trace_handleerror(task, errno);
          return -1;
        }
      state = task->state;
    }

  /* allocate some more space in the trace to store replies, if necessary */
  if(state->alloc_hops == trace->hop_count)
    {
      /*
       * figure out exactly how many hops should be allocated in the
       * trace structure: grow by TRACE_ALLOC_HOPS while there is room to
       * do so, and otherwise cap the allocation at 256 hops so that the
       * alloc_hops <= 256 invariant asserted above is maintained
       */
      if(256 - state->alloc_hops >= TRACE_ALLOC_HOPS)
        {
          u16 = state->alloc_hops + TRACE_ALLOC_HOPS;
        }
      else
        {
          u16 = 256;
        }

      /* allocate the new hops */
      if(scamper_trace_hops_alloc(trace, u16) != 0)
        {
          printerror(errno, strerror, __func__, "could not realloc hops");
          trace_handleerror(task, errno);
          return -1;
        }

      /* initialise the new hops to have null pointers */
      for(i=state->alloc_hops; i<u16; i++)
        {
          trace->hops[i] = NULL;
        }
      state->alloc_hops = u16;
    }

  /* allocate some more space to store probes, if necessary */
  if(state->id_next == state->id_max)
    {
      u16 = state->id_max + TRACE_ALLOC_HOPS;
      probes = realloc(state->probes, sizeof(trace_probe_t *) * u16);
      if(probes == NULL)
        {
          printerror(errno, strerror, __func__, "could not realloc");
          trace_handleerror(task, errno);
          return -1;
        }

      state->id_max = u16;
      state->probes = probes;
    }

  if(state->mode == MODE_RTSOCK)
    {
      if(scamper_rtsock_getroute(state->route, trace->dst) != 0)
        {
          /*
           * the route lookup could not be issued; carry on with the
           * trace without the features that depend on the datalink
           */
          trace->flags &= ~(SCAMPER_TRACE_FLAG_PMTUD | SCAMPER_TRACE_FLAG_DL);
          state->mode = MODE_TRACE;
          state->attempt = 0;
        }
      else
        {
          /* wait for the route record to arrive before probing */
          state->attempt++;
          scamper_queue_wait(task->queue, trace->wait * 1000);
          return 0;
        }
    }

  /* allocate a larger global pktbuf if needed */
  if(pktbuf_len < state->payload_size)
    {
      if((buf = realloc(pktbuf, state->payload_size)) == NULL)
        {
          printerror(errno, strerror, __func__, "could not realloc");
          trace_handleerror(task, errno);
          return -1;
        }
      pktbuf     = buf;
      pktbuf_len = state->payload_size;
      memset(pktbuf, 0, pktbuf_len);
    }

  /* fill out the parts of the probe common to all trace types */
  probe.pr_ip_src    = trace->src;
  probe.pr_ip_dst    = trace->dst;
  probe.pr_ip_tos    = trace->tos;
  probe.pr_ip_ttl    = state->ttl;
  probe.pr_ip_id     = 0;
  probe.pr_ip_flow   = 0;
  probe.pr_data      = pktbuf;
  probe.pr_len       = state->payload_size;
  probe.pr_ipoptc    = 0;
  probe.pr_ipopts    = NULL;
  probe.pr_fd        = scamper_fd_fd_get(state->probe);

  /*
   * the probe is passed the datalink and its header when a datalink is
   * available and the trace is in a PMTUD mode, or is a TCP or UDP-paris
   * trace.
   */
  if((state->mode == MODE_PMTUD_DEFAULT ||
      state->mode == MODE_PMTUD_SILENT_L2 ||
      state->mode == MODE_PMTUD_SILENT_TTL ||
      state->mode == MODE_PMTUD_BADSUGG ||
      trace->type == SCAMPER_TRACE_TYPE_TCP ||
      trace->type == SCAMPER_TRACE_TYPE_UDP_PARIS) &&
     state->dl != NULL)
    {
      probe.pr_dl      = scamper_fd_write_state(state->dl);
      probe.pr_dl_hdr  = state->dl_hdr;
      probe.pr_dl_size = state->dl_size;
    }
  else
    {
      probe.pr_dl      = NULL;
      probe.pr_dl_hdr  = NULL;
      probe.pr_dl_size = 0;
    }

  if(trace->type == SCAMPER_TRACE_TYPE_UDP ||
     trace->type == SCAMPER_TRACE_TYPE_UDP_PARIS)
    {
      probe.pr_ip_proto  = IPPROTO_UDP;
      probe.pr_udp_sport = trace->sport;
      probe.pr_udp_dport = trace->dport;

      /*
       * traditional traceroute identifies probes by varying the UDP
       * destination port number.  UDP-based paris traceroute identifies
       * probes by varying the UDP checksum -- accomplished by manipulating
       * the payload of the packet to get sequential values for the checksum
       */
      if(trace->type == SCAMPER_TRACE_TYPE_UDP)
        {
          probe.pr_udp_dport += state->id_next;
        }
      else
        {
          /*
           * hack the checksum to be our id field by setting the checksum
           * id we want into the packet's body, then calculate the checksum
           * across the packet, and then set the packet's body to be the
           * value returned for the checksum.  this effectively swaps two
           * 16 bit quantities in the packet
           */
          u16 = htons(state->id_next + 1);
          memcpy(probe.pr_data, &u16, 2);

          if(trace->dst->type == SCAMPER_ADDR_TYPE_IPV4)
            {
              /*
               * while the paris traceroute paper says that the payload of the
               * packet is set so that the checksum field can be used to
               * identify a returned probe, the paris traceroute code uses the
               * IP ID field.
               * this is presumably because FreeBSD systems seem to reset the
               * UDP checksum quoted in ICMP destination unreachable messages.
               * scamper's paris traceroute implementation used both IP ID and
               * UDP checksum.
               */
              probe.pr_ip_id = state->id_next + 1;
              u16 = scamper_udp4_cksum(&probe);
            }
          else
            {
              u16 = scamper_udp6_cksum(&probe);
            }

          memcpy(probe.pr_data, &u16, 2);
        }
    }
  else if(trace->type == SCAMPER_TRACE_TYPE_ICMP_ECHO ||
          trace->type == SCAMPER_TRACE_TYPE_ICMP_ECHO_PARIS)
    {
      switch(trace->dst->type)
        {
        case SCAMPER_ADDR_TYPE_IPV4:
          probe.pr_ip_proto = IPPROTO_ICMP;
          probe.pr_icmp_type = ICMP_ECHO;
          break;

        case SCAMPER_ADDR_TYPE_IPV6:
          probe.pr_ip_proto = IPPROTO_ICMPV6;
          probe.pr_icmp_type = ICMP6_ECHO_REQUEST;
          break;
        }

      probe.pr_icmp_code = 0;
      probe.pr_icmp_id   = trace->sport;
      probe.pr_icmp_seq  = state->id_next;

      /*
       * ICMP-based paris traceroute tries to ensure the same path is taken
       * through a load balancer by sending all probes with a constant value
       * for the checksum.  manipulate the payload so this happens.
       * the value chosen to seed the checksum is the trace->sport value, but
       * it could really be anything.
       */
      if(trace->type == SCAMPER_TRACE_TYPE_ICMP_ECHO_PARIS)
        {
          memcpy(probe.pr_data, &trace->sport, 2);
          switch(trace->dst->type)
            {
            case SCAMPER_ADDR_TYPE_IPV4:
              u16 = scamper_icmp4_cksum(&probe);
              break;

            case SCAMPER_ADDR_TYPE_IPV6:
              u16 = scamper_icmp6_cksum(&probe);
              break;
            }
          memcpy(probe.pr_data, &u16, 2);
        }
    }
  else if(trace->type == SCAMPER_TRACE_TYPE_TCP)
    {
      probe.pr_ip_proto  = IPPROTO_TCP;
      probe.pr_tcp_sport = trace->sport;
      probe.pr_tcp_dport = trace->dport;
      probe.pr_tcp_seq   = 0;
      probe.pr_tcp_ack   = 0;
      probe.pr_tcp_flags = TH_SYN;
      probe.pr_tcp_win   = 0;

      /* TCP probes are identified by the IP ID (v4) or flow label (v6) */
      if(trace->dst->type == SCAMPER_ADDR_TYPE_IPV4)
        probe.pr_ip_id   = state->id_next + 1;
      else
        probe.pr_ip_flow = state->id_next + 1;

    }
  else return -1;

  /*
   * allocate a trace probe state record before we try and send the probe
   * as there is no point sending something into the wild that we can't
   * record
   */
  if((tp = malloc_zero(sizeof(trace_probe_t))) == NULL)
    {
      trace_handleerror(task, errno);
      return -1;
    }

  /* send the probe */
  if(scamper_probe(&probe) == -1)
    {
      free(tp);
      trace_handleerror(task, probe.pr_errno);
      return -1;
    }

  /* record the details of the probe that was sent */
  timeval_cpy(&tp->tx_tv, &probe.pr_tx);
  tp->ttl   = probe.pr_ip_ttl;
  tp->size  = probe.pr_len + state->header_size;
  tp->mode  = state->mode;
  tp->id    = state->attempt;

  state->probes[state->id_next] = tp;
  state->id_next++;
  state->attempt++;

  scamper_queue_wait(task->queue, trace->wait * 1000);

  return 0;
}

/*
 * scamper_do_trace_alloc
 *
 * given a string representing a traceroute task, parse the parameters and
 * assemble a trace.  return the trace structure so that it is all ready to
 * go.
 *
 * the string consists of options followed by the address to trace to.
 * NULL is returned if the string cannot be parsed, an option value is
 * outside of its permitted range, the combination of options does not
 * make sense, the address cannot be resolved to an IPv4 or IPv6 address,
 * or memory cannot be allocated.
 */
scamper_trace_t *scamper_do_trace_alloc(char *str)
{
  /* default values of various trace parameters */
  uint8_t  type        = SCAMPER_TRACE_TYPE_UDP;
  uint8_t  flags       = 0;
  uint8_t  attempts    = SCAMPER_DO_TRACE_ATTEMPTS_DEF;
  uint8_t  firsthop    = SCAMPER_DO_TRACE_FIRSTHOP_DEF;
  uint8_t  gaplimit    = SCAMPER_DO_TRACE_GAPLIMIT_DEF;
  uint8_t  gapaction   = SCAMPER_DO_TRACE_GAPACTION_DEF;
  uint8_t  hoplimit    = SCAMPER_DO_TRACE_HOPLIMIT_DEF;
  uint8_t  tos         = SCAMPER_DO_TRACE_TOS_DEF;
  uint8_t  wait        = SCAMPER_DO_TRACE_WAIT_DEF;
  uint8_t  loops       = SCAMPER_DO_TRACE_LOOPS_DEF;
  uint8_t  loopaction  = SCAMPER_DO_TRACE_LOOPACTION_DEF;
  uint16_t sport       = scamper_sport_get();
  uint16_t dport       = SCAMPER_DO_TRACE_DPORT_DEF;

  scamper_option_out_t *opts_out = NULL, *opt;
  scamper_trace_t *trace = NULL;
  char *addr;
  long tmp;

  /* try and parse the string passed in */
  if(scamper_options_parse(str, trace_opts_in, trace_opts_cnt,
                           &opts_out, &addr) != 0)
    {
      goto err;
    }

  /* if there is no IP address after the options string, then stop now */
  if(addr == NULL)
    {
      goto err;
    }

  /* parse the options, do preliminary sanity checks */
  for(opt = opts_out; opt != NULL; opt = opt->next)
    {
      switch(opt->id)
        {
        case TRACE_OPT_DPORT:
          if(string_tolong(opt->str, &tmp) == -1 ||
             tmp < SCAMPER_DO_TRACE_DPORT_MIN ||
             tmp > SCAMPER_DO_TRACE_DPORT_MAX)
            {
              goto err;
            }
          dport = tmp;
          break;

        case TRACE_OPT_FIRSTHOP:
          if(string_tolong(opt->str, &tmp) == -1 ||
             tmp < SCAMPER_DO_TRACE_FIRSTHOP_MIN ||
             tmp > SCAMPER_DO_TRACE_FIRSTHOP_MAX)
            {
              goto err;
            }
          firsthop = tmp;
          break;

        case TRACE_OPT_GAPLIMIT:
          if(string_tolong(opt->str, &tmp) == -1 ||
             tmp < SCAMPER_DO_TRACE_GAPLIMIT_MIN ||
             tmp > SCAMPER_DO_TRACE_GAPLIMIT_MAX)
            {
              goto err;
            }
          gaplimit = tmp;
          break;

        case TRACE_OPT_GAPACTION:
          if(string_tolong(opt->str, &tmp) == -1 ||
             tmp < SCAMPER_DO_TRACE_GAPACTION_MIN ||
             tmp > SCAMPER_DO_TRACE_GAPACTION_MAX)
            {
              goto err;
            }
          gapaction = tmp;
          break;

        case TRACE_OPT_LOOPS:
          if(string_tolong(opt->str, &tmp) == -1 ||
             tmp < SCAMPER_DO_TRACE_LOOPS_MIN    ||
             tmp > SCAMPER_DO_TRACE_LOOPS_MAX)
            {
              goto err;
            }
          loops = tmp;
          break;

        case TRACE_OPT_LOOPACTION:
          if(string_tolong(opt->str, &tmp) == -1   ||
             tmp < SCAMPER_DO_TRACE_LOOPACTION_MIN ||
             tmp > SCAMPER_DO_TRACE_LOOPACTION_MAX)
            {
              goto err;
            }
          loopaction = tmp;
          break;

        case TRACE_OPT_MAXTTL:
          if(string_tolong(opt->str, &tmp) == -1 ||
             tmp < SCAMPER_DO_TRACE_HOPLIMIT_MIN ||
             tmp > SCAMPER_DO_TRACE_HOPLIMIT_MAX)
            {
              goto err;
            }
          hoplimit = tmp;
          break;

        case TRACE_OPT_PMTUD:
          flags |= SCAMPER_TRACE_FLAG_PMTUD;
          break;

        case TRACE_OPT_PROTOCOL:
          if(strcasecmp(opt->str, "UDP") == 0)
            type = SCAMPER_TRACE_TYPE_UDP;
          else if(strcasecmp(opt->str, "TCP") == 0)
            type = SCAMPER_TRACE_TYPE_TCP;
          else if(strcasecmp(opt->str, "ICMP") == 0)
            type = SCAMPER_TRACE_TYPE_ICMP_ECHO;
          else if(strcasecmp(opt->str, "ICMP-paris") == 0)
            type = SCAMPER_TRACE_TYPE_ICMP_ECHO_PARIS;
          else if(strcasecmp(opt->str, "UDP-paris") == 0)
            type = SCAMPER_TRACE_TYPE_UDP_PARIS;
          else goto err;
          break;

        case TRACE_OPT_ATTEMPTS:
          if(string_tolong(opt->str, &tmp) == -1 ||
             tmp < SCAMPER_DO_TRACE_ATTEMPTS_MIN ||
             tmp > SCAMPER_DO_TRACE_ATTEMPTS_MAX)
            {
              goto err;
            }
          attempts = tmp;
          break;

        case TRACE_OPT_ALLATTEMPTS:
          flags |= SCAMPER_TRACE_FLAG_ALLATTEMPTS;
          break;

        case TRACE_OPT_SPORT:
          if(string_tolong(opt->str, &tmp) == -1 ||
             tmp < SCAMPER_DO_TRACE_SPORT_MIN ||
             tmp > SCAMPER_DO_TRACE_SPORT_MAX)
            {
              goto err;
            }
          sport = tmp;
          break;

        case TRACE_OPT_TOS:
          if(string_tolong(opt->str, &tmp) == -1 ||
             tmp < SCAMPER_DO_TRACE_TOS_MIN ||
             tmp > SCAMPER_DO_TRACE_TOS_MAX)
            {
              goto err;
            }
          tos = tmp;
          break;

        case TRACE_OPT_WAIT:
          if(string_tolong(opt->str, &tmp) == -1 ||
             tmp < SCAMPER_DO_TRACE_WAIT_MIN ||
             tmp > SCAMPER_DO_TRACE_WAIT_MAX)
            {
              goto err;
            }
          wait = tmp;
          break;
        }
    }

  /* the parsed options are no longer needed; clear so err won't refree */
  scamper_options_free(opts_out); opts_out = NULL;

  /* sanity check that we don't begin beyond our probe hoplimit */
  if(firsthop > hoplimit && hoplimit != 0)
    {
      goto err;
    }

  /* can't really do pmtud properly without all of the path */
  if(firsthop > 1 && (flags & SCAMPER_TRACE_FLAG_PMTUD))
    {
      goto err;
    }

  /*
   * can't really do pmtud properly without a UDP traceroute method.
   * test the flag against zero rather than against one: the result of
   * the mask is the flag's own bit value, which equals one only if the
   * flag happens to be defined as 0x01.
   */
  if((flags & SCAMPER_TRACE_FLAG_PMTUD) != 0 &&
     type != SCAMPER_TRACE_TYPE_UDP && type != SCAMPER_TRACE_TYPE_UDP_PARIS)
    {
      goto err;
    }

  if((trace = scamper_trace_alloc()) == NULL)
    {
      goto err;
    }
  if((trace->dst= scamper_addrcache_resolve(addrcache,AF_UNSPEC,addr)) == NULL)
    {
      goto err;
    }

  trace->type       = type;
  trace->flags      = flags;
  trace->attempts   = attempts;
  trace->hoplimit   = hoplimit;
  trace->gaplimit   = gaplimit;
  trace->gapaction  = gapaction;
  trace->firsthop   = firsthop;
  trace->tos        = tos;
  trace->wait       = wait;
  trace->loops      = loops;
  trace->loopaction = loopaction;
  trace->sport      = sport;
  trace->dport      = dport;

  /* choose the size of each probe based on address family and method */
  switch(trace->dst->type)
    {
    case SCAMPER_ADDR_TYPE_IPV4:
      if(trace->type == SCAMPER_TRACE_TYPE_TCP)
        {
          trace->probe_size = 40;
        }
      else
        {
          trace->probe_size = 44;
        }
      break;

    case SCAMPER_ADDR_TYPE_IPV6:
      trace->probe_size = 60;
      break;

    default:
      goto err;
    }

  /* a non-zero scamper_option_dl() turns on the DL flag for the trace */
  if(scamper_option_dl() != 0)
    {
      trace->flags |= SCAMPER_TRACE_FLAG_DL;
    }

  return trace;

 err:
  if(trace != NULL) scamper_trace_free(trace);
  if(opts_out != NULL) scamper_options_free(opts_out);
  return NULL;
}

/*
 * scamper_do_trace_alloctask
 *
 * record the list and cycle with the trace, and then wrap the trace in a
 * task that uses the traceroute callback functions.  returns the task, or
 * NULL if the task could not be allocated.
 */
scamper_task_t *scamper_do_trace_alloctask(scamper_trace_t *trace,
                                           scamper_list_t *list,
                                           scamper_cycle_t *cycle)
{
  scamper_task_t *task;

  /* the trace keeps the list and cycle it belongs to */
  trace->list = scamper_list_use(list);
  trace->cycle = scamper_cycle_use(cycle);

  /* get a task structure to carry the trace */
  task = scamper_task_alloc(trace->dst, &trace_funcs);
  if(task == NULL)
    return NULL;

  task->data = trace;
  return task;
}

/*
 * scamper_do_trace_cleanup
 *
 * release the resources shared by all traces: the packet buffer, and the
 * socket opened by scamper_do_trace_init.
 */
void scamper_do_trace_cleanup()
{
  /* the packet buffer shared by all probes */
  if(pktbuf != NULL)
    {
      free(pktbuf);
      pktbuf = NULL;
    }

  /* the socket opened at initialisation time */
  if(if_sock != -1)
    {
      close(if_sock);
      if_sock = -1;
    }
}

/*
 * scamper_do_trace_init
 *
 * open the datagram socket kept in if_sock, and fill out the table of
 * callback functions used by traceroute tasks.  returns 0 on success, and
 * -1 if the socket could not be opened.
 */
int scamper_do_trace_init()
{
  if_sock = socket(AF_INET, SOCK_DGRAM, 0);
  if(if_sock == -1)
    return -1;

  /* the callbacks that drive a traceroute task */
  trace_funcs.probe          = do_trace_probe;
  trace_funcs.handle_icmp    = do_trace_handle_icmp;
  trace_funcs.handle_dl      = do_trace_handle_dl;
  trace_funcs.handle_rt      = do_trace_handle_rt;
  trace_funcs.handle_timeout = do_trace_handle_timeout;
  trace_funcs.write          = do_trace_write;
  trace_funcs.task_free      = do_trace_free;

  return 0;
}
