/*
**	SWISH++
**	search.c
**
**	Copyright (C) 1998  Paul J. Lucas
**
**	This program is free software; you can redistribute it and/or modify
**	it under the terms of the GNU General Public License as published by
**	the Free Software Foundation; either version 2 of the License, or
**	(at your option) any later version.
** 
**	This program is distributed in the hope that it will be useful,
**	but WITHOUT ANY WARRANTY; without even the implied warranty of
**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**	GNU General Public License for more details.
** 
**	You should have received a copy of the GNU General Public License
**	along with this program; if not, write to the Free Software
**	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

// standard
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <map>
#include <string>
#include <strstream>
#include <vector>

// local
#include "config.h"
#include "fake_ansi.h"
#include "file_index.h"
#include "file_list.h"
#include "less.h"
#include "token.h"
#include "util.h"
#include "version.h"
#include "word_index.h"

// Variables maintained by getopt(); main() uses them while processing the
// command-line options.
extern "C" {
	extern char*	optarg;
	extern int	optind, opterr;
}

// Allow standard library names to be used unqualified unless the build
// defines PJL_NO_NAMESPACES.
#ifndef	PJL_NO_NAMESPACES
using namespace std;
#endif

// The outcome of a (sub)query: maps a file's index to its rank.
typedef map< int, int > result_type;

char const*	me;				// executable name
word_index	words;				// attached to the index file by main()
file_index	files;				// attached to the index file by main()

// The query parser (defined after main()) and the usage message.
bool		parse_primary( istream&, result_type& );
bool		parse_query( istream&, result_type& );
bool		parse_optional_relop( istream&, token::type& );
void		usage();

//*****************************************************************************
//
// SYNOPSIS
//
	struct sort_by_rank /* :
		binary_function<
			result_type::value_type const&,
			result_type::value_type const&,
			bool
		> */
//
// DESCRIPTION
//
//	A binary_function used to sort the search results by rank in
//	descending order (highest rank first).  Each result is a
//	(file index, rank) pair as stored in a result_type map; only the rank
//	(the pair's "second" member) takes part in the comparison, so results
//	having equal ranks compare as equivalent.
//
// RETURN VALUE
//
//	operator() returns true only if the rank of a is strictly greater
//	than the rank of b.
//
// BUGS
//
//	This struct should be derived from binary_function, but g++ 2.8 barfs
//	on it.  It must be a compiler bug.
//
//*****************************************************************************
{
	bool operator()(
		result_type::value_type const &a,
		result_type::value_type const &b
	) const {
		//
		// The comparison neither reads nor modifies any state of the
		// functor, hence "const": it can then also be invoked through
		// a const object or reference.
		//
		return a.second > b.second;
	}
};

//*****************************************************************************
//
// SYNOPSIS
//
	int main( int argc, char *argv[] )
//
// DESCRIPTION
//
//	Parse the command line, load the index file, and then either dump the
//	index (-d) or run the query formed from the remaining arguments and
//	print the matches, highest rank first, one per line: the rank scaled
//	so that the best match is 100, a space, and the files[] entry of the
//	matching file.
//
// PARAMETERS
//
//	argc	The number of arguments.
//
//	argv	A vector of the arguments; argv[argc] is null.  Aside from
//		the options below, the arguments form the query.
//
// RETURN VALUE
//
//	Returns 0 on success.  The process exits with status 1 on a usage
//	error (via usage()), 2 if the index file could not be read, and 3 if
//	the query is malformed.
//
// SEE ALSO
//
//	Stroustrup, Bjarne.  "The C++ Programming Language, 3rd ed."
//	Addison-Wesley, Reading, MA.  pp. 116-118.
//
//*****************************************************************************
{
	me = ::strrchr( argv[0], '/' );		// determine base name
	me = me ? me + 1 : argv[0];		// of executable

	/////////// Process command-line options //////////////////////////////

	bool dump_index = false;
	char const *index_file_name = "the.index";
	int max_results = Results_Max_Default;
	int skip_results = 0;

	::opterr = 1;
	for ( int opt; (opt = ::getopt( argc, argv, "di:m:s:V" )) != EOF; )
		switch ( opt ) {

			case 'd': // Dump index to standard out.
				dump_index = true;
				break;

			case 'i': // Specify index file overriding the default.
				index_file_name = ::optarg;
				break;

			case 'm': // Specify max. number of results.
				max_results = ::atoi( ::optarg );
				break;

			case 's': // Specify number of initial results to skip.
				skip_results = ::atoi( ::optarg );
				if ( skip_results < 0 )
					skip_results = 0;
				break;

			case 'V': // Display version and exit.
				cout << "SWISH++ " << version << endl;
				::exit( 0 );

			case '?': // Bad option.
				usage();	// does not return
		}

	// Step past the options; what remains is the query.
	argc -= ::optind, argv += ::optind;
	if ( !argc && !dump_index )
		usage();

	/////////// Load index file ///////////////////////////////////////////

	file_vector<char> the_index( index_file_name );
	if ( !the_index ) {
		cerr	<< me << ": could not read index from "
			<< index_file_name << endl;
		::exit( 2 );
	}
	words.set_index_file( the_index );
	files.set_index_file( the_index );

	if ( dump_index ) {
		// Print every word followed by the files it occurs in.
		for ( word_index::const_iterator
			word = words.begin(); word != words.end(); ++word
		) {
			cout << *word << '\n';
			file_list list( word );
			for ( file_list::const_iterator
				file = list.begin(); file != list.end(); ++file
			)
				cout << "  " << files[ file->index ] << '\n';
			cout << '\n';
		}
		return 0;
	}

	////////// Perform the query //////////////////////////////////////////

	//
	// Paste the rest of the command line together into a single query
	// string.  There is at least one argument left here: the no-argument
	// case either called usage() or was handled by the dump above.
	//
	string query = *argv++;
	while ( *argv ) {
		query += ' ';
		query += *argv++;
	}
	istrstream query_stream( query.c_str() );

	result_type results;
	if ( !parse_query( query_stream, results ) ) {
		cerr << me << ": malformed query" << endl;
		::exit( 3 );
	}

	////////// Print the results //////////////////////////////////////////

	//
	// skip_results was clamped to be non-negative above, so the conversion
	// to the unsigned size type is safe.  This test also catches an empty
	// result set, which guarantees v[0] exists below.
	//
	if ( static_cast< result_type::size_type >( skip_results ) >=
		results.size() )
		return 0;

	//
	// Copy the results to a vector to sort them by rank.  The elements
	// are pair<int,int> rather than result_type::value_type because the
	// latter has a const key and so can not be assigned, which sorting
	// requires.
	//
	typedef pair< int, int > ranked_file;
	vector< ranked_file > v;
	v.reserve( results.size() );
	::copy( results.begin(), results.end(), ::back_inserter( v ) );
	::sort( v.begin(), v.end(), sort_by_rank() );

	//
	// Scale the ranks so the highest becomes 100.  Guard against a highest
	// rank of zero so as not to divide by zero.
	//
	int const top_rank = v[0].second;		// highest rank
	double const normalize = top_rank ? 100.0 / top_rank : 0.0;

	for ( vector< ranked_file >::const_iterator
		i  = v.begin() + skip_results;
		i != v.end() && max_results-- > 0; ++i
	)
		cout	<< int( i->second * normalize )
			<< ' ' << files[ i->first ] << '\n';

	return 0;
}

//*****************************************************************************
//
// SYNOPSIS
//
	bool parse_query( istream &query, result_type &result )
//
// DESCRIPTION
//
//	Parse and evaluate a query via predictive top-down recursive descent.
//	The grammar for a query is:
//
//		query:		query optional_relop primary
//			|	primary
//
//		primary:	'(' query ')'
//			|	'not' primary
//			|	word
//			|	word*
//
//		optional_relop:	'and'
//			|	'or'
//			|	(empty)
//
//	Because the 'query' production is left-recursive, which a top-down
//	parser can not handle, it is rewritten to be right-recursive as:
//
//		query:		primary rest
//
//		rest:		optional_relop primary rest
//			|	(empty)
//
//	For "and", only files present on both sides are kept and their rank
//	becomes the (integer) average of the two ranks; for "or", the ranks
//	of the right-hand side are added into the accumulated result.
//
// PARAMETERS
//
//	query	The istream from which the query string is extracted.
//
//	result	Where the result of performing the (sub)query is deposited.
//
// RETURN VALUE
//
//	Returns false if the leading primary could not be parsed and true
//	otherwise.  Parsing of the "rest" simply stops at the first operator
//	or primary that can not be parsed; that does not cause a false return.
//
// SEE ALSO
//
//	Alfred V. Aho, Ravi Sethi, Jeffrey D. Ullman.  "Compilers:
//	Principles, Techniques, and Tools," Addison-Wesley, Reading, MA,
//	1986, pp. 44-48.
//
//*****************************************************************************
{
	// Every query starts with a primary.
	if ( !parse_primary( query, result ) )
		return false;

	//
	// What follows is "parse_rest" folded into parse_query, since in the
	// grammar a "rest" comes after every "primary".
	//
	token::type op;
	for ( ;; ) {
		if ( !parse_optional_relop( query, op ) )
			break;

		result_type rhs;		// result of the right-hand primary
		if ( !parse_primary( query, rhs ) )
			break;

		if ( op == token::and_token ) {
#			ifdef DEBUG_parse_query
			cerr << "-----> performing and" << endl;
#			endif
			//
			// Intersection: keep the files found on both sides,
			// averaging their two ranks.
			//
			result_type both;
			result_type::const_iterator r = rhs.begin();
			while ( r != rhs.end() ) {
				result_type::const_iterator const l =
					result.find( r->first );
				if ( l != result.end() )
					both[ r->first ] =
						(l->second + r->second) / 2;
				++r;
			}
			result.swap( both );
		}
		else if ( op == token::or_token ) {
#			ifdef DEBUG_parse_query
			cerr << "-----> performing or" << endl;
#			endif
			//
			// Union: add the right-hand ranks into the result.
			//
			result_type::const_iterator r = rhs.begin();
			while ( r != rhs.end() ) {
				result[ r->first ] += r->second;
				++r;
			}
		}
	}
	return true;
}

//*****************************************************************************
//
// SYNOPSIS
//
	bool parse_optional_relop( istream &query, token::type &relop )
//
// DESCRIPTION
//
//	Parse an optional relational operator of either "and" or "or" from
//	the given istream.  In the absence of a relational operator, "and" is
//	implied and the token that was read instead is put back.
//
// PARAMETERS
//
//	query	The istream from which the relational operator string is
//		extracted (if present).
//
//	relop	Where the type of the relational operator is deposited.  It
//		is left unmodified when false is returned.
//
// RETURN VALUE
//
//	Returns true unless no token at all could be extracted.
//
//*****************************************************************************
{
	token t( query );

	// Nothing more to read: there is no operator, explicit or implied.
	if ( t == token::no_token )
		return false;

	// An operator that was actually spelled out in the query.
	if ( t == token::and_token || t == token::or_token ) {
#		ifdef DEBUG_parse_query
		cerr	<< "-----> relop \""
			<< ( t == token::and_token ? "and" : "or" )
			<< '"' << endl;
#		endif
		relop = t;
		return true;
	}

	//
	// Any other token is not an operator: treat it as an implicit "and"
	// and hand the token back to whoever parses next.
	//
#	ifdef DEBUG_parse_query
	cerr << "-----> relop \"and\" (implicit)" << endl;
#	endif
	relop = token::and_token;
	t.put_back();
	return true;
}

//*****************************************************************************
//
// SYNOPSIS
//
	bool parse_primary( istream &query, result_type &result )
//
// DESCRIPTION
//
//	Parse a primary from the given istream and evaluate it.  A primary is
//	a word, a wildcard word ("word*"), a parenthesized query, or "not"
//	followed by a primary.
//
// PARAMETERS
//
//	query	The istream from which the primary is extracted.
//
//	result	Where the result of performing the (sub)query is deposited:
//		for words, the ranks found are added to the entries keyed by
//		file index; for "not", every file absent from the negated
//		primary's result is given a rank of 100.
//
// RETURN VALUE
//
//	Returns true only if a primary was successfully parsed.  A word that
//	parses but is not in the index still yields true, with result left
//	untouched.
//
//*****************************************************************************
{
	pair< word_index::const_iterator, word_index::const_iterator > found;
	token t( query );

	switch ( t ) {

		case token::word_token: {
			less< char const* > comparator;
			found = ::equal_range( words.begin(), words.end(),
				t.str(), comparator
			);
			if ( found.first == words.end() ||
				comparator( t.str(), *found.first ) ) {
				//
				// The following "return true" indicates that a
				// word was parsed successfully, not that we
				// found the word.
				//
				return true;
			}
			break;
		}

		case token::word_star_token: {
			less_n< char const* > comparator( t.length() );
			found = ::equal_range( words.begin(), words.end(),
				t.str(), comparator
			);
			if ( found.first == words.end() ||
				comparator( t.str(), *found.first ) ) {
				//
				// The following "return true" indicates that a
				// word was parsed successfully, not that we
				// found the word.
				//
				return true;
			}
			break;
		}

		case token::lparen_token:
#			ifdef DEBUG_parse_query
			cerr << "-----> '('" << endl;
#			endif
			if ( !parse_query( query, result ) )
				return false;
			//
			// The sub-query must be followed by its closing ')'.
			//
			query >> t;
#			ifdef DEBUG_parse_query
			if ( t == token::rparen_token )
				cerr << "-----> ')'" << endl;
#			endif
			return t == token::rparen_token;

		case token::rparen_token:
			//
			// A ')' can not start a primary.  Put it back rather
			// than swallowing it so that, once the enclosing
			// parse_query() gives up, the '(' case above still
			// finds it as the next token.  (This relies on
			// put_back() making the token available to the next
			// extraction, as parse_optional_relop() already does.)
			//
			t.put_back();
			return false;

		case token::not_token: {
#			ifdef DEBUG_parse_query
			cerr << "-----> begin not" << endl;
#			endif
			result_type temp;
			if ( !parse_primary( query, temp ) )
				return false;
#			ifdef DEBUG_parse_query
			cerr << "-----> end not" << endl;
#			endif

			//
			// Complement: every file that the negated primary did
			// not match is a match, with a fixed rank of 100.
			//
			for ( int i = 0; i < files.size(); ++i )
				if ( temp.find( i ) == temp.end() )
					result[ i ] = 100;
			return true;
		}

		default:
			return false;
	}

	//
	// Found a word or set of words matching a wildcard: iterate over all
	// files the word(s) is/are in and add their ranks together.
	//
#	ifdef DEBUG_parse_query
	cerr << "-----> word \"" << t.str() << "\"" << endl;
#	endif
	while ( found.first != found.second ) {
		file_list list( found.first++ );
		for ( file_list::const_iterator
			file = list.begin(); file != list.end(); ++file
		)
			result[ file->index ] += file->rank;
	}

	return true;
}

//*****************************************************************************
//
//	Miscellaneous function(s)
//
//*****************************************************************************

// Print the command-line synopsis and the list of options to standard error,
// then terminate the process with exit status 1.  Does not return.
void usage() {
	// One entry per option, in the order in which they are shown.
	static char const *const option_help[] = {
		"  -d              : Dump index to standard out and exit\n",
		"  -i index_file   : Name of index file to use\n",
		"  -m max_results  : Maximum number of results\n",
		"  -s skip_results : Number of initial results to skip\n",
		"  -V              : Print version number and exit\n"
	};
	unsigned const n_options = sizeof option_help / sizeof option_help[0];

	cerr << "usage: " << me << " [options] query\n";
	cerr << " options:\n";
	cerr << " --------\n";
	for ( unsigned k = 0; k < n_options; ++k )
		cerr << option_help[ k ];

	::exit( 1 );
}
