#!/usr/bin/perl -w
# spelling -- lintian check script

# Look for common spelling errors in the package description and the
# copyright file.

# Copyright (C) 1998 by Richard Braakman
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, you can find it on the World Wide
# Web at http://www.gnu.org/copyleft/gpl.html, or write to the Free
# Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.

# Todo:
#   - (done) if a word has only one alphabetic part, don't check it twice:
#     spelling_check skips its second lookup for such words.

# All spelling errors that have been observed "in the wild" in package
# descriptions are added here, on the grounds that if they occurred
# once they are more likely to occur again.

# Misspellings of "compatibility", "separate", and "similar" are 
# particularly common.

# Be careful with corrections that involve punctuation, since the check
# is a bit rough with punctuation.  For example, I had to delete the
# correction of "builtin" to "built-in".

# %corrections maps a misspelled word (key) to its suggested correction
# (value).  The qw() list is read as alternating key/value pairs, written
# one pair per line, so neither side may contain whitespace.  spelling_check
# lowercases every word before looking it up, so keys must be lowercase.
# Values are only printed, never looked up, and may contain punctuation
# (e.g. "don't").
%corrections = qw(
accesnt accent
additionaly additionally
adress address
adresses addresses
alegorical allegorical
algorith algorithm
alows allows
amoung among
analysator analyzer
artifical artificial
artillary artillery
automatize automate
automatized automated
automatizes automates
auxilliary auxiliary
availble available
avaliable available
backgroud background
baloons balloons
becomming becoming
challanges challenges
charachters characters
charcter character
colorfull colorful
comand command
commoditiy commodity
compability compatibility
compatability compatibility
compatable compatible
compatibiliy compatibility
compatibilty compatibility
compleatly completely
complient compliant
compres compress
contence contents
cryptocraphic cryptographic
deamon daemon
decompres decompress
definate definite
definately definitely
dependancy dependency
dependant dependent
developement development
dont don't
easilly easily
edditable editable
enchanced enhanced
encorporating incorporating
enviroiment environment
exprimental experimental
extention extension
failuer failure
familar familiar
fatser faster
fetaures features
fortan fortran
fuction function
functionnality functionality
futhermore furthermore
grahical graphical
guage gauge
halfs halves
hierachy hierarchy
hierarchie hierarchy
howver however
implemantation implementation
incomming incoming
incompatabilities incompatibilities
indended intended
independant independent
initalize initialize
intendet intended
jave java
langage language
langugage language
licenceing licencing
logile logfile
loggging logging
maintainance maintenance
maintainence maintenance
manoeuvering maneuvering
modulues modules
navagating navigating
nead need
neccesary necessary
neccessary necessary
necesary necessary
noticable noticeable
optionnal optional
pacakge package
particularily particularly
plattform platform
powerfull powerful
prepaired prepared
priorty priority
proccesors processors
proces process
processsing processing
progams programs
programers programmers
programms programs
promps prompts
prononciation pronunciation
pronouce pronounce
protcol protocol
protocoll protocol
recieved received
remoote remote
repectively respectively
replacments replacements
requiere require
runnning running
searchs searches
seperate separate
seperately separately
seperatly separately
serveral several
similiar similar
speach speech
standart standard
succesful successful
suppport support
synchonized synchronized
syncronize synchronize
syncronizing synchronizing
sythesis synthesis
useable usable
usefull useful
utillities utilities
utilties utilities
utiltity utility
variantions variations
varient variant
wierd weird
);

# Command line: <pkg> <type>.  Both values are kept in package globals
# because tag_error() interpolates them into every tag line it prints.
(@ARGV == 2) or fail("syntax: spelling <pkg> <type>");
$pkg = shift;
$type = shift;

# Files to check, paired with the tag reported for a misspelling found in
# them.  Paths are relative to the current directory.  Every file is
# optional: one that cannot be opened is silently skipped.
foreach my $check (
    [ "fields/description", "spelling-error-in-description" ],
    [ "copyright",          "spelling-error-in-copyright" ],
    [ "README.Debian",      "spelling-error-in-readme-debian" ],
) {
    my ($path, $tag) = @$check;

    open(my $fh, '<', $path) or next;
    # Read in the entire file at one gulp.  $/ is undefined only for the
    # duration of this read rather than for the rest of the program.
    my $text = do { local $/; <$fh> };
    close($fh);

    # Check defined(), because the slurping read has been observed to
    # return the undefined value if the file is length 0.
    spelling_check($tag, $text) if defined($text);
}

exit 0;

# -----------------------------------

# fail([$message])
#
# Print a one-line error report on STDERR and terminate the program with
# exit status 1.  The report uses the caller's message when a true one is
# given, otherwise the current system error ($!) if one is set, otherwise
# a bare "error.".
sub fail {
    my $report = $_[0] ? "error: $_[0]\n"
               : $!    ? "error: $!\n"
               :         "error.\n";
    print STDERR $report;
    exit 1;
}

# tag_error($tag, @details)
#
# Print one error tag line on the currently selected output handle, in the
# form "E: <pkg> <type>: <tag> [details...]".  <pkg> and <type> come from
# the package globals $pkg and $type.  Any extra arguments are joined with
# single spaces and appended after the tag.
sub tag_error {
    # Work on copies: @_ aliases the caller's variables, and the details
    # are edited below.
    my ($tag, @details) = @_;

    my $line = "E: $pkg $type: $tag";
    if (@details) {
        # We can't have newlines in a tag message, so turn every one of
        # them into a literal \n.
        s/\n/\\n/g for @details;
        $line .= ' ' . join(' ', @details);
    }
    print "$line\n";
}

# spelling_check($tag, @texts)
#
# Look for known misspellings in each of @texts and report every hit via
# tag_error($tag, $misspelling, $correction).  Each text is split into
# whitespace-separated words, which are lowercased and then looked up in
# %corrections in two ways:
#   1. each run of a-z characters inside the word, on its own;
#   2. if the word has at least two such runs, the word with its
#      non-alphabetic characters removed (so "seper-ate" is still caught).
# All working variables are lexical, so no package globals are touched.
sub spelling_check {
    my $tag = shift;
    foreach my $text (@_) {
        foreach my $token (split(' ', $text)) {
            my $word = lc $token;

            # Such "words" can contain punctuation, internal hyphens, etc.
            # First try splitting it into alphabetic parts.
            my $parts = 0;
            foreach my $part (split(/[^a-z]+/, $word)) {
                tag_error($tag, $part, $corrections{$part})
                    if (exists $corrections{$part});
                # Leading punctuation yields an empty first part; it is
                # not counted.
                $parts++ if ($part ne '');
            }

            # With fewer than two parts the lookup below would only repeat
            # the one already done above.
            next if ($parts < 2);

            # Then try deleting the non-alphabetic parts from the word.
            # Treat apostrophes specially: only delete them if they occur
            # at the beginning or end of the word.
            $word =~ s/(^')|[^a-z']+|('$)//g;
            if (exists $corrections{$word}) {
                tag_error($tag, $word, $corrections{$word});
            }
        }
    }
}
