#!/usr/bin/perl
#
my $revision = '$Id: HTMLCleaner.pm,v 1.11 2002/02/15 16:59:07 bre Exp $';
my $version = 'Anomy 0.0.0 : sanitizer.pl';
#
##  Copyright (c) 2001-2002 Bjarni R. Einarsson. All rights reserved.
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 2 of the License, or
##  (at your option) any later version.
#
##############################################################################
#
# NOTE:  Sanitizer development is for the most part sponsored by
#        FRISK Software International, http://www.f-prot.com/.  Please
#        consider buying their anti-virus products to show your 
#        appreciation.
#
##############################################################################
#
# This a module implements a state machine for cleaning up HTML input in 
# email.
#
# Example:
#
#   my $cleaner = new Anomy::HTMLCleaner 
#   {
#       Paranoid => 1,             # default 0
#       NoWebBugs => 1,            # default 0
#       UnknownTagsOK => 1,        # default 0
#       MaxLeftoverSize => $size   # default 4096
#       ModCounter => \$counter,
#       DefangString => "defanged",
#       Log => $log,
#   };
#
#   $leftovers = $cleaner->clean(\$data);
#
# Notes:
#
#   If "UnknownTagsOK" is set to 1, then unrecognized tags will always be
#   passed through.  The default is to mangle all unknown tags, but only
#   after the parser is "reasonably sure" that it really is dealing with
#   HTML.
#
#
#
##[ Package definition ]######################################################

package Anomy::HTMLCleaner;
use strict;
 
BEGIN {
    use Exporter ();
    use vars         qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
    $VERSION         = do { my @r = (q$Revision: 1.11 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
    @ISA             = qw(Exporter);
    @EXPORT          = qw( );
    @EXPORT_OK       = qw($src_ban_std $src_ban_webbugs);
}

use vars @EXPORT_OK;


##[ Constants ]###############################################################

# Reasons for excluding tags...
my $r_unused = "Tag is obscure or obsolete, should never be seen in normal mail.";
my $r_style  = "Styles and layers give attackers many tools to fool the\n".
               "user and common browsers interpret Javascript code found\n".
	       "within style definitions.  References:\n".
               " - http://www.securityfocus.com/bid/630\n".
	       " - http://archives.indenial.com/hypermail/bugtraq/2001/January2001/0512.html";
my $r_active = "Scripting languages, embedded objects and other \"advanced\"\n".
               "features are the primary security risks in HTML. References:\n".
	       " - http://www.securityfocus.com/bid/3025\n".
	       " - http://www.securityfocus.com/bid/1474\n".
	       " - http://www.securityfocus.com/bid/962";
my $r_form   = "Forms invoke complex, interactive elements of the operating\n".
               "system which may be buggy.  In addition, carefully crafted\n".
	       "forms can be used to trick the user into performing attacks\n".
	       "on his own network (thus avoiding firewalls).  References:\n".
	       " - http://www.securityfocus.com/bid/606\n".
	       " - http://www.remote.org/jochen/sec/hfpa/";

# Misc. email security stuff ...
#
# Why defang known tags in plain text:
#   http://www.securityfocus.com/archive/1/11478903.1000316394992.JavaMail.imail@bernie.excite.com
#
# Vcard stuff:
#   http://www.securityfocus.com/bid/2459
#   http://www.securityfocus.com/bid/1633
#
# Address book vuln:
#   http://www.securityfocus.com/bid/2823
#
# Why we mangle MIME types all over the place:
#   http://www.whitehatsec.com/labs/advisories/WH-Security_Advisory-08232001.html


# Endings we don't want to see within CID: URLs.
my $executables = '(exe|com|cmd|bat|pif|scr|sys|sct|lnk|dll'.
                  '|vbs?|vbe|hta|shb|shs|hlp|chm|eml|wsf|wsh|js'.
		  '|mdb|mht|msi|msp|cpl|lib|reg)';

# Different ban-lists for auto-loaded content, depending on config.
#
my $src_ban_std      = '^(?i)([A-Za-z]*script|about|mailto|\/(dev|proc)|\\\\|file|smb'.
                       '|cid:.*\.'.$executables.'(@|\?|$))';
my $src_ban_webbugs  = '^(?i)([A-Za-z]*script|about|mailto|\/(dev|proc)|\\\\|file|smb|http|ftp'.
                       '|cid:.*\.'.$executables.'(@|\?|$))';
my $fonts            = '\"?(Arial|Comic( Sans( MS)?)?|Courier( New)?|Geneva|Helvetica|Lucida( Console)?|sans-?serif|Tahoma|Times New Roman|Verdana)\"?';
my $alignments       = "(absbottom|absmiddle|all|autocentre|baseline|bottom|center|justify|left|middle|none|right|texttop|top)";

# These are the rules used to check each attribute type for validity.
#
# Rules come in pairs, first a rule which must always match, then a rule
# which may never match.  If either is 0, then that check is skipped.
#
my $attribute_rules = 
{
    # Disallow unknown tags by default
    "_unknown"     => [ 0, '.*' ],
    "align"        => [ "^(?i)$alignments\$", 0 ],
    "alnum"        => [ '^[A-Za-z0-9_\.-]+$', 0 ],
    "anything"     => [ 0, 0 ],
    "boolean"      => [ '^(0|1|true|yes|no|false)$', 0 ],
    "charset"      => [ '^[A-Za-z0-9_][A-Za-z0-9_\.-]*$', 0 ],
    "color"        => [ '^(?i)\#?[0-9A-Z]+$', 0 ],
    "coords"       => [ '^(?i)(\d+,)+\d+$', 0 ],
    "datetime"     => [ '^\d\d\d\d-\d\d-\d\d.{0,5}\d\d:\d\d:\d\d.{0,5}$', 0 ],
    "dir"          => [ '^(?i)(ltr|rtl)$', 0 ],
    "font-face"    => [ "^(?i)(($fonts)[,\\s]*)+\$", 0 ],
    "frame"        => [ '^(?i)(void|above|below|hsides|vsides|lhs|rhs|box|border)$', 0 ],
    # href: Not javascript, vbs or vbscript
    "href"         => [ 0, '(?i)^[A-Za-z]*script' ],
    "integer"      => [ '^(-|\+)?\d+$', 0 ],
    # language: Not javascript, vbs or vbscript
    "language"     => [ '^(?i)(XML)$', 0 ], 
    "media"        => [ '^(?i)((screen|print|projection|braille|speech|all)[,\s]*)+$', 0 ],
    "meta:content" => [ 0, 0 ],
    "meta:name"    => [ '^(?i)(author|progid|originator|generator|keywords|description|content-type|pragma|expires)$', 0 ],
    # mime-type: Not javascript
    "mime-type"    => [ '^(?i)(cite|text/(plain|css|html|xml))$', 0 ],
    "list-type"    => [ '^(?i)(none,a,i,upper-alpha,lower-alpha,upper-roman,lower-roman,decimal,disc,square,circle,round)$', 0 ],
    "rel"          => [ '^(?i)((copyright|author|stylesheet)\s*)+$', 0 ],
    "rules"        => [ '^(?i)(none|groups|rows|cols|all)$', 0 ],
    "scope"        => [ '^(?i)(row|col|rowgroup|colgroup)$', 0 ],
    "shape"        => [ '^(?i)(rect|rectangle|circ|circle|poly|polygon)$', 0 ],
    # The following two are for URLs we expect to be auto-loaded by the browser,
    # because they are within a frame, image or something like that.
    "src"          => [ 0, $src_ban_std ],
    "style"        => [ "^(?i)(text-align\\s*:\\s*$alignments|((background|(background-|font-)?color)\\s*:\\s*\\#?[A-Z0-9]+|(tab-interval|height|width)\\s*:\\s*[\\d\\.]+(pt|px)|font(-family|-size|)\\s*:(\\s*[\\d\\.]+(pt|px)|\\s*$fonts)+)[;\\s]*)+\$", 0 ], 
    "size"         => [ '^(?i)\d+(px|\%)?$', 0 ],
    "target"       => [ '^[A-Za-z0-9_][A-Za-z0-9_\.-]*$', 0 ],
};
my $attribute_mods =
{
    "NoWebBugs" => {
        "src" => [ 0, $src_ban_webbugs ],
    },
    "Paranoid" => {
        "src"   => [ 0, ".*" ],
	"style" => [ 0, ".*" ],
    },
    "UnknownTagsOK" => {
        "_unknown" => [ 0, 0 ],
    },
};

# These attributes are valid within any tag (although the HTML spec may
# disagree!).
#
# Note: All event attributes omitted for security reasons.
# 
my $common_attributes =
{
    # Core attributes
    "class"     => "class",
    "id"        => "alnum",
    "name"      => "alnum",
    "style"     => "style",
    "accesskey" => "alnum",
    "tabindex"  => "integer",
    "title"     => "anything",
    # Language attributes
    "dir"       => "dir",
    "lang"      => "alnum",
    "language"  => "language",
    "longdesc"  => "anything",
    # Height, width, alignment, etc.
    "align"      => "align",
    "bgcolor"     => "color",
    "bottommargin" => "size",
    "clear"        => "align",
    "color"        => "color",
    "height"       => "size",
    "leftmargin"   => "size",
    "marginheight" => "size",
    "marginwidth"  => "size",
    "nowrap"       => "anything",
    "rightmargin"  => "size",
    "scroll"       => "boolean",
    "scrolling"    => "boolean",
    "topmargin"    => "size",
    "valign"      => "align",
    "width"      => "size",
};

my $list_attributes =
{
    "compact" => "anything",
    "start"   => "integer",
    "type"    => "list-type",
};
my $table_attributes =
{
    "axis"         => "alnum",
    "background"   => "src",
    "border"        => "integer",
    "bordercolor"    => "color",
    "bordercolordark" => "color",
    "bordercolorlight" => "color",
    "cellpadding"     => "integer",
    "cellspacing"    => "integer",
    "cols"          => "anything",
    "colspan"      => "integer",
    "char"         => "alnum",
    "charoff"      => "integer",
    "datapagesize" => "integer",
    "frame"        => "frame",
    "frameborder"  => "boolean",
    "framespacing" => "integer",
    "headers"      => "anything",
    "rows"         => "anything",
    "rowspan"      => "integer",
    "rules"        => "rules",
    "scope"        => "scope",
    "span"         => "integer",
    "summary"      => "anything"
};
my $frame_attributes =
{
    "border"       => "integer",
    "bordercolor"  => "color",
    "cols"         => "anything",
    "frame"        => "frame",
    "frameborder"  => "boolean",
    "framespacing" => "integer",
    "hspace"       => "integer",
    "marginheight" => "integer",
    "marginwidth"  => "integer",
    "noresize"     => "anything",
    "headers"      => "anything",
    "rows"         => "anything",
    "scrolling"    => "boolean",
    "src"          => "src",
    "vspace"       => "integer",
    "target"       => "anything",
};

# Tags we recognize, and the attributes we like.
#
# A tag with an value of 1 or an attribute hash is to be conditionally
# recognized - it is permitted but it's attributes are all compared to the 
# expected values and mangled if they don't match.
#
# A tag with a reference to a scalar "reason for avoiding" is Considered 
# Harmful and is /always/ mangled.
#
# Missing:  All FORM tags.
#           Object embedding tags.
#
my $known_tags = 
{
    "!Doctype" => 0,  # FIXME:  needs special support
    "html" => 100,
    #
    # Safe elements commonly found in the <head> block follow.
    #
    "head" => 2,
    "base" => 
    {
        "href"   => "src",
        "target" => "target",
    },
    "link" => 
    {
        "rel"     => "rel",
        "rev"     => "rel",
        "src"     => "src",
        "href"    => "src",       # Might be auto-loaded by the browser!!
        "charset" => "charset",
        "media"   => "media",
        "target"  => "target",
        "type"    => "mime-type",
    },
    "meta" =>
    {
        "_score"     => 2,
        "content"    => "meta:content",
        "http-equiv" => "meta:name",
        "name"       => "meta:name",
	"charset"    => "charset",
    },
    "title" => 2,
    #
    # Safe elements commonly found in the <body> block follow.
    #
    "body" => 
    {
        "_score"       => 2,
        "link"         => "color",
        "alink"        => "color",
        "vlink"        => "color",
        "background"   => "src",
        "nowrap"       => "boolean",
        "text"         => "color",
        "vlink"        => "color",
    },
    "a" =>
    {
        "charset"     => "charset",
        "coords"      => "coords",
        "href"        => "href",
        "shape"       => "shape", 
        "target"      => "target",
        "type"        => "mime-type",
    },
    "address" => 1,
    "area" =>
    {
        "alt"    => "anything",
        "coords" => "coords",
        "href"   => "href",
        "nohref" => "anything",
        "shape"  => "shape", 
        "target" => "target",
    },
    "applet" => \$r_active,
    "basefont" =>
    {
        "face"   => "font-face",
        "family" => "font-face",
        "size"   => "integer",
    },
    "bdo"     => 1,
    "bgsound" =>
    {
        "balance" => "integer",
        "delay"   => "integer",
        "loop"    => "alnum",
        "src"     => "src",
        "volume"  => "integer",
    },
    "blockquote" => 
    {
        "cite" => "href",
	"type" => "mime-type",
    },
    "br"      => 1,
    "button"  => \$r_form,
    "caption" => 1,
    "center"  => 1,
    "col"     => $table_attributes,
    "colgroup" => $table_attributes,
    "comment" => 1,
    "dd"      => 1,
    "del"     =>
    {
        "cite"     => "href",
        "datetime" => "datetime",
    },
    "dir"   => $list_attributes,
    "div"   => 1,
    "dl"    => $list_attributes,
    "dt"    => 1,
    "embed" => \$r_active,
    "fieldset" => \$r_form,
    "font" =>
    {
        "face"   => "font-face",
        "family" => "font-face",
        "back"   => "color",
        "size"   => "integer",
    },
    "form" => \$r_form,
    "hr" => 
    {
        "size"    => "integer",
        "noshade" => "anything",
    },
    "h1"     => 1,
    "h2"     => 1,
    "h3"     => 1,
    "h4"     => 1,
    "h5"     => 1,
    "h6"     => 1,
    "iframe" => $frame_attributes,
    "ilayer" => \$r_style,
    "img" =>
    {
        "alt"      => "anything",
        "border"   => "size",
        "dynsrc"   => "src",
        "hspace"   => "size",
        "ismap"    => "anything",
        "loop"     => "alnum",
        "lowsrc"   => "src",
        "src"      => "src",
        "start"    => "alnum",
        "usemap"   => "href",
        "vspace"   => "size",
    },
    "inlineinput" => \$r_unused,
    "ins" =>
    {
        "cite" => "href",
        "datetime" => "datetime",
    },
    "isindex" => \$r_unused,
    "keygen"  => \$r_unused,
    "label"   => \$r_form,
    "layer"   => \$r_style,
    "legend"  => \$r_form,
    "li" => {
        "value" => "integer",
    },
    "listing"  => \$r_unused,
    "map"      => 1,
    "marquee"  => \$r_unused,
    "menu"     => $list_attributes,
    "multicol" => \$r_form,
    "nextid"   => \$r_form,
    "nobr"     => \$r_unused,
    "noembed"  => 1,
    "nolayer"  => 1,
    "noscript" => 1,
    "noembed"  => 1,
    "object"   => \$r_active,
    "ol"       => $list_attributes,
    "optgroup" => \$r_form,
    "option"   => \$r_form,
    "p"        => 1,
    "param"    => \$r_active,
    "plaintext"=> \$r_unused,
    "pre"      => 1,
    "rt"       => \$r_unused,
    "ruby"     => \$r_unused,
    "script"   => \$r_active,
    "select"   => \$r_form,
    "spacer"   => \$r_unused,
    "span"     => 1,
    "spell"    => \$r_unused,
    "sound" => 
    {
        "delay" => "integer",
        "loop"  => "integer",
        "src"   => "src",
    },
    "style"  => \$r_style,
    "table"  => $table_attributes,
    "tbody"  => $table_attributes,
    "textarea" => \$r_form,
    "td"     => $table_attributes,
    "tfoot"  => $table_attributes,
    "th"     => $table_attributes,
    "thead"  => $table_attributes,
    "tr"     => $table_attributes,
    "ul"     => $list_attributes,
    "wbr"    => \$r_unused,
    "xml"    => \$r_active,
    "xmp"    => \$r_unused,
    "x-tab"  => 1,
    # Character formatting
    "abbr"   => 1,
    "acronym"=> 1,
    "big"    => 1, 
    "blink"  => \$r_unused, 
    "b"      => 1, 
    "cite"   => 1,
    "code"   => 1,
    "dfn"    => 1, 
    "em"     => 1,
    "i"      => 1, 
    "kbd"    => 1,
    "q"      => 1,
    "s"      => 1,
    "samp"   => 1,
    "small"  => 1, 
    "strike" => 1, 
    "strong" => 1, 
    "sub"    => 1, 
    "sup"    => 1, 
    "tt"     => 1, 
    "u"      => 1,
    "var"    => 1,
    #
    # Safe elements commonly found in the <frameset> block follow.
    #
    "frameset" => $frame_attributes, 
    "frame"    => $frame_attributes, 
    "noframes" => 1,
};


##[ Implementation ]##########################################################

use Anomy::Log;

sub new 
{
    my ($proto, $cfg) = @_;
    my $class = ref($proto) || $proto;
    my $tmp = undef;

    $cfg = { } unless ($cfg);
    my $self = {
        "conf"         => $cfg,
		"log"          => $cfg->{"Log"} || new Anomy::Log,
		"mod_count"    => $cfg->{"ModCounter"} || \$tmp,
		"msg_defanged" => $cfg->{"DefangString"} || "DEFANGED",
		"rules"        => { %{ $attribute_rules } },
		"explained"    => { },
		"html_count"   => 0,
		"style_count"  => 0,
		"style_index"  => 0,
		"styles"       => [ ],
    };
    bless ($self, $class);
    
    ${ $self->{"mod_count"} } = 0 unless (${ $self->{"mod_count"} });

    $cfg->{"MaxLeftoverSize"} = 4096 unless ($cfg->{"MaxLeftoverSize"});

    foreach my $mod ("NoWebBugs", "Paranoid")
    {
        if ($cfg->{$mod})
	{
	    foreach my $attr (keys(%{ $attribute_mods->{$mod} }))
	    {
	        $self->{"rules"}->{$attr} = $attribute_mods->{$mod}->{$attr};
	    }
	}
    }

    return $self;
}

# Helper function:  
#
#   Check attribute
#
sub cleanTag_2
{
    my ($self, $tag, $val, $s, $a, $e, $v) = @_;

    # Decode URL-encoded stuff which needn't be URL-encoded.
    $v =~  s/%((2[ef]|3[0-9a]|4[0-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a]))/
               chr(hex("0x$2"))
	    /gsei;

    my $rule = $common_attributes->{lc($a)};
    $rule = $val->{lc($a)} if ((ref($val) =~ /HASH/) && ($val->{lc($a)}));
    $rule = "_unknown" unless ($rule);

#   print STDERR "Attribute $a = $v, using rule $rule\n";

    my $match = $self->{"rules"}->{$rule};
    if ((($match->[0]) && ($v !~ $match->[0])) || 
        (($match->[1]) && ($v =~ $match->[1])))
    {
		$a =~ s/^($self->{"msg_defanged"}_)+//;
        $a = $self->{"msg_defanged"}."_".$a;
        ${ $self->{"mod_count"} }++;
    }
    $v = '"'.$v.'"' if ($v);

    return $s.$a.$e.$v;
}

# Helper function:  
#
#   Check the $known_tags list
#   Check all attributes
#
sub cleanTag_1
{
    my ($self, $tag, $attr, $close) = @_;

    if ($tag =~ /$self->{"msg_defanged"}_/i)
    {
        # Do nothing, already defanged!
    }
    elsif (my $val = $known_tags->{lc($tag)})
    {
        if (ref($val) =~ /SCALAR/i) 
	{
	    $tag = $self->{"msg_defanged"}."_".$tag;
	    $self->{"html_count"}++;

	    $self->{"log"}->entry("reason-html", SLOG_WARNING|SLOG_INFO, 
	                          { id => ${ $self->{"mod_count"} }, reason => $$val }, 
			          "Note: %reason%")
	      unless ($self->{"explained"}->{$$val});

            $self->{"explained"}->{$$val} = 1;
            ${ $self->{"mod_count"} }++;
	}
	else
	{
	    if (ref($val) =~ /HASH/i)
	    {
                $self->{"html_count"} += ($val->{"_score"} || 1);
	    }
	    else
	    {
                $self->{"html_count"} += $val;
	    }
	    $attr =~ s/(\s+)([a-z0-9_:-]+)(?:(\s*=(?:3D)?)(?:\"([^\"]+)\"|\'([^\']+)\'|([^\'\"\s=]\S*)))?(?=(?:\s|$))/
	                  $self->cleanTag_2($tag, $val, $1, $2, $3, $4.$5.$6);
                      /gise;
	}
    }
    elsif ((($self->{"html_count"} > 10) || ($self->{"conf"}->{"Paranoid"})) && 
           (!$self->{"conf"}->{"UnknownTagsOK"}))
    {
	    $tag = $self->{"msg_defanged"}."_".$tag;
        ${ $self->{"mod_count"} }++;
    }
    else
    {
	$self->{"html_count"}--; 
    }

    return "<".$tag.$attr.">";
}

# Helper function: 
#
#   Decode tag
#   Check the $known_tags list
#   Check all attributes
#   Log changes, if any
#
sub cleanTag
{
    my $self = shift;
    my $data = shift;

    # Don't defang email addresses or comments!
    return $data if ($data =~ /^<(\S+\@|!--)/);

    # Useful variables
    my $omod  = ${ $self->{"mod_count"} };
    my $odata = $data;
    my $log   = $self->{"log"};
    my $conf  = $self->{"conf"};

    # Unquote things that have no good reason to be quoted - potentially
    # dangerous obfuscated URLs, for the most part.

    # Decode entity-encoded stuff which needn't be encoded (decimal).
    $data =~ s/&#(4[6-9]|5[0-8]|6[4-9]|[78][0-9]|9[07-9]|1[0-1][0-9]|12[0-2]);?/
                 chr($1)
              /gsei;

    # Decode entity-encoded stuff which needn't be encoded (hex).
    $data =~ s/&#x(2[ef]|3[0-9a]|4[0-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a]);?/
                 chr(hex("0x$1"))
              /gsei;

    # Recognize end-tags.
    my $close = 0;
    $close = 1 if ($data =~ s/^<(\/)/</);

    # Go go gadget!
    $data =~ s/^<(\S+)(.*)>/ $self->cleanTag_1($1, $2, $close) /se;
    $data =~ s/^</<\// if ($close);
    
    # Undo changes if nothing dangerous was found.
    $data = $odata if (${ $self->{"mod_count"} } == $omod);
    

    # Logging
    if ($odata ne $data)
    {
        my $ndata = $data;

        $ndata =~ s/\s+/ /gs;
        $ndata =~ s/[<>]/_/gs;
        $odata =~ s/\s+/ /gs;
        $odata =~ s/[<>]/_/gs;
        $log->entry("rewrote-html", SLOG_WARNING|SLOG_INFO, 
                    { id => $omod, old => $odata, new => $ndata }, 
                    "Rewrote HTML tag: >>%old%<<\n".
                    "              as: >>%new%<<");
    }

    $self->{"style_count"} = 0;
    return $data;
} 


# Record this style tag for later restoration.
#
sub record_style
{
    my $self = shift;
	my $data = shift;
	my $cnt = $self->{style_index}++;
	$self->{styles}->[$cnt] = $data;
	return "<style_$cnt ";
}

# This routine sanitizes a snippet of HTML.
#
# If the tag is incomplete, it is returned as a "leftover" for later
# scanning, else the function returns undef.  The calling function is
# expected to detect overly long leftovers, and truncate them if 
# necessary.
#
# This is loosely based on John's HTML sanitization code.
#
sub clean
{
    my $self = shift;
    my $data_ptr = shift;
    
    my $conf = $self->{"conf"};
    my $log = $self->{"log"};
    my $leftovers = undef;

    # FIXME:
    #   IF we're pretty sure this is HTML, then we want to
    #   split long tags.  If not, we want to leave them alone.
    #
    #   Either way, we have to keep track of them ...

    # Special handling of <style>...</style> blocks.
    if (length($$data_ptr) < $self->{"conf"}->{"MaxLeftoverSize"}) 
    {
        $self->{"style_count"} += ($$data_ptr =~ s/(<style[^<>]*)>/ $self->record_style($1) /geis);
    }
    $self->{"style_count"} -= ($$data_ptr =~ s/<style_(\d+)( [^<>]+)(<\/style>)/$self->{styles}->[$1]$2>$3/gis);
    $self->{"style_count"} -= ($$data_ptr =~ s/<style_(\d+) ([^<>]+)</$self->{styles}->[$1]>$2</gis);

    # Check for trailing tags...
    if ($$data_ptr =~ s/(<(?:!--\s+)?[A-Za-z]+[^>]+)$//s)
    {
        $leftovers = $1;
        if (length($leftovers) > $self->{"conf"}->{"MaxLeftoverSize"}) 
        {
	    if (($self->{"style_count"} > 0) && 
	        # Fix this long evil tag we artificially spawned above...
	        (my $n = ($leftovers =~ s/<style_(\d+) ([^>]*)$/$self->{styles}->[$1]>$2/is)))
	    {
	        $self->{"style_count"} -= $n;
	    }
	    else
	    {	    
                my $lo = $leftovers;

                $lo =~ s/[<>]/_/gs;
                $lo =~ s/\s+/ /gs;

                $log->entry("split-html", SLOG_WARNING|SLOG_INFO,
                            { id => ${ $self->{"mod_count"} },
                              begin => substr($lo, 0, 25),
                              end => substr($lo, -25, -1) },
                            "Split really long tag (over 2k):\n".
                            "    >>%begin% ... %end%<<");

                $$data_ptr .= $leftovers . '>';
                $leftovers = "<". $self->{"msg_defanged"} .".".${ $self->{"mod_count"} }." ";
	        ${ $self->{"mod_count"} }++;
	    }
        }
    }

    # Defang untrusted HTML content
    $$data_ptr =~ s/(<[A-Za-z0-9\#\&\;\:\!_\/-]+(\s+[^>]*)?\/?>)/ $self->cleanTag($1) /gse;

    return $leftovers;
}

sub test
{
    my $hash = shift || { };
    $hash->{"Log"} = new Anomy::Log;
    my $cleaner = new Anomy::HTMLCleaner $hash;
    my $t;
    my $l;
    
    while ($t = $l.<STDIN>)
    {
        $l = $cleaner->clean(\$t);
	print $t;
    }

    print $hash->{"Log"}->print_as_text(), "\n";
}

1;
