=head1 NAME
HTML::Scrubber - Perl extension for scrubbing/sanitizing html
=head1 SYNOPSIS
=for example begin
#!/usr/bin/perl -w
use HTML::Scrubber;
use strict;
#
my $html = q[
a => link
br =>
b => bold
u => UNDERLINE
];
#
my $scrubber = HTML::Scrubber->new( allow => [ qw[ p b i u hr br ] ] ); #
#
print $scrubber->scrub($html); #
#
$scrubber->deny( qw[ p b i u hr br ] ); #
#
print $scrubber->scrub($html); #
#
=for example end
=head1 DESCRIPTION
If you wanna "scrub" or "sanitize" html input
in a reliable an flexible fashion,
then this module is for you.
I wasn't satisfied with HTML::Sanitizer because it is
based on HTML::TreeBuilder,
so I thought I'd write something similar
that works directly with HTML::Parser.
=head1 METHODS
First a note on documentation: just study the L below.
It's all the documentation you could need
Also, be sure to read all the comments as well as
L.
If you're new to perl, good luck to you.
=cut
package HTML::Scrubber;
use HTML::Parser();
use HTML::Entities;
use vars qw[ $VERSION @_scrub @_scrub_fh ];
use strict;
$VERSION = '0.08';
# my my my my, these here to prevent foolishness like
# http://perlmonks.org/index.pl?node_id=251127#Stealing+Lexicals
(@_scrub )= ( \&_scrub, "self, event, tagname, attr, attrseq, text");
(@_scrub_fh )= ( \&_scrub_fh, "self, event, tagname, attr, attrseq, text");
sub new {
my $package = shift;
my $p = HTML::Parser->new(
api_version => 3,
default_h => \@_scrub,
marked_sections => 0,
strict_comment => 0,
unbroken_text => 1,
case_sensitive => 0,
boolean_attribute_value => undef,
);
my $self = {
_p => $p,
_rules => {
'*' => 0,
},
_comment => 0,
_process => 0,
_r => "",
_optimize => 1,
_script => 0,
_style => 0,
};
$p->{"\0_s"} = bless $self, $package;
return $self unless @_;
my(%args)= @_;
for my $f( qw[ default allow deny rules process comment ] ) {
next unless exists $args{$f};
if( ref $args{$f} ) {
$self->$f( @{ $args{$f} } ) ;
} else {
$self->$f( $args{$f} ) ;
}
}
return $self;
}
=head2 comment
warn "comments are ", $p->comment ? 'allowed' : 'not allowed';
$p->comment(0); # off by default
=cut
sub comment {
return
$_[0]->{_comment}
if @_ == 1;
$_[0]->{_comment} = $_[1];
return;
}
=head2 process
warn "process instructions are ", $p->process ? 'allowed' : 'not allowed';
$p->process(0); # off by default
=cut
sub process {
return
$_[0]->{_process}
if @_ == 1;
$_[0]->{_process} = $_[1];
return;
}
=head2 script
warn "script tags (and everything in between) are supressed"
if $p->script; # off by default
$p->script( 0 || 1 );
B<**> Please note that this is implemented
using HTML::Parser's ignore_elements function,
so if C