=head1 NAME HTML::Scrubber - Perl extension for scrubbing/sanitizing html =head1 SYNOPSIS =for example begin #!/usr/bin/perl -w use HTML::Scrubber; use strict; # my $html = q[
a => link br =>
b => bold u => UNDERLINE ]; # my $scrubber = HTML::Scrubber->new( allow => [ qw[ p b i u hr br ] ] ); # # print $scrubber->scrub($html); # # $scrubber->deny( qw[ p b i u hr br ] ); # # print $scrubber->scrub($html); # # =for example end =head1 DESCRIPTION If you wanna "scrub" or "sanitize" html input in a reliable an flexible fashion, then this module is for you. I wasn't satisfied with HTML::Sanitizer because it is based on HTML::TreeBuilder, so I thought I'd write something similar that works directly with HTML::Parser. =head1 METHODS First a note on documentation: just study the L below. It's all the documentation you could need Also, be sure to read all the comments as well as L. If you're new to perl, good luck to you. =cut package HTML::Scrubber; use HTML::Parser(); use HTML::Entities; use vars qw[ $VERSION @_scrub @_scrub_fh ]; use strict; $VERSION = '0.08'; # my my my my, these here to prevent foolishness like # http://perlmonks.org/index.pl?node_id=251127#Stealing+Lexicals (@_scrub )= ( \&_scrub, "self, event, tagname, attr, attrseq, text"); (@_scrub_fh )= ( \&_scrub_fh, "self, event, tagname, attr, attrseq, text"); sub new { my $package = shift; my $p = HTML::Parser->new( api_version => 3, default_h => \@_scrub, marked_sections => 0, strict_comment => 0, unbroken_text => 1, case_sensitive => 0, boolean_attribute_value => undef, ); my $self = { _p => $p, _rules => { '*' => 0, }, _comment => 0, _process => 0, _r => "", _optimize => 1, _script => 0, _style => 0, }; $p->{"\0_s"} = bless $self, $package; return $self unless @_; my(%args)= @_; for my $f( qw[ default allow deny rules process comment ] ) { next unless exists $args{$f}; if( ref $args{$f} ) { $self->$f( @{ $args{$f} } ) ; } else { $self->$f( $args{$f} ) ; } } return $self; } =head2 comment warn "comments are ", $p->comment ? 'allowed' : 'not allowed'; $p->comment(0); # off by default =cut sub comment { return $_[0]->{_comment} if @_ == 1; $_[0]->{_comment} = $_[1]; return; } =head2 process warn "process instructions are ", $p->process ? 'allowed' : 'not allowed'; $p->process(0); # off by default =cut sub process { return $_[0]->{_process} if @_ == 1; $_[0]->{_process} = $_[1]; return; } =head2 script warn "script tags (and everything in between) are supressed" if $p->script; # off by default $p->script( 0 || 1 ); B<**> Please note that this is implemented using HTML::Parser's ignore_elements function, so if C