=pod
=head1 NAME
HTML::HiLiter - highlight words in an HTML document just like a felt-tip HiLiter
=head1 VERSION
0.13
=cut
# TODO: would HTML::Tree be faster?
# or even XML::LibXML since we assume libxml2 for swish-e?
# perhaps experiment with both and toggle on the fly?
package HTML::HiLiter;
use 5.006001;
use strict;
use sigtrap qw(die normal-signals error-signals);
use vars qw(
$VERSION $BegChar $EndChar $WordChar $White_Space $HiTag $HiClass
$CSS_Class $hrefs $buffer $Debug $Delim $color $nocolor
$OC $CC %entity2char %codeunis %unicodes %char2entity $ISO_ext
@whitesp $SkipTag
);
my $ticker;
eval { require Pubs::Times; };
unless ($@) {
$ticker = 1;
Pubs::Times::tick('start ok');
}
#$ticker = 0;
$VERSION = '0.13';
$OC = "\n\n";
$SkipTag = '';
# ISO 8859 Latin1 encodings
# remove dependency on HTML::Entities by copying them all here
# we don't use the functions in HTML::Entities and it does a require HTML::Parser anyway
%entity2char = (
# Some normal chars that have special meaning in SGML context
amp => '&', # ampersand
'gt' => '>', # greater than
'lt' => '<', # less than
quot => '"', # double quote
apos => "'", # single quote
# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
AElig => 'Æ', # capital AE diphthong (ligature)
Aacute => 'Á', # capital A, acute accent
Acirc => 'Â', # capital A, circumflex accent
Agrave => 'À', # capital A, grave accent
Aring => 'Å', # capital A, ring
Atilde => 'Ã', # capital A, tilde
Auml => 'Ä', # capital A, dieresis or umlaut mark
Ccedil => 'Ç', # capital C, cedilla
ETH => 'Ð', # capital Eth, Icelandic
Eacute => 'É', # capital E, acute accent
Ecirc => 'Ê', # capital E, circumflex accent
Egrave => 'È', # capital E, grave accent
Euml => 'Ë', # capital E, dieresis or umlaut mark
Iacute => 'Í', # capital I, acute accent
Icirc => 'Î', # capital I, circumflex accent
Igrave => 'Ì', # capital I, grave accent
Iuml => 'Ï', # capital I, dieresis or umlaut mark
Ntilde => 'Ñ', # capital N, tilde
Oacute => 'Ó', # capital O, acute accent
Ocirc => 'Ô', # capital O, circumflex accent
Ograve => 'Ò', # capital O, grave accent
Oslash => 'Ø', # capital O, slash
Otilde => 'Õ', # capital O, tilde
Ouml => 'Ö', # capital O, dieresis or umlaut mark
THORN => 'Þ', # capital THORN, Icelandic
Uacute => 'Ú', # capital U, acute accent
Ucirc => 'Û', # capital U, circumflex accent
Ugrave => 'Ù', # capital U, grave accent
Uuml => 'Ü', # capital U, dieresis or umlaut mark
Yacute => 'Ý', # capital Y, acute accent
aacute => 'á', # small a, acute accent
acirc => 'â', # small a, circumflex accent
aelig => 'æ', # small ae diphthong (ligature)
agrave => 'à', # small a, grave accent
aring => 'å', # small a, ring
atilde => 'ã', # small a, tilde
auml => 'ä', # small a, dieresis or umlaut mark
ccedil => 'ç', # small c, cedilla
eacute => 'é', # small e, acute accent
ecirc => 'ê', # small e, circumflex accent
egrave => 'è', # small e, grave accent
eth => 'ð', # small eth, Icelandic
euml => 'ë', # small e, dieresis or umlaut mark
iacute => 'í', # small i, acute accent
icirc => 'î', # small i, circumflex accent
igrave => 'ì', # small i, grave accent
iuml => 'ï', # small i, dieresis or umlaut mark
ntilde => 'ñ', # small n, tilde
oacute => 'ó', # small o, acute accent
ocirc => 'ô', # small o, circumflex accent
ograve => 'ò', # small o, grave accent
oslash => 'ø', # small o, slash
otilde => 'õ', # small o, tilde
ouml => 'ö', # small o, dieresis or umlaut mark
szlig => 'ß', # small sharp s, German (sz ligature)
thorn => 'þ', # small thorn, Icelandic
uacute => 'ú', # small u, acute accent
ucirc => 'û', # small u, circumflex accent
ugrave => 'ù', # small u, grave accent
uuml => 'ü', # small u, dieresis or umlaut mark
yacute => 'ý', # small y, acute accent
yuml => 'ÿ', # small y, dieresis or umlaut mark
# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
copy => '©', # copyright sign
reg => '®', # registered sign
nbsp => "\240", # non breaking space
# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
iexcl => '¡',
cent => '¢',
pound => '£',
curren => '¤',
yen => '¥',
brvbar => '¦',
sect => '§',
uml => '¨',
ordf => 'ª',
laquo => '«',
'not' => '¬', # not is a keyword in perl
shy => '',
macr => '¯',
deg => '°',
plusmn => '±',
sup1 => '¹',
sup2 => '²',
sup3 => '³',
acute => '´',
micro => 'µ',
para => '¶',
middot => '·',
cedil => '¸',
ordm => 'º',
raquo => '»',
frac14 => '¼',
frac12 => '½',
frac34 => '¾',
iquest => '¿',
'times' => '×', # times is a keyword in perl
divide => '÷'
);
while (my($entity, $char) = each(%entity2char)) {
$char2entity{$char} = "&$entity;";
}
delete $char2entity{"'"}; # only one-way decoding
# Fill in missing entities
for (0 .. 255) {
next if exists $char2entity{chr($_)};
$char2entity{chr($_)} = "$_;";
}
########## end copy from HTML::Entities
# a subset of chars per SWISH
$ISO_ext = 'ªµºÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ';
######################################################################################
# http://www.pemberley.com/janeinfo/latin1.html
# The CP1252 characters that are not part of ANSI/ISO 8859-1, and that should therefore
# always be encoded as Unicode characters greater than 255, are the following:
# Windows Unicode Char.
# char. HTML code test Description of Character
# ----- ----- --- ------------------------
#ALT-0130 ‚ â Single Low-9 Quotation Mark
#ALT-0131 ƒ Ä Latin Small Letter F With Hook
#ALT-0132 „ ã Double Low-9 Quotation Mark
#ALT-0133 … É Horizontal Ellipsis
#ALT-0134 † Dagger
#ALT-0135 ‡ à Double Dagger
#ALT-0136 ˆ ö Modifier Letter Circumflex Accent
#ALT-0137 ‰ ä Per Mille Sign
#ALT-0138 Š ? Latin Capital Letter S With Caron
#ALT-0139 ‹ Ü Single Left-Pointing Angle Quotation Mark
#ALT-0140 Œ Î Latin Capital Ligature OE
#ALT-0145 ‘ Ô Left Single Quotation Mark
#ALT-0146 ’ Õ Right Single Quotation Mark
#ALT-0147 “ Ò Left Double Quotation Mark
#ALT-0148 ” Ó Right Double Quotation Mark
#ALT-0149 • ¥ Bullet
#ALT-0150 – Ð En Dash
#ALT-0151 — Ñ Em Dash
#ALT-0152 ˜ ÷ Small Tilde
#ALT-0153 ™ ª Trade Mark Sign
#ALT-0154 š ? Latin Small Letter S With Caron
#ALT-0155 › Ý Single Right-Pointing Angle Quotation Mark
#ALT-0156 œ Ï Latin Small Ligature OE
#ALT-0159 Ÿ Ù Latin Capital Letter Y With Diaeresis
#
#######################################################################################
# NOTE that all the Char tests will likely fail above unless your terminal/editor
# supports Unicode
# browsers should support these numbers, and in order for perl < 5.8 to work correctly,
# we add the most common if missing
%unicodes = (
8218 => "'",
402 => 'f',
8222 => '"',
8230 => '...',
8224 => 't',
8225 => 't',
8216 => "'",
8217 => "'",
8220 => '"',
8221 => '"',
8226 => '*',
8211 => '-',
8212 => '-',
732 => '~',
8482 => '(TM)',
376 => 'Y',
352 => 'S',
353 => 's',
8250 => '>',
8249 => '<',
710 => '^',
338 => 'OE',
339 => 'oe',
);
for (keys %unicodes) {
# quotemeta required since build_regexp will look for the \
my $ascii = quotemeta($unicodes{$_});
next if length $ascii > 2;
#warn "pushing $_ into $ascii\n";
push(@{ $codeunis{$ascii} }, $_);
}
################################################################################
$WordChar = '\w' . $ISO_ext . './-';
$BegChar = '\w' . $ISO_ext . './-';
$EndChar = '\w' . $ISO_ext;
# regexp for what constitutes whitespace in an HTML doc
# it's not as simple as \s| so we define it separately
# NOTE that the pound sign # seems to need escaping, though that seems like a perl bug to me.
# Mon Sep 20 11:34:04 CDT 2004
@whitesp = (
'&\#0020;',
'&\#0009;',
'&\#000C;',
'&\#200B;',
'&\#2028;',
'&\#2029;',
' ',
'&\#32;',
'&\#160;',
'\s',
'\xa0',
'\x20',
);
$White_Space = join('|', @whitesp);
$HiTag = 'span'; # what tag to use to hilite
$HiClass = undef; # what class to use (none by default)
$CSS_Class = 'hilite';
$buffer = ''; # init the buffer
$hrefs = []; # init the href buffer (for Links option)
$Delim = '"'; # phrase delimiter
$Debug = 0; # set to 0 to turn off debugging comments
# if we're running via terminal (usually for testing)
# and the Term::ANSIColor module is installed
# use that for debugging -- easier on the eyes...
# if we were really clever, we might rotate colors ala span class
$color = '';
$nocolor = '';
if ( $ENV{HTML_HILITER_TERMINAL} ) {
eval { require Term::ANSIColor };
unless ($@) {
$color = Term::ANSIColor::color('bold blue');
$nocolor = Term::ANSIColor::color('reset');
}
}
my $HiLiting = 0; # flag initially OFF, then turned ON
# whenever we pass out of
my $tag_regexp = '(?s-mx:<[^>]+>)'; # this might miss comments inside tags
# or CDATA attributes
# unless HTML::Parser is used
my %common_char = (
'>' => '>',
'<' => '<',
'&' => '&',
#'\xa0' => ' ', # this is ok asis
'"' => '"',
#"\xa0" => ' '
);
sub new
{
my $package = shift;
my $self = {};
bless($self, $package);
$self->_init(@_);
return $self;
}
sub _swish_new
{
# takes a SWISH::API object and
# uses the SWISH methods to set WordChar, etc.
my $self = shift;
my $swish_obj = $self->{SWISHE} || $self->{SWISH};
# use standard name internally, for calling SWISH:: methods like Stemming
$self->{swishobj} = $swish_obj;
my @head_names = $swish_obj->HeaderNames;
my @indexes = $swish_obj->IndexNames;
# just use the first index, assuming user
# won't pass more than one with different Header values
my $index = shift @indexes;
$self->{swishindex} = $index;
for my $h (@head_names) {
my @v = $swish_obj->HeaderValue( $index, $h );
$self->{$h} = scalar @v > 1
? [ @v ]
: $v[0];
$self->{$h} = quotemeta( $v[0] || '' ) if $h =~/char/i;
}
# set stemmer flag if it was used in the index
$self->{stemmer} = $self->{'Stemming Applied'};
}
sub _init
{
my $self = shift;
$self->{'start'} = time;
my %extra = @_;
@$self{keys %extra} = values %extra;
$Debug = $self->{debug} if $self->{debug};
# special handling for swish flag
# allow common naming mistake :)
_swish_new( $self ) if $self->{SWISHE} or $self->{SWISH};
# default values for object
$self->{WordCharacters} ||= $WordChar;
$self->{EndCharacters} ||= $EndChar;
$self->{BeginCharacters} ||= $BegChar;
# a search for a '<' or '>' should still highlight,
# since < or > can be indexed as literal < and >, at least by SWISH-E
for (qw(WordCharacters EndCharacters BeginCharacters)) {
$self->{$_} =~ s,[<>&],,g;
# escape some special chars in a class []
#$self->{$_} =~ s/([.-])/\\$1/g;
}
# what's the boundary between a word and a not-word?
# by default:
# the beginning of a string
# the end of a string
# whatever we've defined as White_Space
# any character that is not a WordChar
#
# the \A and \Z (beginning and end) should help if the word butts up
# against the beginning or end of a tagset
# like Word or Word
$self->{StartBound} ||= join('|',
'\A',
'[>]',
'(?:&[\w\#]+;)', # because a ; might be a legitimate wordchar
# and we treat a char entity like a single char.
# if &char; resolves to a legit wordchar
# this might give unexpected results.
# NOTE that etc is in $White_Space
$White_Space,
'[^' . $self->{BeginCharacters} . ']'
);
$self->{EndBound} ||= join('|',
'\Z',
'[<&]',
$White_Space,
'[^' . $self->{EndCharacters} . ']'
);
# the whitespace in a query phrase might be:
# any ignorelastchar, followed by
# one or more nonwordchar or whitespace, followed by
# any ignorefirstchar
# define for both text and html
my $igf = $self->{IgnoreFirstChar} ? qr/[$self->{IgnoreFirstChar}]*/i : '';
my $igl = $self->{IgnoreLastChar} ? qr/[$self->{IgnoreLastChar}]*/i : '';
$self->{textPhraseBound} = join '',
$igl,
qr/[\s\x20]|[^$self->{WordCharacters}]/is,
'+',
$igf;
$self->{HTMLPhraseBound} = join '',
$igl,
qr/$White_Space|[^$self->{WordCharacters}]/is,
'+',
$igf;
$self->{HiTag} ||= $HiTag;
$self->{HiClass} = $HiClass unless( defined $self->{HiClass} );
$self->{Colors} ||= [ '#FFFF99', '#99FFFF', '#ffccff', '#ccccff' ];
$self->{Links} ||= 0; # off by default
$self->{BufferLim} ||= 100000; # eval'ing enormous buffers can cause
# huge bottlenecks. if buffer length
# exceeds BufferLim, it will not be highlighted
$self->{Force} ||= undef; # wrap Inline HTML with tagset
# to force HTML interpolation by HTML::Parser
# load the parser unless explicitly asked not to
unless ( defined($self->{Parser}) && $self->{Parser} == 0)
{
$self->{Parser}++;
require HTML::Parser;
require HTML::Tagset;
# HTML::Tagset::isHeadElement doesn't define these,
# so we add them here
$HTML::Tagset::isHeadElement{'head'}++;
$HTML::Tagset::isHeadElement{'html'}++;
}
unless ( defined($self->{Print}) && $self->{Print} == 0)
{
$self->{Print} = 1;
}
$self->{TagFilter} ||= sub {};
$self->{TextFilter} ||= sub {};
$self->{noplain} ||= 0; # allow for plaintext() as optimization
}
sub _escape
{
my $C = join '', keys %common_char;
$_[0] =~ s/([$C])/$common_char{$1}/og;
1;
}
sub _mytag
{
my ($parser,$tag,$tagname,$offset,$length,$offset_end,$attr,$text) = @_;
my $hiliter = $parser->{HiLiter};
# $tag has ! for declarations and / for endtags
# $tagname is just bare tagname
if ($Debug >= 3) {
print $OC;
print "\n". '=' x 20 . "\n";
print "Tag is :$tag:\n";
print "TagName is :$tagname:\n";
print "Offset is $offset\n";
print "Length is $length\n";
print "Offset_end is $offset_end\n";
print "Text is $text\n";
print "Attr is $_ = $attr->{$_}\n" for keys %$attr;
print "SkipTag is :$SkipTag:\n";
print $CC;
}
if ( $attr->{nohiliter} and $tag !~ m!^/! ) {
# we want to not highlight this tag's contents
$SkipTag = $tagname;
#warn "skipping <$tag> with nohiliter\n";
} elsif ( $SkipTag eq $tagname and $tag =~ m!^/! ) {
# should be endtag
$SkipTag = '';
}
# if we encounter an inline tag, add it to the buffer
# for later evaluation
# PhraseMarkup is closest to libxml2 'inline' definition
if ( $HTML::Tagset::isPhraseMarkup{$tagname} )
{
my $tagfilter = $hiliter->{TagFilter};
my $reassemble = &$tagfilter( @_ ) || $text;
print "${OC} adding :$reassemble: to buffer ${CC}" if $Debug >= 3;
$buffer .= $reassemble; # add to the buffer for later evaluation
# as a potential match
# for Links option
if ($hiliter->{Links} and exists($attr->{'href'})) {
push(@$hrefs, $attr->{'href'});
}
#warn "INLINEBUFFER:$buffer:INLINEBUFFER";
return;
}
else
{
if ($Debug >= 3) {
Pubs::Times::tick('start buffer eval') if $ticker;
}
# if we have a BufferLim defined and the current $buffer
# length exceeds that limit, deal with it immediately
# and don't highlight
if ($hiliter->{BufferLim} and
length($buffer) > $hiliter->{BufferLim})
{
if ($hiliter->{Print}) {
print $buffer;
} else {
$hiliter->{Buffer} .= $buffer;
}
} else {
# otherwise, call the hiliter on $buffer
# this is the main event
my $hilited = $hiliter->hilite( $buffer, $hrefs );
# remove any NULL markers we inserted to skip hiliting
$hilited =~ s/\000//g;
if ($hiliter->{Print}) {
print $hilited;
} else {
$hiliter->{Buffer} .= $hilited;
}
}
if ($Debug >= 3) {
Pubs::Times::tick('end buffer eval') if $ticker;
}
$buffer = '';
$hiliter->{dtext} = '';
$hrefs = [];
}
# turn HiLiting ON if we are not inside the
tagset.
# this prevents us from hiliting a for example.
unless ( $HTML::Tagset::isHeadElement{$tagname} )
{
$HiLiting = 1;
}
# use reassemble to futz with attribute values or tagnames
# before printing them.
# otherwise, default to what we have in original HTML
#
# NOTE: this is where we could change HREF values, for example
my $tagfilter = $hiliter->{TagFilter};
my $reassemble = &$tagfilter( @_ ) || $text;
if ($hiliter->{Print}) {
print $reassemble;
} else {
$hiliter->{Buffer} .= $reassemble;
}
# if this is the opening tag,
# add the \n";
$self->{StyleHead} = $tagset;
1;
}
sub _make_styles_inline
{
# create hash for adding style attribute inline
# each query gets assigned a color
my $self = shift;
my $queries = $self->{query_array};
my $num = 0;
my @colors = @{ $self->{Colors} };
my $tag = $self->{HiTag};
for (@$queries) {
my $s = '';
# if we are using HiClass
if( defined $self->{HiClass} ) {
# this allows for having an empty, but defined HiClass
$s = "class='$self->{HiClass}'" if( $self->{HiClass} );
# else, use the color
} else {
$s = "style='background:" . $colors[$num++] . "'";
}
$self->{OTags}->{$_} = $s ? "<$tag $s>" : "<$tag>";
$self->{CTags}->{$_} = "$tag>"; # this is always the same; should we bother?
$num = 0 if $num > $#colors; # start over if we exceed
# total number of colors
}
1;
}
sub _get_real_html
{
# this could be a bottleneck if buffer is really large
# so use $self->{BufferLim} to avoid that.
# or can the s//eval{}/ approach be improved upon??
if ($Debug >= 3) {
Pubs::Times::tick('start get real html') if $ticker;
}
my ($html,$re) = @_;
my $uniq = {};
# $1 should be st_bound, $2 should be query, $3 should be end_bound
#warn "looking for '$re' in '$html'\n";
while( $html =~ m/$re/g ) {
# print $OC,
# "\$1 is '$1'\n",
# "\$2 is '$2'\n",
# "\$3 is '$3'\n",
# $CC;
$uniq->{$2}++;
pos($html) = pos($html) - 1;
# move back and consider $3 again as possible $1 for next match
}
#$html =~ s$reeval { $uniq->{$2}++ }gisex;
#print $OC . "UNIQ looked for \n$re\n" . $CC;
#print $OC . "UNIQ: $_\n" . $CC for keys %$uniq;
if ($Debug >= 3) {
Pubs::Times::tick('end get real html') if $ticker;
}
return $uniq;
}
sub _count_instances
{
my ($self,$query,$tagless,$links,$re) = @_;
print $OC, "counting instances of : $re :\nin text: $tagless\n", $CC if $Debug > 1;
my $count = 0;
$count++ while ( $tagless =~ m/$re/g );
# second, count instances in $links (an array ref)
# just one hit per link, even if the pattern appears multiple times
for my $i (@$links)
{
print $OC . "looking for LINK '$i' against $re" , $CC if $Debug >= 1;
$count++ while ( $tagless =~ m/$re/g );
}
return $count;
}
sub build_regexp
{
my ($self,$q) = @_;
my $wild = $self->{EndCharacters};
my $begchars = $self->{BeginCharacters};
my $st_bound = $self->{StartBound};
my $end_bound = $self->{EndBound};
my $wordchars = $self->{WordCharacters};
my $text_phr_bound = $self->{textPhraseBound};
my $html_phr_bound = $self->{HTMLPhraseBound};
# define simple pattern for plain text
# and complex pattern for HTML markup
my ($simple,$complex);
my $escaped = quotemeta( $q );
$escaped =~ s/\\[\*]/[$wordchars]*/g; # wildcard
$escaped =~ s/\\[\s]/$text_phr_bound/g; # whitespace
$simple = qr/
(
\A|[^$begchars]
)
(
${escaped}
)
(
[^$wild]|\Z
)
/xis; # no -o because we might have multiple $q's
my (@char) = split(//,$q);
my $counter = -1;
CHAR: foreach my $c (@char)
{
$counter++;
my $ent = $char2entity{$c} || warn "no entity defined for >$c< !\n";
my $num = ord($c);
# if this is a special regexp char, protect it
$c = quotemeta($c);
# if it's a *, replace it with the Wild class
$c = "[$wild]*" if $c eq '\*';
#warn "char: $c\n";
if ($c eq '\ ') {
$c = $html_phr_bound . $tag_regexp . '*';
#warn "whitespace: $c\n";
next CHAR;
} elsif (exists $codeunis{$c} ) {
#warn "matched $c in codeunis\n";
my @e = @{ $codeunis{$c} };
$c = join('|', $c, grep { $_ = "$_;" } @e );
}
#warn "c after: $c\n";
my $aka = $ent eq "$num;" ? $ent : "$ent|$num;";
# make $c into a regexp
#$c = "(?i-xsm:$c|$aka)" unless $c eq "[$wild]*";
$c = qr/$c|$aka/i unless $c eq "[$wild]*";
#$c = "(?:$c|$aka)";
# any char might be followed by zero or more tags, unless it's the last char
$c .= $tag_regexp . '*' unless $counter == $#char;
}
# re-join the chars into a single string
my $safe = join("\n",@char); # use \n to make it legible in debugging
# for debugging legibility we include newlines, so make sure we s//x in matches
$complex =qr/
(
${st_bound}
)
(
${safe}
)
(
${end_bound}
)
/xis; # no -o because we have multiple $safe's
#warn "complex: '$complex'\n";
#warn "simple: '$simple'\n";
return [ $complex, $simple ];
}
sub _add_hilite_tags
{
my ($self,$html,$q,$to_hilite,$count) = @_;
# $to_hilite is the real html that matched our regexp in _get_real...
# we still check boundaries just to be safe...
my $st_bound = $self->{StartBound};
my $end_bound = $self->{EndBound};
my $open = $self->{OTags}->{$q};
my $close = $self->{CTags}->{$q};
_ascii_chars($html) if $Debug >= 3;
my $safe = quotemeta($to_hilite);
# pre-fix nested tags in match
my $prefixed = $to_hilite;
my $pre_added = $prefixed =~ s($tag_regexp+)${nocolor}$close$1$open${color}g;
my $len_added = length( $nocolor.$close.$open.$color) * $pre_added;
# should be same as length( $to_hilite) - length( $prefixed );
my $len_diff = ( length( $to_hilite ) - length( $prefixed ) );
$len_diff *= -1 if $len_diff < 0; # pre_added might be -1 if no subs were made
if ( $len_diff != $len_added ) {
warn "length math failed!\n";
warn "len_diff = $len_diff\nlen_added = $len_added\n";
}
my $c = 0;
while ( $html =~ m/($st_bound)($safe)($end_bound)/g ) {
$c++;
my $s = $1;
my $m = $2;
my $e = $3;
if ( $Debug >= 2 )
{
#print "$OC add_hilite_tags:\n$st_bound\n$safe\n$end_bound\n $CC";
print "$OC matched:\n'$s'\n'$m'\n'$e'\n $CC";
print "$OC \$1 is " . ord( $s ) . $CC;
print "$OC \$3 is " . ord( $e ) . $CC;
}
# use substr to do what s// would normally do if pos() wasn't an issue
# -- is this a big speed hit?
my $len = length($s.$m.$e);
my $pos = pos($html);
my $newstring = $s.$open.$color.$prefixed.$nocolor.$close.$e;
substr( $html, $pos - $len, $len, $newstring );
pos($html) = $pos + length( $open.$color.$nocolor.$close ) + $len_added - 1;
# adjust for new text added
# $prefixed is the hard bit, since we must take $len_added into account
# move back 1 to reconsider $3 as next $1
# warn "pos was $pos\nnow ", pos( $html ), "\n";
# warn "new: '$html'\n";
# warn "new text: '$newstring'\n";
# warn "first chars of new pos are '", substr( $html, pos($html), 10 ), "'\n";
}
# put this in while() loop above so we can futz with pos()
#$c = ($html =~ s/($st_bound)($safe)($end_bound)/$1${open}${color}${prefixed}${nocolor}${close}$3/g );
# no -s, -i or -x flags wanted or needed
# since we watch exact (case sensitive) matches
# on real, previously identified HTML
# but -g to get all instances
if ($Debug >= 1) {
print $OC .
"SAFE was $safe\n".
"PREFIXED was $prefixed\n".
"HILITED $c times\n".
"AFTER is $html\n".
$CC;
}
$count->{$q} -= $c;
$self->{Report}->{$q}->{HiLites} += $c;
$html = _clean_up_hilites($self, $html, $q, $open, $close, $safe, $count);
print $OC . "AFTER hilite clean:$html:" . $CC if $Debug >= 3;
return $html;
}
sub _ascii_chars
{
my $s = shift;
for (split(//,$s)) {
print $OC . "$_ = ". ord($_) . $CC;
}
}
sub _clean_up_hilites
{
# try and keep Report honest
# if it was a mistake, don't count it as an Instance
# so that it also doesn't show up as a Miss
my ($self,$html,$q,$open,$close,$safe,$count) = @_;
print $OC . "BEFORE cleanup, HiLite Count for '$q' is $self->{Report}->{$q}->{HiLites}" . $CC if $Debug >= 1;
# empty hilites are useless
my $empty = ( $html =~ s,$open(?:\Q$color\E)(?:\Q$nocolor\E)$close,,sgi ) || 0;
# to be safe: in some cases we might match against entities or within tag content.
my $ent_split = ( $html =~ s/(&[\w#]*)$open(?:\Q$color\E)(${safe})(?:\Q$nocolor\E)$close([\w#]*;)/$1$2$3/igs ) || 0;
my $tag_split = 0;
while ( $html =~ m/(<[^<>]*)\Q$open\E(?:\Q$color\E)($safe)(?:\Q$nocolor\E)\Q$close\E([^>]*>)/gxsi ) {
print "$OC appears to split tag: $1$2$3 $CC" if $Debug >= 1;
$tag_split += ( $html =~ s/(<[^<>]*)\Q$open\E(?:\Q$color\E)($safe)(?:\Q$nocolor\E)\Q$close\E([^>]*>)/$1$2$3/gxsi );
#$count->{$q} += $c;
}
$self->{Report}->{$q}->{HiLites} -= ($tag_split + $ent_split);
$self->{Report}->{$q}->{Instances} -= ($ent_split + $tag_split);
if ($Debug >= 1) {
print $OC.
"\tfound $empty empty hilite tags\n".
"\tfound $tag_split split tags\n".
"\tfound $ent_split split entities\n".
$CC;
}
print "$OC AFTER cleanup, HiLite Count for '$q' is $self->{Report}->{$q}->{HiLites} $CC" if $Debug >= 1;
return $html;
}
sub _metanames
{
my $self = shift;
my $swish = $self->{swishobj};
my $index = ( $swish->IndexNames )[0];
my @metaobjs = $swish->MetaList( $index );
my @metanames;
for (@metaobjs) {
push(@metanames, $_->Name);
}
return @metanames;
}
sub prep_queries
{
require Text::ParseWords;
my $self = shift;
my @query = @{ shift(@_) };
my $metanames = shift || undef;
my $stopwords = shift || $self->{StopWords} || [];
$stopwords = [ split(/\s+/, $stopwords) ] unless ref $stopwords;
if ( $self->{swishobj} and ! defined $metanames ) {
# get metanames automatically
$metanames = [ $self->_metanames ];
}
my (%words,%uniq);
my $quot = ord($Delim);
my $lparen = ord('(');
my $rparen = ord(')');
my $paren_regexp = '\(|\)' . '|\x'. $rparen . '|\x' . $lparen;
my $Q = join('|', $Delim, $quot );
# only SWISH would define these
my $igf = $self->{IgnoreFirstChar} || '';
my $igl = $self->{IgnoreLastChar} || '';
Q: for my $q (@query) {
chomp $q;
#print $OC . "raw:$q:" . $CC if $Debug >= 1;
# remove any swish metanames from each query
$q =~ s,\b$_\s*=\s*,,gi for @$metanames;
# no () groupers
# replace with space, in case we have something like
# (foo)(this or that)
$q =~ s,$paren_regexp, ,g;
my @words = Text::ParseWords::shellwords($q);
# try preserving order of query for better intuitive highlighting color order
my $c = 0;
WORD: for my $w (@words) {
next WORD if exists $uniq{$w};
# remove any Ignore chars, since search will ignore them too
#warn "s/(\A|\s+)[$igf]*/$1/gi\n";
#warn "s/[$igl]*(\Z|\s+)/$1/gi\n";
$w =~ s/(\A|\s+)[$igf]*/$1/gi if $igf;
$w =~ s/[$igl]*(\Z|\s+)/$1/gi if $igl;
$uniq{$w} = $c++;
}
}
# clean up:
delete $uniq{''};
delete $uniq{0}; # parsing errors generate this value
# remove keywords from words but not phrases
# because we can search for a literal 'and' or 'or' inside a phrase
delete $uniq{'and'};
delete $uniq{'or'};
delete $uniq{'not'};
# no individual stopwords should get highlighted
# but stopwords in phrases should.
delete $uniq{$_} for @$stopwords;
#print "\n". '=' x 20 . "\n" if $Debug;
for (keys %uniq) {
# print $OC . ':'. $_ . ":" . $CC if $Debug >= 1;
# double-check that we don't have something like foo and foo*
if ($_ =~ m/\*/) {
(my $b = $_) =~ s,\*,,g;
if (exists($uniq{$b})) {
delete($uniq{$b}); # ax the more exact of the two
# since the * will match both
}
}
}
#print $OC . '~' x 40 . $CC if $Debug >= 1;
# if stemmer flag is turned on, that means we're dealing with a swish::api instance
# and we need to stem each query word after we're through.
# in order to build a sane regexp, we take the first N common chars from the original
# word and the stemmed word.
if ( $self->{stemmer} )
{
# split each $uniq into words
# stem each word
# if stem ne word, break into chars and find first N common
# rejoin $uniq
#warn "stemming ON\n";
K: for ( keys %uniq ) {
my (@w) = split /\ /;
W: for my $w (@w) {
my $f = $self->_stem( $w );
#warn "w: $w\nf: $f\n";
if ( $f ne $w ) {
my @stemmed = split //, $f;
my @char = split //, $w;
$f = ''; #reset
while ( @char && @stemmed && $stemmed[0] eq $char[0] ) {
$f .= shift @stemmed;
shift @char;
}
}
$w = $f . '*'; # add wildcard so that regexp gets built correctly
}
my $new = join ' ',@w;
if ($new ne $_) {
$uniq{$new} = $uniq{$_};
delete $uniq{$_};
}
}
}
return ( [ sort { $uniq{$a} <=> $uniq{$b} } keys %uniq ] );
# sort keeps query in same order as we entered
}
sub _stem
{
# this is a copy of SWISH::HiLiter::stem()
my $self = shift;
my $w = shift;
my $i = $self->{swishindex};
my $fw = $self->{swishobj}->Fuzzify( $i, $w );
my @fuzz = $fw->WordList;
if ( my $e = $fw->WordError ) {
warn "Error in Fuzzy WordList ($e): $!\n";
return undef;
}
return $fuzz[0]; # we ignore possible doublemetaphone
}
sub Report
{
my $self = shift;
my $report;
if ($self->{Report}) {
$report = "HTML::HiLiter report:\n";
my $r = $self->{Report};
for my $query (sort keys %$r) {
$report .= "$query\n";
for my $cat (sort keys %{ $r->{$query} }) {
my $val = '';
if (ref $r->{$query}->{$cat} eq 'HASH') {
$val = "\n";
$val .= "\t $_ ( $r->{$query}->{$cat}->{$_} )\n"
for keys %{ $r->{$query}->{$cat} };
} else {
$val = $r->{$query}->{$cat};
}
$report .= "\t$cat -> $val\n";
}
}
} else {
$report = "nothing hilited\n";
}
$report .= "hilite() called " . $self->{hilitecalled} . " times\n";
$report .= "hilite() took " . $self->{hilitetime} . " total secs\n" if $ticker;
# add settings summary
if ( $Debug )
{
require Data::Dumper;
$Data::Dumper::Indent = 1;
$Data::Dumper::Deepcopy = 1;
$report .= Data::Dumper::Dumper( $self );
$report .= "Debug = $Debug\n";
}
$report .= '-' x 40 . "\n"; # trailing line for readability
# reset report, so it can be used multiply with single object
delete $self->{Report};
return $report;
}
sub _get_url
{
require HTTP::Request;
require LWP::UserAgent;
my $self = shift;
my $url = shift || return;
my ($http_ua,$request,$response,$content_type,$buf,$size);
$http_ua = LWP::UserAgent->new;
$request = HTTP::Request->new(GET => $url);
$response = $http_ua->request($request);
$content_type ||= '';
if( $response->is_error ) {
warn "Error: Couldn't get '$url': response code " . $response->code. "\n";
return;
}
if( $response->headers_as_string =~ m/^Content-Type:\s*(.+)$/im ) {
$content_type = $1;
$content_type =~ s/^(.*?);.*$/$1/; # ignore possible charset value???
}
$buf = $response->content;
$size = length($buf);
$url = $response->base;
return ($buf, $url, $response->last_modified, $size, $content_type);
}
1;
__END__
=pod
=head1 DESCRIPTION
HTML::HiLiter is designed to make highlighting search queries
in HTML easy and accurate. HTML::HiLiter was designed for CrayDoc 4, the
Cray documentation server. It has been written with SWISH::API users in mind,
but can be used within any Perl program.
=head1 SYNOPSIS
use HTML::HiLiter;
my $hiliter = new HTML::HiLiter(
WordCharacters => '\w\-\.',
BeginCharacters => '\w',
EndCharacters => '\w',
HiTag => 'span',
Colors => [ qw(#FFFF33 yellow pink) ],
Links => 1
TagFilter => \&yourtagcode(),
TextFilter => \&yourtextcode(),
Force => 1,
SWISH => $swish_api_object
);
$hiliter->Queries( 'foo bar or "some phrase"' );
$hiliter->CSS;
$hiliter->Run('some_file_or_URL');
=head1 REQUIREMENTS
The following are absolutely required:
=over
=item
Perl version 5.6.1 or later.
=item
Text::ParseWords
=back
Required if using with SWISH::HiLiter or the SWISH param in new():
=over
=item
SWISH::API version 0.03 or later
=back
Required if running with Parser=>1 (default):
=over
=item
HTML::Parser
=item
HTML::Entities
=item
HTML::Tagset
=back
Required to use the HTTP option in the Run() method:
=over
=item
HTTP::Request
=item
LWP::UserAgent
=back
The Debug feature requires L when you run Report().
=head1 FEATURES
A cornucopia of features.
=over
=item *
With HTML::Parser enabled (default), HTML::HiLiter evals highlighted HTML
chunk by chunk, buffering all text
within an HTML block element before evaluating the buffer for highlighting.
If no matches to the queries are found, the HTML is immediately printed (default)
or cached and returned at the end of all evaluation (Print=>0).
You can direct the print() to a filehandle with the standard select() function
in your script. Or use Print=>0 to return the highlighted HTML as a scalar string.
=item *
Turn highlighting off on a per-tagset basis with the custom HTML "nohiliter" attribute.
Set the attribute to a TRUE value (like 1) to turn off
highlighting for the duration of that tag.
=item *
Ample debugging. Set the $HTML::HiLiter::Debug variable to a level between 1 and 3,
and lots of debugging info will be printed within HTML comments .
=item *
Will highlight link text (the stuff within an tagset) if the HREF
value is a valid match. See the Links option.
=item *
Smart context. Won't highlight across an HTML block element like a
tagset or a tagset. (IMHO, your indexing software shouldn't consider
matches for phrases that span across those tags either.)
=item *
Rotating colors. Each query gets a unique color. The default is four different
colors, which will repeat if you have more than four queries in a single
document. You can define more colors in the new() object call.
=item *
Cascading Style Sheets. Will add a