package PDF::FromHTML::Twig; use strict; use warnings; use XML::Twig; use base 'XML::Twig'; use charnames ':full'; use Graphics::ColorNames qw( hex2tuple ); use File::Spec; use File::Basename; use List::Util qw( sum first reduce ); =head1 NAME PDF::FromHTML::Twig - PDF::FromHTML guts =head1 SYNOPSIS (internal use only) =head1 DESCRIPTION No user-serviceable parts inside. =cut sub new { my $class = shift; XML::Twig::new($class, $class->TwigArguments, @_); } our $PageWidth = 640; our $PageResolution = 540; our $FontBold = 'HelveticaBold'; our $FontOblique = 'HelveticaOblique'; our $FontBoldOblique = 'HelveticaBoldOblique'; our $LineHeight = 12; our $FontUnicode = 'Helvetica'; our $Font = $FontUnicode; # $Font = '/usr/local/share/fonts/TrueType/minguni.ttf'; our $PageSize = 'A4'; our $Landscape = 0; use constant SuperScript => [ "\N{SUPERSCRIPT ZERO}", "\N{SUPERSCRIPT ONE}", "\N{SUPERSCRIPT TWO}", "\N{SUPERSCRIPT THREE}", "\N{SUPERSCRIPT FOUR}", "\N{SUPERSCRIPT FIVE}", "\N{SUPERSCRIPT SIX}", "\N{SUPERSCRIPT SEVEN}", "\N{SUPERSCRIPT EIGHT}", "\N{SUPERSCRIPT NINE}", ]; use constant SubScript => [ "\N{SUBSCRIPT ZERO}", "\N{SUBSCRIPT ONE}", "\N{SUBSCRIPT TWO}", "\N{SUBSCRIPT THREE}", "\N{SUBSCRIPT FOUR}", "\N{SUBSCRIPT FIVE}", "\N{SUBSCRIPT SIX}", "\N{SUBSCRIPT SEVEN}", "\N{SUBSCRIPT EIGHT}", "\N{SUBSCRIPT NINE}", ]; use constant InlineTags => { map { $_ => 1 } '#PCDATA', 'font' }; use constant DeleteTags => { map { $_ => 1 } qw( head style applet script ) }; use constant IgnoreTags => { map { $_ => 1 } qw( title a ul del address blockquote colgroup fieldset input form frameset object noframes noscript small optgroup isindex area textarea col pre frame param menu acronym abbr bdo label basefont big caption option cite dd dfn dt base code map iframe ins kbd legend samp span dir strike meta link tbody q tfoot button thead tt select s var ) }; use constant TwigArguments => ( twig_handlers => { html => sub { $_->del_atts; $_->set_gi('pdftemplate'); }, map(( "h$_" => ( sub { my $size = 4 + shift; sub { $_->insert_new_elt(before => 'textbox') ->wrap_in('row') ->wrap_in(font => { face => $FontBold }); $_->wrap_in( font => { h => $LineHeight + 6 - $size }); $_->wrap_in( row => { h => $LineHeight + 8 - $size }); $_->set_tag('textbox'), $_->set_att(w => '100%'); }; } )->($_) ), 1 .. 6), center => sub { foreach my $child ($_->children('p')) { # XXX - revert other blocklevel to left/original alignment $child->set_att(align => 'center'); } $_->erase; }, sup => sub { my $digits = $_->text; my $text = ''; $text .= +SuperScript->[$1] while $digits =~ s/(\d)//; $_->set_text($text); $_->erase; }, sub => sub { my $digits = $_->text; my $text = ''; $text .= +SubScript->[$1] while $digits =~ s/(\d)//; $_->set_text($text); $_->erase; }, u => sub { _set(underline => 1, $_); $_->erase; }, em => sub { _set(font => $FontOblique, $_); $_->erase; }, i => sub { _set(font => $FontOblique, $_); $_->erase; }, strong => sub { _set(font => $FontBold, $_); $_->erase; }, b => sub { _set(font => $FontBold, $_); $_->erase; }, div => sub { if (my $tag = (_type(header => $_) || _type(footer => $_))) { $_->set_tag($tag); $_->set_att( "${tag}_height" => int( sum( $LineHeight * 2, grep defined, map $_->att('h'), $_->descendants ) ), ); } else { $_->erase; } }, hr => sub { $_->insert_new_elt(first_child => (_type(pagebreak => $_) || 'hr')); $_->erase; }, img => sub { my $src = $_->att('src'); my $file = File::Spec->rel2abs($src); if ($src =~ m{^(\w+):/}) { require LWP::Simple; require File::Basename; require File::Spec; $file = File::Spec->catfile(File::Spec->tmpdir, File::Basename::basename($src)); LWP::Simple::mirror($src => $file); } # CSA - check for real file first # if (-e $file) { my $w = $_->att('width'); my $h = $_->att('height'); if (($w eq '') or ($h eq '')) { require Image::Size; my ($iw, $ih) = Image::Size::imgsize($file); # CSA - catch this now, before we crash # warn "unable to read image file '$file' ($w x $h)" unless (defined $iw && defined $ih); $iw ||= 1; $ih ||= 1; if (!$w and !$h) { ($w, $h) = ($iw, $ih); } elsif (!$w) { $w = $iw * ($h / $ih); } else { $h = $ih * ($w / $iw); } } my $image = $_->insert_new_elt( first_child => image => { filename => $file, w => ($w / $PageWidth * $PageResolution), h => ($h / $PageWidth * $PageResolution), type => '', } ); $image->wrap_in('row'); # CSA - File has gone missing # } else { warn "image file '$file' does not exist"; } $_->erase; }, body => sub { $_->wrap_in( pagedef => { pagesize => $PageSize, landscape => $Landscape, margins => $LineHeight - 2, }, ); $_->wrap_in( font => { face => $Font, h => $LineHeight - 2, } ); my $pagedef = $_->parent->parent; my $head = ($pagedef->descendants('header'))[0] || $pagedef->insert_new_elt( first_child => header => { header_height => $LineHeight * 2 }); my $row = $head->insert_new_elt(first_child => 'row'); $row->insert_new_elt( first_child => textbox => { w => '100%', text => '' }); foreach my $child ($_->children('#PCDATA')) { $child->set_text( join(' ', grep length, split(/\n+/, $child->text))); if ($child->text =~ /[^\x00-\x7f]/) { $child->wrap_in(font => { face => $FontUnicode }); } $child->wrap_in('row'); $child->wrap_in(textbox => { w => '100%' }); $child->insert_new_elt(after => 'textbox')->wrap_in('row'); } $_->erase; }, p => \&_p, li => \&_p, table => sub { our @RowSpan = (); my $cols = $_->root->att('#total_cols') or do { $_->erase for $_->children('tr'); $_->erase; return; }; my $widths = $_->root->att('#widths'); if (!$widths) { $widths = []; $_->root->set_att('#widths', $widths); } my $table_width = $_->root->att('#total_width'); if (!$table_width) { $table_width = _percentify($_->att('width'), $PageWidth); $_->root->set_att('#total_width', $table_width); } my $unallocated_sum = 100; my $unallocated_cols = 0; foreach my $idx (0..$cols-1) { if (my $w = $widths->[$idx]) { $unallocated_sum -= $w; } else { $unallocated_cols++; } } if ($unallocated_cols and $unallocated_sum > 0) { # warn "UNALLOC: $unallocated_cols, $unallocated_sum\n"; # Populate unallocated columns my $w = int($unallocated_sum / $unallocated_cols); $widths->[$_] ||= $w for (0..$cols-1); } elsif ($unallocated_cols) { # Redistribute all columns. my $w = int(100 / $cols); $widths->[$_] = $w for (0..$cols-1); } elsif ($unallocated_sum < 0) { # warn "WIDTHS: @$widths ($unallocated_sum)\n"; # Redistribute all columns, part 2. -- not sure we should do it actually. my $overflow = (100-$unallocated_sum); $widths->[$_] = int($widths->[$_] * 100 / $overflow) for (0..$cols-1); } for ($_->children('tr')) { return $_->erase if $_->descendants('row'); my @children = $_->descendants('textbox'); my @cells = @{ shift(@RowSpan) || [] }; foreach my $i (1 .. $#cells) { my $cell = $cells[$i] or next; my $child; if ($child = $children[ $i - 1 ]) { $child->insert_new_elt(before => 'textbox', $cell); } elsif ($child = $children[ $i - 2 ]) { $child->insert_new_elt(after => 'textbox', $cell); } else { next; } @children = $_->descendants('textbox'); } my $cols = sum(map { $_->att('colspan') || 1 } @children); # print STDERR "==> Total cols: $cols :".@children.$/; my $sum = 100; my $last_child = pop(@children); my $col_idx = 0; foreach my $child (@children) { my $colspan = $child->att('colspan') || 1; my $w = 0; foreach my $idx ($col_idx .. $col_idx+$colspan-1) { $w += $widths->[$idx]; } $col_idx += $colspan; $child->set_att(w => "$w%"); $sum -= $w; } $last_child->set_att(w => "$sum%") if $last_child; $_->set_tag('row'); $_->set_att(lmargin => '3'); $_->set_att(rmargin => '3'); $_->set_att(border => $_->parent('table')->att('border')); $_->set_att(h => $LineHeight); } $_->root->del_att('#widths'); $_->root->set_att('#total_width' => undef); $_->root->set_att('#total_cols' => undef); $_->insert_new_elt(last_child => row => { h => $LineHeight }); $_->erase; }, ol => sub { my $count = 1; foreach my $child ($_->descendants('counter')) { $child->set_tag('textbox'); $child->set_text("$count. "); $count++; } $_->insert_new_elt(last_child => row => { h => $LineHeight }); $_->erase; }, br => sub { $_->insert_new_elt(last_child => row => { h => $LineHeight }); $_->erase; }, ul => sub { foreach my $child ($_->descendants('counter')) { $child->set_tag('textbox'); $child->set_text("* "); } $_->insert_new_elt(last_child => row => { h => $LineHeight }); $_->erase; }, dl => sub { foreach my $child ($_->descendants('counter')) { $child->delete; } $_->insert_new_elt(last_child => row => { h => $LineHeight }); $_->erase; }, td => \&_td, th => \&_td, font => sub { $_->del_att('face'); if ($_->att_names) { $_->set_att(face => $Font); $_->erase; # XXX } else { $_->erase; } }, var => sub { # XXX - Proper variable support }, _default_ => sub { $_->erase if +IgnoreTags->{ $_->tag }; $_->delete if +DeleteTags->{ $_->tag }; } }, pretty_print => 'indented', empty_tags => 'html', start_tag_handlers => { _all_ => sub { if (my $h = $_->att('size')) { $_->set_att(h => $LineHeight + (2 * ($h - 4))); } if (my $bgcolor = $_->att('bgcolor')) { $_->set_att(bgcolor => _to_color($bgcolor)); } $_->del_att( qw( color bordercolor bordercolordark bordercolorlight cellpadding cellspacing size href ) ); }, } ); sub _set { my ($key, $value, $elt) = @_; my $att = $elt->root->att("#$key") || {}; $att->{ $elt->parent } = $value; $elt->root->set_att("#$key", $att); } sub _get { my ($key, $elt) = @_; my $att = $elt->root->att("#$key") || {}; return $att->{$elt}; } sub _p { my @children; foreach my $child ($_->children) { +InlineTags->{ $child->tag } or last; push @children, $child->cut; } if (@children) { my $textbox = $_->insert_new_elt( before => textbox => { w => (($_->tag eq 'p') ? '100%' : '97%'), align => $_->att('align') }, ); $textbox->wrap_in('row'); if ($_->tag eq 'li') { $textbox->insert_new_elt( before => counter => { w => '3%', align => 'right' }); } foreach my $child (@children) { $child->paste(last_child => $textbox); $child->set_text( join(' ', grep { length and $_ ne 1 } split(/\n+/, $child->text)) ); } my $font = _get(font => $_); if ($textbox->text =~ /[^\x00-\x7f]/) { $font = $FontUnicode; } elsif ($_->parent('i') and $_->parent('b')) { $font ||= $FontBoldOblique; } elsif ($_->parent('i')) { $font ||= $FontOblique; } elsif ($_->parent('b')) { $font ||= $FontBold; } my %attr; $attr{face} = $font if $font; if (_get(underline => $_)) { my $align = $textbox->att('align'); $align .= '_underline'; $textbox->del_att('align'); require PDF::FromHTML::Template::Constants; $PDF::FromHTML::Template::Constants::Verify{ALIGN}{$align} = 1 if %PDF::FromHTML::Template::Constants::Verify; $attr{align} = $align; } $textbox->wrap_in('font' => \%attr) if %attr; } $_->insert_new_elt(first_child => 'textbox')->wrap_in('row') if $_->tag eq 'p'; $_->erase; } sub _td { return $_->erase if $_->descendants('row'); $_->set_tag('textbox'); if (my $font = _get(font => $_)) { $_->wrap_in(font => { face => $font }); } my $cols = $_->parent->att('_cols') || 0; no warnings 'uninitialized'; if ($_->att('colspan') <= 1 and my $width = $_->att('width')) { my $table_width = $_->root->att('#total_width') || 100; my $cell_width = _percentify($width, int($table_width * $PageWidth / 100)); # Register us in the width table my $widths = $_->root->att('#widths'); if (!$widths) { $widths = []; $_->root->set_att('#widths', $widths); } # warn "[$cols] = $widths->[$cols] vs $cell_width\n"; $widths->[$cols] = $cell_width if $widths->[$cols] < $cell_width; } $cols += ($_->att('colspan') || 1); $_->parent->set_att(_cols => $cols); $_->root->set_att('#total_cols', $cols) if $_->root->att('#total_cols') < $cols; if (my $rowspan = $_->att('rowspan')) { # ok, we can't really do this. # what we can do, though, is to add 'fake' cells in the next row. our @RowSpan; foreach my $i (1 .. ($rowspan - 1)) { $RowSpan[$i][$cols] = $_->atts; } } } sub _percentify { my $num = shift or return '100'; my $total_width = shift or Carp::confess( '100') ; return $1 if $num =~ /(\d+)%/; return int($num / $total_width * 100); } sub _type { my ($val, $elt) = @_; return first { $_ eq $val } grep defined, map $elt->att($_), qw(type class); } sub _to_color { my ($color) = @_; if ($color !~ s/^#//) { $color = Graphics::ColorNames->new('Netscape')->hex($color); } return join ',', hex2tuple($color); } 1; =head1 AUTHORS Audrey Tang Ecpan@audreyt.orgE =head1 COPYRIGHT Copyright 2004, 2005, 2006, 2007 by Audrey Tang Ecpan@audreyt.orgE. This software is released under the MIT license cited below. =head2 The "MIT" License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. =cut