package PDF::FromHTML::Twig;
use strict;
use warnings;
use XML::Twig;
use base 'XML::Twig';
use charnames ':full';
use Graphics::ColorNames qw( hex2tuple );
use File::Spec;
use File::Basename;
use List::Util qw( sum first reduce );
=head1 NAME
PDF::FromHTML::Twig - PDF::FromHTML guts
=head1 SYNOPSIS
(internal use only)
=head1 DESCRIPTION
No user-serviceable parts inside.
=cut
sub new {
my $class = shift;
XML::Twig::new($class, $class->TwigArguments, @_);
}
our $PageWidth = 640;
our $PageResolution = 540;
our $FontBold = 'HelveticaBold';
our $FontOblique = 'HelveticaOblique';
our $FontBoldOblique = 'HelveticaBoldOblique';
our $LineHeight = 12;
our $FontUnicode = 'Helvetica';
our $Font = $FontUnicode;
# $Font = '/usr/local/share/fonts/TrueType/minguni.ttf';
our $PageSize = 'A4';
our $Landscape = 0;
use constant SuperScript => [
"\N{SUPERSCRIPT ZERO}",
"\N{SUPERSCRIPT ONE}",
"\N{SUPERSCRIPT TWO}",
"\N{SUPERSCRIPT THREE}",
"\N{SUPERSCRIPT FOUR}",
"\N{SUPERSCRIPT FIVE}",
"\N{SUPERSCRIPT SIX}",
"\N{SUPERSCRIPT SEVEN}",
"\N{SUPERSCRIPT EIGHT}",
"\N{SUPERSCRIPT NINE}",
];
use constant SubScript => [
"\N{SUBSCRIPT ZERO}",
"\N{SUBSCRIPT ONE}",
"\N{SUBSCRIPT TWO}",
"\N{SUBSCRIPT THREE}",
"\N{SUBSCRIPT FOUR}",
"\N{SUBSCRIPT FIVE}",
"\N{SUBSCRIPT SIX}",
"\N{SUBSCRIPT SEVEN}",
"\N{SUBSCRIPT EIGHT}",
"\N{SUBSCRIPT NINE}",
];
use constant InlineTags => { map { $_ => 1 } '#PCDATA', 'font' };
use constant DeleteTags => {
map { $_ => 1 }
qw(
head style applet script
)
};
use constant IgnoreTags => {
map { $_ => 1 }
qw(
title a ul
del address blockquote colgroup fieldset
input form frameset object noframes noscript
small optgroup isindex area textarea col
pre frame param menu acronym abbr bdo
label basefont big caption option cite
dd dfn dt base code map iframe ins kbd legend
samp span dir strike meta link tbody q tfoot
button thead tt select s
var
)
};
use constant TwigArguments => (
twig_handlers => {
html => sub {
$_->del_atts;
$_->set_gi('pdftemplate');
},
map((
"h$_" => (
sub {
my $size = 4 + shift;
sub {
$_->insert_new_elt(before => 'textbox')
->wrap_in('row')
->wrap_in(font => { face => $FontBold });
$_->wrap_in(
font => { h => $LineHeight + 6 - $size });
$_->wrap_in(
row => { h => $LineHeight + 8 - $size });
$_->set_tag('textbox'), $_->set_att(w => '100%');
};
}
)->($_)
),
1 .. 6),
center => sub {
foreach my $child ($_->children('p')) {
# XXX - revert other blocklevel to left/original alignment
$child->set_att(align => 'center');
}
$_->erase;
},
sup => sub {
my $digits = $_->text;
my $text = '';
$text .= +SuperScript->[$1] while $digits =~ s/(\d)//;
$_->set_text($text);
$_->erase;
},
sub => sub {
my $digits = $_->text;
my $text = '';
$text .= +SubScript->[$1] while $digits =~ s/(\d)//;
$_->set_text($text);
$_->erase;
},
u => sub {
_set(underline => 1, $_);
$_->erase;
},
em => sub {
_set(font => $FontOblique, $_);
$_->erase;
},
i => sub {
_set(font => $FontOblique, $_);
$_->erase;
},
strong => sub {
_set(font => $FontBold, $_);
$_->erase;
},
b => sub {
_set(font => $FontBold, $_);
$_->erase;
},
div => sub {
if (my $tag = (_type(header => $_) || _type(footer => $_))) {
$_->set_tag($tag);
$_->set_att(
"${tag}_height" => int(
sum(
$LineHeight * 2,
grep defined,
map $_->att('h'),
$_->descendants
)
),
);
}
else {
$_->erase;
}
},
hr => sub {
$_->insert_new_elt(first_child => (_type(pagebreak => $_) || 'hr'));
$_->erase;
},
img => sub {
my $src = $_->att('src');
my $file = File::Spec->rel2abs($src);
if ($src =~ m{^(\w+):/}) {
require LWP::Simple;
require File::Basename;
require File::Spec;
$file =
File::Spec->catfile(File::Spec->tmpdir,
File::Basename::basename($src));
LWP::Simple::mirror($src => $file);
}
# CSA - check for real file first
#
if (-e $file) {
my $w = $_->att('width');
my $h = $_->att('height');
if (($w eq '') or ($h eq '')) {
require Image::Size;
my ($iw, $ih) = Image::Size::imgsize($file);
# CSA - catch this now, before we crash
#
warn "unable to read image file '$file' ($w x $h)"
unless (defined $iw && defined $ih);
$iw ||= 1;
$ih ||= 1;
if (!$w and !$h) {
($w, $h) = ($iw, $ih);
}
elsif (!$w) {
$w = $iw * ($h / $ih);
}
else {
$h = $ih * ($w / $iw);
}
}
my $image = $_->insert_new_elt(
first_child => image => {
filename => $file,
w => ($w / $PageWidth * $PageResolution),
h => ($h / $PageWidth * $PageResolution),
type => '',
}
);
$image->wrap_in('row');
# CSA - File has gone missing
#
}
else {
warn "image file '$file' does not exist";
}
$_->erase;
},
body => sub {
$_->wrap_in(
pagedef => {
pagesize => $PageSize,
landscape => $Landscape,
margins => $LineHeight - 2,
},
);
$_->wrap_in(
font => {
face => $Font,
h => $LineHeight - 2,
}
);
my $pagedef = $_->parent->parent;
my $head = ($pagedef->descendants('header'))[0]
|| $pagedef->insert_new_elt(
first_child => header => { header_height => $LineHeight * 2 });
my $row = $head->insert_new_elt(first_child => 'row');
$row->insert_new_elt(
first_child => textbox => { w => '100%', text => '' });
foreach my $child ($_->children('#PCDATA')) {
$child->set_text(
join(' ', grep length, split(/\n+/, $child->text)));
if ($child->text =~ /[^\x00-\x7f]/) {
$child->wrap_in(font => { face => $FontUnicode });
}
$child->wrap_in('row');
$child->wrap_in(textbox => { w => '100%' });
$child->insert_new_elt(after => 'textbox')->wrap_in('row');
}
$_->erase;
},
p => \&_p,
li => \&_p,
table => sub {
our @RowSpan = ();
my $cols = $_->root->att('#total_cols') or do {
$_->erase for $_->children('tr');
$_->erase;
return;
};
my $widths = $_->root->att('#widths');
if (!$widths) {
$widths = [];
$_->root->set_att('#widths', $widths);
}
my $table_width = $_->root->att('#total_width');
if (!$table_width) {
$table_width = _percentify($_->att('width'), $PageWidth);
$_->root->set_att('#total_width', $table_width);
}
my $unallocated_sum = 100;
my $unallocated_cols = 0;
foreach my $idx (0..$cols-1) {
if (my $w = $widths->[$idx]) {
$unallocated_sum -= $w;
}
else {
$unallocated_cols++;
}
}
if ($unallocated_cols and $unallocated_sum > 0) {
# warn "UNALLOC: $unallocated_cols, $unallocated_sum\n";
# Populate unallocated columns
my $w = int($unallocated_sum / $unallocated_cols);
$widths->[$_] ||= $w for (0..$cols-1);
}
elsif ($unallocated_cols) {
# Redistribute all columns.
my $w = int(100 / $cols);
$widths->[$_] = $w for (0..$cols-1);
}
elsif ($unallocated_sum < 0) {
# warn "WIDTHS: @$widths ($unallocated_sum)\n";
# Redistribute all columns, part 2. -- not sure we should do it actually.
my $overflow = (100-$unallocated_sum);
$widths->[$_] = int($widths->[$_] * 100 / $overflow) for (0..$cols-1);
}
for ($_->children('tr')) {
return $_->erase if $_->descendants('row');
my @children = $_->descendants('textbox');
my @cells = @{ shift(@RowSpan) || [] };
foreach my $i (1 .. $#cells) {
my $cell = $cells[$i] or next;
my $child;
if ($child = $children[ $i - 1 ]) {
$child->insert_new_elt(before => 'textbox', $cell);
}
elsif ($child = $children[ $i - 2 ]) {
$child->insert_new_elt(after => 'textbox', $cell);
}
else {
next;
}
@children = $_->descendants('textbox');
}
my $cols = sum(map { $_->att('colspan') || 1 } @children);
# print STDERR "==> Total cols: $cols :".@children.$/;
my $sum = 100;
my $last_child = pop(@children);
my $col_idx = 0;
foreach my $child (@children) {
my $colspan = $child->att('colspan') || 1;
my $w = 0;
foreach my $idx ($col_idx .. $col_idx+$colspan-1) {
$w += $widths->[$idx];
}
$col_idx += $colspan;
$child->set_att(w => "$w%");
$sum -= $w;
}
$last_child->set_att(w => "$sum%") if $last_child;
$_->set_tag('row');
$_->set_att(lmargin => '3');
$_->set_att(rmargin => '3');
$_->set_att(border => $_->parent('table')->att('border'));
$_->set_att(h => $LineHeight);
}
$_->root->del_att('#widths');
$_->root->set_att('#total_width' => undef);
$_->root->set_att('#total_cols' => undef);
$_->insert_new_elt(last_child => row => { h => $LineHeight });
$_->erase;
},
ol => sub {
my $count = 1;
foreach my $child ($_->descendants('counter')) {
$child->set_tag('textbox');
$child->set_text("$count. ");
$count++;
}
$_->insert_new_elt(last_child => row => { h => $LineHeight });
$_->erase;
},
br => sub {
$_->insert_new_elt(last_child => row => { h => $LineHeight });
$_->erase;
},
ul => sub {
foreach my $child ($_->descendants('counter')) {
$child->set_tag('textbox');
$child->set_text("* ");
}
$_->insert_new_elt(last_child => row => { h => $LineHeight });
$_->erase;
},
dl => sub {
foreach my $child ($_->descendants('counter')) {
$child->delete;
}
$_->insert_new_elt(last_child => row => { h => $LineHeight });
$_->erase;
},
td => \&_td,
th => \&_td,
font => sub {
$_->del_att('face');
if ($_->att_names) {
$_->set_att(face => $Font);
$_->erase; # XXX
}
else {
$_->erase;
}
},
var => sub {
# XXX - Proper variable support
},
_default_ => sub {
$_->erase if +IgnoreTags->{ $_->tag };
$_->delete if +DeleteTags->{ $_->tag };
}
},
pretty_print => 'indented',
empty_tags => 'html',
start_tag_handlers => {
_all_ => sub {
if (my $h = $_->att('size')) {
$_->set_att(h => $LineHeight + (2 * ($h - 4)));
}
if (my $bgcolor = $_->att('bgcolor')) {
$_->set_att(bgcolor => _to_color($bgcolor));
}
$_->del_att(
qw(
color bordercolor bordercolordark bordercolorlight
cellpadding cellspacing size href
)
);
},
}
);
sub _set {
my ($key, $value, $elt) = @_;
my $att = $elt->root->att("#$key") || {};
$att->{ $elt->parent } = $value;
$elt->root->set_att("#$key", $att);
}
sub _get {
my ($key, $elt) = @_;
my $att = $elt->root->att("#$key") || {};
return $att->{$elt};
}
sub _p {
my @children;
foreach my $child ($_->children) {
+InlineTags->{ $child->tag } or last;
push @children, $child->cut;
}
if (@children) {
my $textbox = $_->insert_new_elt(
before => textbox => {
w => (($_->tag eq 'p') ? '100%' : '97%'),
align => $_->att('align')
},
);
$textbox->wrap_in('row');
if ($_->tag eq 'li') {
$textbox->insert_new_elt(
before => counter => { w => '3%', align => 'right' });
}
foreach my $child (@children) {
$child->paste(last_child => $textbox);
$child->set_text(
join(' ',
grep { length and $_ ne 1 } split(/\n+/, $child->text))
);
}
my $font = _get(font => $_);
if ($textbox->text =~ /[^\x00-\x7f]/) {
$font = $FontUnicode;
}
elsif ($_->parent('i') and $_->parent('b')) {
$font ||= $FontBoldOblique;
}
elsif ($_->parent('i')) {
$font ||= $FontOblique;
}
elsif ($_->parent('b')) {
$font ||= $FontBold;
}
my %attr;
$attr{face} = $font if $font;
if (_get(underline => $_)) {
my $align = $textbox->att('align');
$align .= '_underline';
$textbox->del_att('align');
require PDF::FromHTML::Template::Constants;
$PDF::FromHTML::Template::Constants::Verify{ALIGN}{$align} = 1
if %PDF::FromHTML::Template::Constants::Verify;
$attr{align} = $align;
}
$textbox->wrap_in('font' => \%attr) if %attr;
}
$_->insert_new_elt(first_child => 'textbox')->wrap_in('row')
if $_->tag eq 'p';
$_->erase;
}
sub _td {
return $_->erase if $_->descendants('row');
$_->set_tag('textbox');
if (my $font = _get(font => $_)) {
$_->wrap_in(font => { face => $font });
}
my $cols = $_->parent->att('_cols') || 0;
no warnings 'uninitialized';
if ($_->att('colspan') <= 1 and my $width = $_->att('width')) {
my $table_width = $_->root->att('#total_width') || 100;
my $cell_width = _percentify($width, int($table_width * $PageWidth / 100));
# Register us in the width table
my $widths = $_->root->att('#widths');
if (!$widths) {
$widths = [];
$_->root->set_att('#widths', $widths);
}
# warn "[$cols] = $widths->[$cols] vs $cell_width\n";
$widths->[$cols] = $cell_width if $widths->[$cols] < $cell_width;
}
$cols += ($_->att('colspan') || 1);
$_->parent->set_att(_cols => $cols);
$_->root->set_att('#total_cols', $cols)
if $_->root->att('#total_cols') < $cols;
if (my $rowspan = $_->att('rowspan')) {
# ok, we can't really do this.
# what we can do, though, is to add 'fake' cells in the next row.
our @RowSpan;
foreach my $i (1 .. ($rowspan - 1)) {
$RowSpan[$i][$cols] = $_->atts;
}
}
}
sub _percentify {
my $num = shift or return '100';
my $total_width = shift or Carp::confess( '100') ;
return $1 if $num =~ /(\d+)%/;
return int($num / $total_width * 100);
}
sub _type {
my ($val, $elt) = @_;
return first { $_ eq $val } grep defined, map $elt->att($_), qw(type class);
}
sub _to_color {
my ($color) = @_;
if ($color !~ s/^#//) {
$color = Graphics::ColorNames->new('Netscape')->hex($color);
}
return join ',', hex2tuple($color);
}
1;
=head1 AUTHORS
Audrey Tang Ecpan@audreyt.orgE
=head1 COPYRIGHT
Copyright 2004, 2005, 2006, 2007 by Audrey Tang Ecpan@audreyt.orgE.
This software is released under the MIT license cited below.
=head2 The "MIT" License
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
=cut