File manager - Edit - /var/www/payraty/helpdesk/public/storage/avatars/HTML.tar
Back
TreeBuilder.pm 0000644 00000243314 00000000000 0007260 0 ustar 00 package HTML::TreeBuilder; # ABSTRACT: Parser that builds a HTML syntax tree use warnings; use strict; use integer; # vroom vroom! use Carp (); our $VERSION = '5.07'; # VERSION from OurPkgVersion #--------------------------------------------------------------------------- # Make a 'DEBUG' constant... our $DEBUG; # Must be set BEFORE loading this file BEGIN { # We used to have things like # print $indent, "lalala" if $Debug; # But there were an awful lot of having to evaluate $Debug's value. # If we make that depend on a constant, like so: # sub DEBUG () { 1 } # or whatever value. # ... # print $indent, "lalala" if DEBUG; # Which at compile-time (thru the miracle of constant folding) turns into: # print $indent, "lalala"; # or, if DEBUG is a constant with a true value, then that print statement # is simply optimized away, and doesn't appear in the target code at all. # If you don't believe me, run: # perl -MO=Deparse,-uHTML::TreeBuilder -e 'BEGIN { \ # $HTML::TreeBuilder::DEBUG = 4} use HTML::TreeBuilder' # and see for yourself (substituting whatever value you want for $DEBUG # there). ## no critic if ( defined &DEBUG ) { # Already been defined! Do nothing. } elsif ( $] < 5.00404 ) { # Grudgingly accomodate ancient (pre-constant) versions. eval 'sub DEBUG { $Debug } '; } elsif ( !$DEBUG ) { eval 'sub DEBUG () {0}'; # Make it a constant. } elsif ( $DEBUG =~ m<^\d+$>s ) { eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant. } else { # WTF? warn "Non-numeric value \"$DEBUG\" in \$HTML::Element::DEBUG"; eval 'sub DEBUG () { $DEBUG }'; # I guess. } ## use critic } #--------------------------------------------------------------------------- use HTML::Entities (); use HTML::Tagset 3.02 (); use HTML::Element (); use HTML::Parser 3.46 (); our @ISA = qw(HTML::Element HTML::Parser); # This looks schizoid, I know. # It's not that we ARE an element AND a parser. # We ARE an element, but one that knows how to handle signals # (method calls) from Parser in order to elaborate its subtree. # Legacy aliases: *HTML::TreeBuilder::isKnown = \%HTML::Tagset::isKnown; *HTML::TreeBuilder::canTighten = \%HTML::Tagset::canTighten; *HTML::TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement; *HTML::TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement; *HTML::TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup; *HTML::TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement; *HTML::TreeBuilder::isList = \%HTML::Tagset::isList; *HTML::TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement; *HTML::TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement; *HTML::TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers; #========================================================================== # Two little shortcut constructors: sub new_from_file { # or from a FH my $class = shift; Carp::croak("new_from_file takes only one argument") unless @_ == 1; Carp::croak("new_from_file is a class method only") if ref $class; my $new = $class->new(); defined $new->parse_file( $_[0] ) or Carp::croak("unable to parse file: $!"); return $new; } sub new_from_content { # from any number of scalars my $class = shift; Carp::croak("new_from_content is a class method only") if ref $class; my $new = $class->new(); foreach my $whunk (@_) { if ( ref($whunk) eq 'SCALAR' ) { $new->parse($$whunk); } else { $new->parse($whunk); } last if $new->{'_stunted'}; # might as well check that. } $new->eof(); return $new; } sub new_from_url { # should accept anything that LWP does. undef our $lwp_response; my $class = shift; Carp::croak("new_from_url takes only one argument") unless @_ == 1; Carp::croak("new_from_url is a class method only") if ref $class; my $url = shift; my $new = $class->new(); require LWP::UserAgent; # RECOMMEND PREREQ: LWP::UserAgent 5.815 LWP::UserAgent->VERSION( 5.815 ); # HTTP::Headers content_is_html method $lwp_response = LWP::UserAgent->new->get( $url ); Carp::croak("GET failed on $url: " . $lwp_response->status_line) unless $lwp_response->is_success; Carp::croak("$url returned " . $lwp_response->content_type . " not HTML") unless $lwp_response->content_is_html; $new->parse( $lwp_response->decoded_content ); $new->eof; undef $lwp_response; # Processed successfully return $new; } # TODO: document more fully? sub parse_content { # from any number of scalars my $tree = shift; my $retval; foreach my $whunk (@_) { if ( ref($whunk) eq 'SCALAR' ) { $retval = $tree->parse($$whunk); } else { $retval = $tree->parse($whunk); } last if $tree->{'_stunted'}; # might as well check that. } $tree->eof(); return $retval; } #--------------------------------------------------------------------------- sub new { # constructor! my $class = shift; $class = ref($class) || $class; # Initialize HTML::Element part my $self = $class->element_class->new('html'); { # A hack for certain strange versions of Parser: my $other_self = HTML::Parser->new(); %$self = ( %$self, %$other_self ); # copy fields # Yes, multiple inheritance is messy. Kids, don't try this at home. bless $other_self, "HTML::TreeBuilder::_hideyhole"; # whack it out of the HTML::Parser class, to avoid the destructor } # The root of the tree is special, as it has these funny attributes, # and gets reblessed into this class. # Initialize parser settings $self->{'_implicit_tags'} = 1; $self->{'_implicit_body_p_tag'} = 0; # If true, trying to insert text, or any of %isPhraseMarkup right # under 'body' will implicate a 'p'. If false, will just go there. $self->{'_tighten'} = 1; # whether ignorable WS in this tree should be deleted $self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag $self->{'_ignore_unknown'} = 1; $self->{'_ignore_text'} = 0; $self->{'_warn'} = 0; $self->{'_no_space_compacting'} = 0; $self->{'_store_comments'} = 0; $self->{'_store_declarations'} = 1; $self->{'_store_pis'} = 0; $self->{'_p_strict'} = 0; $self->{'_no_expand_entities'} = 0; # Parse attributes passed in as arguments if (@_) { my %attr = @_; for ( keys %attr ) { $self->{"_$_"} = $attr{$_}; } } $HTML::Element::encoded_content = $self->{'_no_expand_entities'}; # rebless to our class bless $self, $class; $self->{'_element_count'} = 1; # undocumented, informal, and maybe not exactly correct $self->{'_head'} = $self->insert_element( 'head', 1 ); $self->{'_pos'} = undef; # pull it back up $self->{'_body'} = $self->insert_element( 'body', 1 ); $self->{'_pos'} = undef; # pull it back up again return $self; } #========================================================================== sub _elem # universal accessor... { my ( $self, $elem, $val ) = @_; my $old = $self->{$elem}; $self->{$elem} = $val if defined $val; return $old; } # accessors.... sub implicit_tags { shift->_elem( '_implicit_tags', @_ ); } sub implicit_body_p_tag { shift->_elem( '_implicit_body_p_tag', @_ ); } sub p_strict { shift->_elem( '_p_strict', @_ ); } sub no_space_compacting { shift->_elem( '_no_space_compacting', @_ ); } sub ignore_unknown { shift->_elem( '_ignore_unknown', @_ ); } sub ignore_text { shift->_elem( '_ignore_text', @_ ); } sub ignore_ignorable_whitespace { shift->_elem( '_tighten', @_ ); } sub store_comments { shift->_elem( '_store_comments', @_ ); } sub store_declarations { shift->_elem( '_store_declarations', @_ ); } sub store_pis { shift->_elem( '_store_pis', @_ ); } sub warn { shift->_elem( '_warn', @_ ); } sub no_expand_entities { shift->_elem( '_no_expand_entities', @_ ); $HTML::Element::encoded_content = @_; } #========================================================================== sub warning { my $self = shift; CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'}; # should maybe say HTML::TreeBuilder instead } #========================================================================== { # To avoid having to rebuild these lists constantly... my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)]; my $indent; sub start { return if $_[0]{'_stunted'}; # Accept a signal from HTML::Parser for start-tags. my ( $self, $tag, $attr ) = @_; # Parser passes more, actually: # $self->start($tag, $attr, $attrseq, $origtext) # But we can merrily ignore $attrseq and $origtext. if ( $tag eq 'x-html' ) { print "Ignoring open-x-html tag.\n" if DEBUG; # inserted by some lame code-generators. return; # bypass tweaking. } $tag =~ s{/$}{}s; # So <b/> turns into <b>. Silently forgive. unless ( $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) { DEBUG and print "Start-tag name $tag is no good. Skipping.\n"; return; # This avoids having Element's new() throw an exception. } my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'}; my $already_inserted; #my($indent); if (DEBUG) { # optimization -- don't figure out indenting unless we're in debug mode my @lineage = $pos->lineage; $indent = ' ' x ( 1 + @lineage ); print $indent, "Proposing a new \U$tag\E under ", join( '/', map $_->{'_tag'}, reverse( $pos, @lineage ) ) || 'Root', ".\n"; #} else { # $indent = ' '; } #print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2; # $attr = {%$attr}; foreach my $k ( keys %$attr ) { # Make sure some stooge doesn't have "<span _content='pie'>". # That happens every few million Web pages. $attr->{ ' ' . $k } = delete $attr->{$k} if length $k and substr( $k, 0, 1 ) eq '_'; # Looks bad, but is fine for round-tripping. } my $e = $self->element_class->new( $tag, %$attr ); # Make a new element object. # (Only rarely do we end up just throwing it away later in this call.) # Some prep -- custom messiness for those damned tables, and strict P's. if ( $self->{'_implicit_tags'} ) { # wallawallawalla! unless ( $HTML::TreeBuilder::isTableElement{$tag} ) { if ( $ptag eq 'table' ) { print $indent, " * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n" if DEBUG > 1; $self->insert_element( 'tr', 1 ); $pos = $self->insert_element( 'td', 1 ) ; # yes, needs updating } elsif ( $ptag eq 'tr' ) { print $indent, " * Phrasal \U$tag\E right under TR makes an implicit TD\n" if DEBUG > 1; $pos = $self->insert_element( 'td', 1 ) ; # yes, needs updating } $ptag = $pos->{'_tag'}; # yes, needs updating } # end of table-implication block. # Now maybe do a little dance to enforce P-strictness. # This seems like it should be integrated with the big # "ALL HOPE..." block, further below, but that doesn't # seem feasable. if ( $self->{'_p_strict'} and $HTML::TreeBuilder::isKnown{$tag} and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} ) { my $here = $pos; my $here_tag = $ptag; while (1) { if ( $here_tag eq 'p' ) { print $indent, " * Inserting $tag closes strict P.\n" if DEBUG > 1; $self->end( \q{p} ); # NB: same as \'q', but less confusing to emacs cperl-mode last; } #print("Lasting from $here_tag\n"), last if $HTML::TreeBuilder::isKnown{$here_tag} and not $HTML::Tagset::is_Possible_Strict_P_Content{ $here_tag}; # Don't keep looking up the tree if we see something that can't # be strict-P content. $here_tag = ( $here = $here->{'_parent'} || last )->{'_tag'}; } # end while $ptag = ( $pos = $self->{'_pos'} || $self ) ->{'_tag'}; # better update! } # end of strict-p block. } # And now, get busy... #---------------------------------------------------------------------- if ( !$self->{'_implicit_tags'} ) { # bimskalabim # do nothing print $indent, " * _implicit_tags is off. doing nothing\n" if DEBUG > 1; #---------------------------------------------------------------------- } elsif ( $HTML::TreeBuilder::isHeadOrBodyElement{$tag} ) { if ( $pos->is_inside('body') ) { # all is well print $indent, " * ambilocal element \U$tag\E is fine under BODY.\n" if DEBUG > 1; } elsif ( $pos->is_inside('head') ) { print $indent, " * ambilocal element \U$tag\E is fine under HEAD.\n" if DEBUG > 1; } else { # In neither head nor body! mmmmm... put under head? if ( $ptag eq 'html' ) { # expected case # TODO?? : would there ever be a case where _head would be # absent from a tree that would ever be accessed at this # point? die "Where'd my head go?" unless ref $self->{'_head'}; if ( $self->{'_head'}{'_implicit'} ) { print $indent, " * ambilocal element \U$tag\E makes an implicit HEAD.\n" if DEBUG > 1; # or rather, points us at it. $self->{'_pos'} = $self->{'_head'}; # to insert under... } else { $self->warning( "Ambilocal element <$tag> not under HEAD or BODY!?" ); # Put it under HEAD by default, I guess $self->{'_pos'} = $self->{'_head'}; # to insert under... } } else { # Neither under head nor body, nor right under html... pass thru? $self->warning( "Ambilocal element <$tag> neither under head nor body, nor right under html!?" ); } } #---------------------------------------------------------------------- } elsif ( $HTML::TreeBuilder::isBodyElement{$tag} ) { # Ensure that we are within <body> if ( $ptag eq 'body' ) { # We're good. } elsif ( $HTML::TreeBuilder::isBodyElement{$ptag} # glarg and not $HTML::TreeBuilder::isHeadOrBodyElement{$ptag} ) { # Special case: Save ourselves a call to is_inside further down. # If our $ptag is an isBodyElement element (but not an # isHeadOrBodyElement element), then we must be under body! print $indent, " * Inferring that $ptag is under BODY.\n", if DEBUG > 3; # I think this and the test for 'body' trap everything # bodyworthy, except the case where the parent element is # under an unknown element that's a descendant of body. } elsif ( $pos->is_inside('head') ) { print $indent, " * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n" if DEBUG > 1; $ptag = ( $pos = $self->{'_pos'} = $self->{'_body'} # yes, needs updating || die "Where'd my body go?" )->{'_tag'}; # yes, needs updating } elsif ( !$pos->is_inside('body') ) { print $indent, " * body-element \U$tag\E makes implicit BODY.\n" if DEBUG > 1; $ptag = ( $pos = $self->{'_pos'} = $self->{'_body'} # yes, needs updating || die "Where'd my body go?" )->{'_tag'}; # yes, needs updating } # else we ARE under body, so okay. # Handle implicit endings and insert based on <tag> and position # ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ... if ( $tag eq 'p' or $tag eq 'h1' or $tag eq 'h2' or $tag eq 'h3' or $tag eq 'h4' or $tag eq 'h5' or $tag eq 'h6' or $tag eq 'form' # Hm, should <form> really be here?! ) { # Can't have <p>, <h#> or <form> inside these $self->end( $_Closed_by_structurals, @HTML::TreeBuilder::p_closure_barriers # used to be just li! ); } elsif ( $tag eq 'ol' or $tag eq 'ul' or $tag eq 'dl' ) { # Can't have lists inside <h#> -- in the unlikely # event anyone tries to put them there! if ( $ptag eq 'h1' or $ptag eq 'h2' or $ptag eq 'h3' or $ptag eq 'h4' or $ptag eq 'h5' or $ptag eq 'h6' ) { $self->end( \$ptag ); } # TODO: Maybe keep closing up the tree until # the ptag isn't any of the above? # But anyone that says <h1><h2><ul>... # deserves what they get anyway. } elsif ( $tag eq 'li' ) { # list item # Get under a list tag, one way or another unless ( exists $HTML::TreeBuilder::isList{$ptag} or $self->end( \q{*}, keys %HTML::TreeBuilder::isList ) #' ) { print $indent, " * inserting implicit UL for lack of containing ", join( '|', keys %HTML::TreeBuilder::isList ), ".\n" if DEBUG > 1; $self->insert_element( 'ul', 1 ); } } elsif ( $tag eq 'dt' or $tag eq 'dd' ) { # Get under a DL, one way or another unless ( $ptag eq 'dl' or $self->end( \q{*}, 'dl' ) ) { #' print $indent, " * inserting implicit DL for lack of containing DL.\n" if DEBUG > 1; $self->insert_element( 'dl', 1 ); } } elsif ( $HTML::TreeBuilder::isFormElement{$tag} ) { if ($self->{ '_ignore_formies_outside_form'} # TODO: document this and not $pos->is_inside('form') ) { print $indent, " * ignoring \U$tag\E because not in a FORM.\n" if DEBUG > 1; return; # bypass tweaking. } if ( $tag eq 'option' ) { # return unless $ptag eq 'select'; $self->end( \q{option} ); $ptag = ( $self->{'_pos'} || $self )->{'_tag'}; unless ( $ptag eq 'select' or $ptag eq 'optgroup' ) { print $indent, " * \U$tag\E makes an implicit SELECT.\n" if DEBUG > 1; $pos = $self->insert_element( 'select', 1 ); # but not a very useful select -- has no 'name' attribute! # is $pos's value used after this? } } } elsif ( $HTML::TreeBuilder::isTableElement{$tag} ) { if ( !$pos->is_inside('table') ) { print $indent, " * \U$tag\E makes an implicit TABLE\n" if DEBUG > 1; $self->insert_element( 'table', 1 ); } if ( $tag eq 'td' or $tag eq 'th' ) { # Get under a tr one way or another unless ( $ptag eq 'tr' # either under a tr or $self->end( \q{*}, 'tr', 'table' ) #or we can get under one ) { print $indent, " * \U$tag\E under \U$ptag\E makes an implicit TR\n" if DEBUG > 1; $self->insert_element( 'tr', 1 ); # presumably pos's value isn't used after this. } } else { $self->end( \$tag, 'table' ); #' } # Hmm, I guess this is right. To work it out: # tr closes any open tr (limited at a table) # thead closes any open thead (limited at a table) # tbody closes any open tbody (limited at a table) # tfoot closes any open tfoot (limited at a table) # colgroup closes any open colgroup (limited at a table) # col can try, but will always fail, at the enclosing table, # as col is empty, and therefore never open! # But! # td closes any open td OR th (limited at a table) # th closes any open th OR td (limited at a table) # ...implementable as "close to a tr, or make a tr" } elsif ( $HTML::TreeBuilder::isPhraseMarkup{$tag} ) { if ( $ptag eq 'body' and $self->{'_implicit_body_p_tag'} ) { print " * Phrasal \U$tag\E right under BODY makes an implicit P\n" if DEBUG > 1; $pos = $self->insert_element( 'p', 1 ); # is $pos's value used after this? } } # End of implicit endings logic # End of "elsif ($HTML::TreeBuilder::isBodyElement{$tag}" #---------------------------------------------------------------------- } elsif ( $HTML::TreeBuilder::isHeadElement{$tag} ) { if ( $pos->is_inside('body') ) { print $indent, " * head element \U$tag\E found inside BODY!\n" if DEBUG; $self->warning("Header element <$tag> in body"); # [sic] } elsif ( !$pos->is_inside('head') ) { print $indent, " * head element \U$tag\E makes an implicit HEAD.\n" if DEBUG > 1; } else { print $indent, " * head element \U$tag\E goes inside existing HEAD.\n" if DEBUG > 1; } $self->{'_pos'} = $self->{'_head'} || die "Where'd my head go?"; #---------------------------------------------------------------------- } elsif ( $tag eq 'html' ) { if ( delete $self->{'_implicit'} ) { # first time here print $indent, " * good! found the real HTML element!\n" if DEBUG > 1; } else { print $indent, " * Found a second HTML element\n" if DEBUG; $self->warning("Found a nested <html> element"); } # in either case, migrate attributes to the real element for ( keys %$attr ) { $self->attr( $_, $attr->{$_} ); } $self->{'_pos'} = undef; return $self; # bypass tweaking. #---------------------------------------------------------------------- } elsif ( $tag eq 'head' ) { my $head = $self->{'_head'} || die "Where'd my head go?"; if ( delete $head->{'_implicit'} ) { # first time here print $indent, " * good! found the real HEAD element!\n" if DEBUG > 1; } else { # been here before print $indent, " * Found a second HEAD element\n" if DEBUG; $self->warning("Found a second <head> element"); } # in either case, migrate attributes to the real element for ( keys %$attr ) { $head->attr( $_, $attr->{$_} ); } return $self->{'_pos'} = $head; # bypass tweaking. #---------------------------------------------------------------------- } elsif ( $tag eq 'body' ) { my $body = $self->{'_body'} || die "Where'd my body go?"; if ( delete $body->{'_implicit'} ) { # first time here print $indent, " * good! found the real BODY element!\n" if DEBUG > 1; } else { # been here before print $indent, " * Found a second BODY element\n" if DEBUG; $self->warning("Found a second <body> element"); } # in either case, migrate attributes to the real element for ( keys %$attr ) { $body->attr( $_, $attr->{$_} ); } return $self->{'_pos'} = $body; # bypass tweaking. #---------------------------------------------------------------------- } elsif ( $tag eq 'frameset' ) { if (!( $self->{'_frameset_seen'}++ ) # first frameset seen and !$self->{'_noframes_seen'} # otherwise it'll be under the noframes already and !$self->is_inside('body') ) { # The following is a bit of a hack. We don't use the normal # insert_element because 1) we don't want it as _pos, but instead # right under $self, and 2), more importantly, that we don't want # this inserted at the /end/ of $self's content_list, but instead # in the middle of it, specifically right before the body element. # my $c = $self->{'_content'} || die "Contentless root?"; my $body = $self->{'_body'} || die "Where'd my BODY go?"; for ( my $i = 0; $i < @$c; ++$i ) { if ( $c->[$i] eq $body ) { splice( @$c, $i, 0, $self->{'_pos'} = $pos = $e ); HTML::Element::_weaken($e->{'_parent'} = $self); $already_inserted = 1; print $indent, " * inserting 'frameset' right before BODY.\n" if DEBUG > 1; last; } } die "BODY not found in children of root?" unless $already_inserted; } } elsif ( $tag eq 'frame' ) { # Okay, fine, pass thru. # Should probably enforce that these should be under a frameset. # But hey. Ditto for enforcing that 'noframes' should be under # a 'frameset', as the DTDs say. } elsif ( $tag eq 'noframes' ) { # This basically assumes there'll be exactly one 'noframes' element # per document. At least, only the first one gets to have the # body under it. And if there are no noframes elements, then # the body pretty much stays where it is. Is that ever a problem? if ( $self->{'_noframes_seen'}++ ) { print $indent, " * ANOTHER noframes element?\n" if DEBUG; } else { if ( $pos->is_inside('body') ) { print $indent, " * 'noframes' inside 'body'. Odd!\n" if DEBUG; # In that odd case, we /can't/ make body a child of 'noframes', # because it's an ancestor of the 'noframes'! } else { $e->push_content( $self->{'_body'} || die "Where'd my body go?" ); print $indent, " * Moving body to be under noframes.\n" if DEBUG; } } #---------------------------------------------------------------------- } else { # unknown tag if ( $self->{'_ignore_unknown'} ) { print $indent, " * Ignoring unknown tag \U$tag\E\n" if DEBUG; $self->warning("Skipping unknown tag $tag"); return; } else { print $indent, " * Accepting unknown tag \U$tag\E\n" if DEBUG; } } #---------------------------------------------------------------------- # End of mumbo-jumbo print $indent, "(Attaching ", $e->{'_tag'}, " under ", ( $self->{'_pos'} || $self )->{'_tag'}, ")\n" # because if _pos isn't defined, it goes under self if DEBUG; # The following if-clause is to delete /some/ ignorable whitespace # nodes, as we're making the tree. # This'd be a node we'd catch later anyway, but we might as well # nip it in the bud now. # This doesn't catch /all/ deletable WS-nodes, so we do have to call # the tightener later to catch the rest. if ( $self->{'_tighten'} and !$self->{'_ignore_text'} ) { # if tightenable my ( $sibs, $par ); if (( $sibs = ( $par = $self->{'_pos'} || $self )->{'_content'} ) and @$sibs # parent already has content and ! ref( $sibs->[-1] ) # and the last one there is a text node and $sibs->[-1] !~ m<[^\n\r\f\t ]>s # and it's all whitespace and ( # one of these has to be eligible... $HTML::TreeBuilder::canTighten{$tag} or (( @$sibs == 1 ) ? # WS is leftmost -- so parent matters $HTML::TreeBuilder::canTighten{ $par->{'_tag'} } : # WS is after another node -- it matters ( ref $sibs->[-2] and $HTML::TreeBuilder::canTighten{ $sibs->[-2] {'_tag'} } ) ) ) and !$par->is_inside( 'pre', 'xmp', 'textarea', 'plaintext' ) # we're clear ) { pop @$sibs; print $indent, "Popping a preceding all-WS node\n" if DEBUG; } } $self->insert_element($e) unless $already_inserted; if (DEBUG) { if ( $self->{'_pos'} ) { print $indent, "(Current lineage of pos: \U$tag\E under ", join( '/', reverse( # $self->{'_pos'}{'_tag'}, # don't list myself! $self->{'_pos'}->lineage_tag_names ) ), ".)\n"; } else { print $indent, "(Pos points nowhere!?)\n"; } } unless ( ( $self->{'_pos'} || '' ) eq $e ) { # if it's an empty element -- i.e., if it didn't change the _pos &{ $self->{"_tweak_$tag"} || $self->{'_tweak_*'} || return $e }( map $_, $e, $tag, $self ) ; # make a list so the user can't clobber } return $e; } } #========================================================================== { my $indent; sub end { return if $_[0]{'_stunted'}; # Either: Accept an end-tag signal from HTML::Parser # Or: Method for closing currently open elements in some fairly complex # way, as used by other methods in this class. my ( $self, $tag, @stop ) = @_; if ( $tag eq 'x-html' ) { print "Ignoring close-x-html tag.\n" if DEBUG; # inserted by some lame code-generators. return; } unless ( ref($tag) or $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) { DEBUG and print "End-tag name $tag is no good. Skipping.\n"; return; # This avoids having Element's new() throw an exception. } # This method accepts two calling formats: # 1) from Parser: $self->end('tag_name', 'origtext') # in which case we shouldn't mistake origtext as a blocker tag # 2) from myself: $self->end(\q{tagname1}, 'blk1', ... ) # from myself: $self->end(['tagname1', 'tagname2'], 'blk1', ... ) # End the specified tag, but don't move above any of the blocker tags. # The tag can also be a reference to an array. Terminate the first # tag found. my $ptag = ( my $p = $self->{'_pos'} || $self )->{'_tag'}; # $p and $ptag are sort-of stratch if ( ref($tag) ) { # First param is a ref of one sort or another -- # THE CALL IS COMING FROM INSIDE THE HOUSE! $tag = $$tag if ref($tag) eq 'SCALAR'; # otherwise it's an arrayref. } else { # the call came from Parser -- just ignore origtext # except in a table ignore unmatched table tags RT #59980 @stop = $tag =~ /^t[hdr]\z/ ? 'table' : (); } #my($indent); if (DEBUG) { # optimization -- don't figure out depth unless we're in debug mode my @lineage_tags = $p->lineage_tag_names; $indent = ' ' x ( 1 + @lineage_tags ); # now announce ourselves print $indent, "Ending ", ref($tag) ? ( '[', join( ' ', @$tag ), ']' ) : "\U$tag\E", scalar(@stop) ? ( " no higher than [", join( ' ', @stop ), "]" ) : (), ".\n"; print $indent, " (Current lineage: ", join( '/', @lineage_tags ), ".)\n" if DEBUG > 1; if ( DEBUG > 3 ) { #my( # $package, $filename, $line, $subroutine, # $hasargs, $wantarray, $evaltext, $is_require) = caller; print $indent, " (Called from ", ( caller(1) )[3], ' line ', ( caller(1) )[2], ")\n"; } #} else { # $indent = ' '; } # End of if DEBUG # Now actually do it my @to_close; if ( $tag eq '*' ) { # Special -- close everything up to (but not including) the first # limiting tag, or return if none found. Somewhat of a special case. PARENT: while ( defined $p ) { $ptag = $p->{'_tag'}; print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; for (@stop) { if ( $ptag eq $_ ) { print $indent, " (Hit a $_; closing everything up to here.)\n" if DEBUG > 2; last PARENT; } } push @to_close, $p; $p = $p->{'_parent'}; # no match so far? keep moving up print $indent, " (Moving on up to ", $p ? $p->{'_tag'} : 'nil', ")\n" if DEBUG > 1; } unless ( defined $p ) { # We never found what we were looking for. print $indent, " (We never found a limit.)\n" if DEBUG > 1; return; } #print # $indent, # " (To close: ", join('/', map $_->tag, @to_close), ".)\n" # if DEBUG > 4; # Otherwise update pos and fall thru. $self->{'_pos'} = $p; } elsif ( ref $tag ) { # Close the first of any of the matching tags, giving up if you hit # any of the stop-tags. PARENT: while ( defined $p ) { $ptag = $p->{'_tag'}; print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; for (@$tag) { if ( $ptag eq $_ ) { print $indent, " (Closing $_.)\n" if DEBUG > 2; last PARENT; } } for (@stop) { if ( $ptag eq $_ ) { print $indent, " (Hit a limiting $_ -- bailing out.)\n" if DEBUG > 1; return; # so it was all for naught } } push @to_close, $p; $p = $p->{'_parent'}; } return unless defined $p; # We went off the top of the tree. # Otherwise specified element was found; set pos to its parent. push @to_close, $p; $self->{'_pos'} = $p->{'_parent'}; } else { # Close the first of the specified tag, giving up if you hit # any of the stop-tags. while ( defined $p ) { $ptag = $p->{'_tag'}; print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; if ( $ptag eq $tag ) { print $indent, " (Closing $tag.)\n" if DEBUG > 2; last; } for (@stop) { if ( $ptag eq $_ ) { print $indent, " (Hit a limiting $_ -- bailing out.)\n" if DEBUG > 1; return; # so it was all for naught } } push @to_close, $p; $p = $p->{'_parent'}; } return unless defined $p; # We went off the top of the tree. # Otherwise specified element was found; set pos to its parent. push @to_close, $p; $self->{'_pos'} = $p->{'_parent'}; } $self->{'_pos'} = undef if $self eq ( $self->{'_pos'} || '' ); print $indent, "(Pos now points to ", $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : '???', ".)\n" if DEBUG > 1; ### EXPENSIVE, because has to check that it's not under a pre ### or a CDATA-parent. That's one more method call per end()! ### Might as well just do this at the end of the tree-parse, I guess, ### at which point we'd be parsing top-down, and just not traversing ### under pre's or CDATA-parents. ## ## Take this opportunity to nix any terminal whitespace nodes. ## TODO: consider whether this (plus the logic in start(), above) ## would ever leave any WS nodes in the tree. ## If not, then there's no reason to have eof() call ## delete_ignorable_whitespace on the tree, is there? ## #if(@to_close and $self->{'_tighten'} and !$self->{'_ignore_text'} and # ! $to_close[-1]->is_inside('pre', keys %HTML::Tagset::isCDATA_Parent) #) { # if tightenable # my($children, $e_tag); # foreach my $e (reverse @to_close) { # going top-down # last if 'pre' eq ($e_tag = $e->{'_tag'}) or # $HTML::Tagset::isCDATA_Parent{$e_tag}; # # if( # $children = $e->{'_content'} # and @$children # has children # and !ref($children->[-1]) # and $children->[-1] =~ m<^\s+$>s # last node is all-WS # and # ( # # has a tightable parent: # $HTML::TreeBuilder::canTighten{ $e_tag } # or # ( # has a tightenable left sibling: # @$children > 1 and # ref($children->[-2]) # and $HTML::TreeBuilder::canTighten{ $children->[-2]{'_tag'} } # ) # ) # ) { # pop @$children; # #print $indent, "Popping a terminal WS node from ", $e->{'_tag'}, # # " (", $e->address, ") while exiting.\n" if DEBUG; # } # } #} foreach my $e (@to_close) { # Call the applicable callback, if any $ptag = $e->{'_tag'}; &{ $self->{"_tweak_$ptag"} || $self->{'_tweak_*'} || next }( map $_, $e, $ptag, $self ); print $indent, "Back from tweaking.\n" if DEBUG; last if $self->{ '_stunted' }; # in case one of the handlers called stunt } return @to_close; } } #========================================================================== { my ( $indent, $nugget ); sub text { return if $_[0]{'_stunted'}; # Accept a "here's a text token" signal from HTML::Parser. my ( $self, $text, $is_cdata ) = @_; # the >3.0 versions of Parser may pass a cdata node. # Thanks to Gisle Aas for pointing this out. return unless length $text; # I guess that's always right my $ignore_text = $self->{'_ignore_text'}; my $no_space_compacting = $self->{'_no_space_compacting'}; my $no_expand_entities = $self->{'_no_expand_entities'}; my $pos = $self->{'_pos'} || $self; HTML::Entities::decode($text) unless $ignore_text || $is_cdata || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} } || $no_expand_entities; #my($indent, $nugget); if (DEBUG) { # optimization -- don't figure out depth unless we're in debug mode my @lineage_tags = $pos->lineage_tag_names; $indent = ' ' x ( 1 + @lineage_tags ); $nugget = ( length($text) <= 25 ) ? $text : ( substr( $text, 0, 25 ) . '...' ); $nugget =~ s<([\x00-\x1F])> <'\\x'.(unpack("H2",$1))>eg; print $indent, "Proposing a new text node ($nugget) under ", join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', ".\n"; #} else { # $indent = ' '; } my $ptag; if ($HTML::Tagset::isCDATA_Parent{ $ptag = $pos->{'_tag'} } #or $pos->is_inside('pre') or $pos->is_inside( 'pre', 'textarea' ) ) { return if $ignore_text; $pos->push_content($text); } else { # return unless $text =~ /\S/; # This is sometimes wrong if ( !$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/ ) { # don't change anything } elsif ( $ptag eq 'head' or $ptag eq 'noframes' ) { if ( $self->{'_implicit_body_p_tag'} ) { print $indent, " * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n" if DEBUG > 1; $self->end( \$ptag ); $pos = $self->{'_body'} ? ( $self->{'_pos'} = $self->{'_body'} ) # expected case : $self->insert_element( 'body', 1 ); $pos = $self->insert_element( 'p', 1 ); } else { print $indent, " * Text node under \U$ptag\E closes, implicates BODY.\n" if DEBUG > 1; $self->end( \$ptag ); $pos = $self->{'_body'} ? ( $self->{'_pos'} = $self->{'_body'} ) # expected case : $self->insert_element( 'body', 1 ); } } elsif ( $ptag eq 'html' ) { if ( $self->{'_implicit_body_p_tag'} ) { print $indent, " * Text node under HTML implicates BODY and P.\n" if DEBUG > 1; $pos = $self->{'_body'} ? ( $self->{'_pos'} = $self->{'_body'} ) # expected case : $self->insert_element( 'body', 1 ); $pos = $self->insert_element( 'p', 1 ); } else { print $indent, " * Text node under HTML implicates BODY.\n" if DEBUG > 1; $pos = $self->{'_body'} ? ( $self->{'_pos'} = $self->{'_body'} ) # expected case : $self->insert_element( 'body', 1 ); #print "POS is $pos, ", $pos->{'_tag'}, "\n"; } } elsif ( $ptag eq 'body' ) { if ( $self->{'_implicit_body_p_tag'} ) { print $indent, " * Text node under BODY implicates P.\n" if DEBUG > 1; $pos = $self->insert_element( 'p', 1 ); } } elsif ( $ptag eq 'table' ) { print $indent, " * Text node under TABLE implicates TR and TD.\n" if DEBUG > 1; $self->insert_element( 'tr', 1 ); $pos = $self->insert_element( 'td', 1 ); # double whammy! } elsif ( $ptag eq 'tr' ) { print $indent, " * Text node under TR implicates TD.\n" if DEBUG > 1; $pos = $self->insert_element( 'td', 1 ); } # elsif ( # # $ptag eq 'li' || # # $ptag eq 'dd' || # $ptag eq 'form') { # $pos = $self->insert_element('p', 1); #} # Whatever we've done above should have had the side # effect of updating $self->{'_pos'} #print "POS is now $pos, ", $pos->{'_tag'}, "\n"; return if $ignore_text; $text =~ s/[\n\r\f\t ]+/ /g # canonical space unless $no_space_compacting; print $indent, " (Attaching text node ($nugget) under ", # was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'}, $pos->{'_tag'}, ").\n" if DEBUG > 1; $pos->push_content($text); } &{ $self->{'_tweak_~text'} || return }( $text, $pos, $pos->{'_tag'} . '' ); # Note that this is very exceptional -- it doesn't fall back to # _tweak_*, and it gives its tweak different arguments. return; } } #========================================================================== # TODO: test whether comment(), declaration(), and process(), do the right # thing as far as tightening and whatnot. # Also, currently, doctypes and comments that appear before head or body # show up in the tree in the wrong place. Something should be done about # this. Tricky. Maybe this whole business of pre-making the body and # whatnot is wrong. sub comment { return if $_[0]{'_stunted'}; # Accept a "here's a comment" signal from HTML::Parser. my ( $self, $text ) = @_; my $pos = $self->{'_pos'} || $self; return unless $self->{'_store_comments'} || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} }; if (DEBUG) { my @lineage_tags = $pos->lineage_tag_names; my $indent = ' ' x ( 1 + @lineage_tags ); my $nugget = ( length($text) <= 25 ) ? $text : ( substr( $text, 0, 25 ) . '...' ); $nugget =~ s<([\x00-\x1F])> <'\\x'.(unpack("H2",$1))>eg; print $indent, "Proposing a Comment ($nugget) under ", join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', ".\n"; } ( my $e = $self->element_class->new('~comment') )->{'text'} = $text; $pos->push_content($e); ++( $self->{'_element_count'} ); &{ $self->{'_tweak_~comment'} || $self->{'_tweak_*'} || return $e }( map $_, $e, '~comment', $self ); return $e; } sub declaration { return if $_[0]{'_stunted'}; # Accept a "here's a markup declaration" signal from HTML::Parser. my ( $self, $text ) = @_; my $pos = $self->{'_pos'} || $self; if (DEBUG) { my @lineage_tags = $pos->lineage_tag_names; my $indent = ' ' x ( 1 + @lineage_tags ); my $nugget = ( length($text) <= 25 ) ? $text : ( substr( $text, 0, 25 ) . '...' ); $nugget =~ s<([\x00-\x1F])> <'\\x'.(unpack("H2",$1))>eg; print $indent, "Proposing a Declaration ($nugget) under ", join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', ".\n"; } ( my $e = $self->element_class->new('~declaration') )->{'text'} = $text; $self->{_decl} = $e; return $e; } #========================================================================== sub process { return if $_[0]{'_stunted'}; # Accept a "here's a PI" signal from HTML::Parser. return unless $_[0]->{'_store_pis'}; my ( $self, $text ) = @_; my $pos = $self->{'_pos'} || $self; if (DEBUG) { my @lineage_tags = $pos->lineage_tag_names; my $indent = ' ' x ( 1 + @lineage_tags ); my $nugget = ( length($text) <= 25 ) ? $text : ( substr( $text, 0, 25 ) . '...' ); $nugget =~ s<([\x00-\x1F])> <'\\x'.(unpack("H2",$1))>eg; print $indent, "Proposing a PI ($nugget) under ", join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', ".\n"; } ( my $e = $self->element_class->new('~pi') )->{'text'} = $text; $pos->push_content($e); ++( $self->{'_element_count'} ); &{ $self->{'_tweak_~pi'} || $self->{'_tweak_*'} || return $e }( map $_, $e, '~pi', $self ); return $e; } #========================================================================== #When you call $tree->parse_file($filename), and the #tree's ignore_ignorable_whitespace attribute is on (as it is #by default), HTML::TreeBuilder's logic will manage to avoid #creating some, but not all, nodes that represent ignorable #whitespace. However, at the end of its parse, it traverses the #tree and deletes any that it missed. (It does this with an #around-method around HTML::Parser's eof method.) # #However, with $tree->parse($content), the cleanup-traversal step #doesn't happen automatically -- so when you're done parsing all #content for a document (regardless of whether $content is the only #bit, or whether it's just another chunk of content you're parsing into #the tree), call $tree->eof() to signal that you're at the end of the #text you're inputting to the tree. Besides properly cleaning any bits #of ignorable whitespace from the tree, this will also ensure that #HTML::Parser's internal buffer is flushed. sub eof { # Accept an "end-of-file" signal from HTML::Parser, or thrown by the user. return if $_[0]->{'_done'}; # we've already been here return $_[0]->SUPER::eof() if $_[0]->{'_stunted'}; my $x = $_[0]; print "EOF received.\n" if DEBUG; my (@rv); if (wantarray) { # I don't think this makes any difference for this particular # method, but let's be scrupulous, for once. @rv = $x->SUPER::eof(); } else { $rv[0] = $x->SUPER::eof(); } $x->end('html') unless $x eq ( $x->{'_pos'} || $x ); # That SHOULD close everything, and will run the appropriate tweaks. # We /could/ be running under some insane mode such that there's more # than one HTML element, but really, that's just insane to do anyhow. unless ( $x->{'_implicit_tags'} ) { # delete those silly implicit head and body in case we put # them there in implicit tags mode foreach my $node ( $x->{'_head'}, $x->{'_body'} ) { $node->replace_with_content if defined $node and ref $node and $node->{'_implicit'} and $node->{'_parent'}; # I think they should be empty anyhow, since the only # logic that'd insert under them can apply only, I think, # in the case where _implicit_tags is on } # this may still leave an implicit 'html' at the top, but there's # nothing we can do about that, is there? } $x->delete_ignorable_whitespace() # this's why we trap this -- an after-method if $x->{'_tighten'} and !$x->{'_ignore_text'}; $x->{'_done'} = 1; return @rv if wantarray; return $rv[0]; } #========================================================================== # TODO: document sub stunt { my $self = $_[0]; print "Stunting the tree.\n" if DEBUG; $self->{'_done'} = 1; if ( $HTML::Parser::VERSION < 3 ) { #This is a MEAN MEAN HACK. And it works most of the time! $self->{'_buf'} = ''; my $fh = *HTML::Parser::F{IO}; # the local'd FH used by parse_file loop if ( defined $fh ) { print "Closing Parser's filehandle $fh\n" if DEBUG; close($fh); } # But if they called $tree->parse_file($filehandle) # or $tree->parse_file(*IO), then there will be no *HTML::Parser::F{IO} # to close. Ahwell. Not a problem for most users these days. } else { $self->SUPER::eof(); # Under 3+ versions, calling eof from inside a parse will abort the # parse / parse_file } # In the off chance that the above didn't work, we'll throw # this flag to make any future events be no-ops. $self->stunted(1); return; } # TODO: document sub stunted { shift->_elem( '_stunted', @_ ); } sub done { shift->_elem( '_done', @_ ); } #========================================================================== sub delete { # Override Element's delete method. # This does most, if not all, of what Element's delete does anyway. # Deletes content, including content in some special attributes. # But doesn't empty out the hash. $_[0]->{'_element_count'} = 1; # never hurts to be scrupulously correct delete @{ $_[0] }{ '_body', '_head', '_pos' }; for ( @{ delete( $_[0]->{'_content'} ) || [] }, # all/any content # delete @{$_[0]}{'_body', '_head', '_pos'} # ...and these, in case these elements don't appear in the # content, which is possible. If they did appear (as they # usually do), then calling $_->delete on them again is harmless. # I don't think that's such a hot idea now. Thru creative reattachment, # those could actually now point to elements in OTHER trees (which we do # NOT want to delete!). ## Reasoned out: # If these point to elements not in the content list of any element in this # tree, but not in the content list of any element in any OTHER tree, then # just deleting these will make their refcounts hit zero. # If these point to elements in the content lists of elements in THIS tree, # then we'll get to deleting them when we delete from the top. # If these point to elements in the content lists of elements in SOME OTHER # tree, then they're not to be deleted. ) { $_->delete if defined $_ and ref $_ # Make sure it's an object. and $_ ne $_[0]; # And avoid hitting myself, just in case! } $_[0]->detach if $_[0]->{'_parent'} and $_[0]->{'_parent'}{'_content'}; # An 'html' element having a parent is quite unlikely. return; } sub tighten_up { # legacy shift->delete_ignorable_whitespace(@_); } sub elementify { # Rebless this object down into the normal element class. my $self = $_[0]; my $to_class = $self->element_class; delete @{$self}{ grep { ; length $_ and substr( $_, 0, 1 ) eq '_' # The private attributes that we'll retain: and $_ ne '_tag' and $_ ne '_parent' and $_ ne '_content' and $_ ne '_implicit' and $_ ne '_pos' and $_ ne '_element_class' } keys %$self }; bless $self, $to_class; # Returns the same object we were fed } sub element_class { return 'HTML::Element' if not ref $_[0]; return $_[0]->{_element_class} || 'HTML::Element'; } #-------------------------------------------------------------------------- sub guts { my @out; my @stack = ( $_[0] ); my $destructive = $_[1]; my $this; while (@stack) { $this = shift @stack; if ( !ref $this ) { push @out, $this; # yes, it can include text nodes } elsif ( !$this->{'_implicit'} ) { push @out, $this; delete $this->{'_parent'} if $destructive; } else { # it's an implicit node. Delete it and recurse delete $this->{'_parent'} if $destructive; unshift @stack, @{ ( $destructive ? delete( $this->{'_content'} ) : $this->{'_content'} ) || [] }; } } # Doesn't call a real $root->delete on the (when implicit) root, # but I don't think it needs to. return @out if wantarray; # one simple normal case. return unless @out; return $out[0] if @out == 1 and ref( $out[0] ); my $x = HTML::Element->new( 'div', '_implicit' => 1 ); $x->push_content(@out); return $x; } sub disembowel { $_[0]->guts(1) } #-------------------------------------------------------------------------- 1; __END__ =pod =head1 NAME HTML::TreeBuilder - Parser that builds a HTML syntax tree =head1 VERSION This document describes version 5.07 of HTML::TreeBuilder, released August 31, 2017 as part of L<HTML-Tree|HTML::Tree>. =head1 SYNOPSIS use HTML::TreeBuilder 5 -weak; # Ensure weak references in use foreach my $file_name (@ARGV) { my $tree = HTML::TreeBuilder->new; # empty tree $tree->parse_file($file_name); print "Hey, here's a dump of the parse tree of $file_name:\n"; $tree->dump; # a method we inherit from HTML::Element print "And here it is, bizarrely rerendered as HTML:\n", $tree->as_HTML, "\n"; # Now that we're done with it, we must destroy it. # $tree = $tree->delete; # Not required with weak references } =head1 DESCRIPTION (This class is part of the L<HTML::Tree|HTML::Tree> dist.) This class is for HTML syntax trees that get built out of HTML source. The way to use it is to: 1. start a new (empty) HTML::TreeBuilder object, 2. then use one of the methods from HTML::Parser (presumably with C<< $tree->parse_file($filename) >> for files, or with C<< $tree->parse($document_content) >> and C<< $tree->eof >> if you've got the content in a string) to parse the HTML document into the tree C<$tree>. (You can combine steps 1 and 2 with the "new_from_file" or "new_from_content" methods.) 2b. call C<< $root->elementify() >> if you want. 3. do whatever you need to do with the syntax tree, presumably involving traversing it looking for some bit of information in it, 4. previous versions of HTML::TreeBuilder required you to call C<< $tree->delete() >> to erase the contents of the tree from memory when you're done with the tree. This is not normally required anymore. See L<HTML::Element/"Weak References"> for details. =head1 ATTRIBUTES Most of the following attributes native to HTML::TreeBuilder control how parsing takes place; they should be set I<before> you try parsing into the given object. You can set the attributes by passing a TRUE or FALSE value as argument. E.g., C<< $root->implicit_tags >> returns the current setting for the C<implicit_tags> option, C<< $root->implicit_tags(1) >> turns that option on, and C<< $root->implicit_tags(0) >> turns it off. =head2 implicit_tags Setting this attribute to true will instruct the parser to try to deduce implicit elements and implicit end tags. If it is false you get a parse tree that just reflects the text as it stands, which is unlikely to be useful for anything but quick and dirty parsing. (In fact, I'd be curious to hear from anyone who finds it useful to have C<implicit_tags> set to false.) Default is true. Implicit elements have the L<HTML::Element/implicit> attribute set. =head2 implicit_body_p_tag This controls an aspect of implicit element behavior, if C<implicit_tags> is on: If a text element (PCDATA) or a phrasal element (such as C<< <em> >>) is to be inserted under C<< <body> >>, two things can happen: if C<implicit_body_p_tag> is true, it's placed under a new, implicit C<< <p> >> tag. (Past DTDs suggested this was the only correct behavior, and this is how past versions of this module behaved.) But if C<implicit_body_p_tag> is false, nothing is implicated -- the PCDATA or phrasal element is simply placed under C<< <body> >>. Default is false. =head2 no_expand_entities This attribute controls whether entities are decoded during the initial parse of the source. Enable this if you don't want entities decoded to their character value. e.g. '&' is decoded to '&' by default, but will be unchanged if this is enabled. Default is false (entities will be decoded.) =head2 ignore_unknown This attribute controls whether unknown tags should be represented as elements in the parse tree, or whether they should be ignored. Default is true (to ignore unknown tags.) =head2 ignore_text Do not represent the text content of elements. This saves space if all you want is to examine the structure of the document. Default is false. =head2 ignore_ignorable_whitespace If set to true, TreeBuilder will try to avoid creating ignorable whitespace text nodes in the tree. Default is true. (In fact, I'd be interested in hearing if there's ever a case where you need this off, or where leaving it on leads to incorrect behavior.) =head2 no_space_compacting This determines whether TreeBuilder compacts all whitespace strings in the document (well, outside of PRE or TEXTAREA elements), or leaves them alone. Normally (default, value of 0), each string of contiguous whitespace in the document is turned into a single space. But that's not done if C<no_space_compacting> is set to 1. Setting C<no_space_compacting> to 1 might be useful if you want to read in a tree just to make some minor changes to it before writing it back out. This method is experimental. If you use it, be sure to report any problems you might have with it. =head2 p_strict If set to true (and it defaults to false), TreeBuilder will take a narrower than normal view of what can be under a C<< <p> >> element; if it sees a non-phrasal element about to be inserted under a C<< <p> >>, it will close that C<< <p> >>. Otherwise it will close C<< <p> >> elements only for other C<< <p> >>'s, headings, and C<< <form> >> (although the latter may be removed in future versions). For example, when going thru this snippet of code, <p>stuff <ul> TreeBuilder will normally (with C<p_strict> false) put the C<< <ul> >> element under the C<< <p> >> element. However, with C<p_strict> set to true, it will close the C<< <p> >> first. In theory, there should be strictness options like this for other/all elements besides just C<< <p> >>; but I treat this as a special case simply because of the fact that C<< <p> >> occurs so frequently and its end-tag is omitted so often; and also because application of strictness rules at parse-time across all elements often makes tiny errors in HTML coding produce drastically bad parse-trees, in my experience. If you find that you wish you had an option like this to enforce content-models on all elements, then I suggest that what you want is content-model checking as a stage after TreeBuilder has finished parsing. =head2 store_comments This determines whether TreeBuilder will normally store comments found while parsing content into C<$root>. Currently, this is off by default. =head2 store_declarations This determines whether TreeBuilder will normally store markup declarations found while parsing content into C<$root>. This is on by default. =head2 store_pis This determines whether TreeBuilder will normally store processing instructions found while parsing content into C<$root> -- assuming a recent version of HTML::Parser (old versions won't parse PIs correctly). Currently, this is off (false) by default. It is somewhat of a known bug (to be fixed one of these days, if anyone needs it?) that PIs in the preamble (before the C<< <html> >> start-tag) end up actually I<under> the C<< <html> >> element. =head2 warn This determines whether syntax errors during parsing should generate warnings, emitted via Perl's C<warn> function. This is off (false) by default. =head1 METHODS Objects of this class inherit the methods of both HTML::Parser and HTML::Element. The methods inherited from HTML::Parser are used for building the HTML tree, and the methods inherited from HTML::Element are what you use to scrutinize the tree. Besides this (HTML::TreeBuilder) documentation, you must also carefully read the HTML::Element documentation, and also skim the HTML::Parser documentation -- probably only its parse and parse_file methods are of interest. =head2 new_from_file $root = HTML::TreeBuilder->new_from_file($filename_or_filehandle); This "shortcut" constructor merely combines constructing a new object (with the L</new> method, below), and calling C<< $new->parse_file(...) >> on it. Returns the new object. Note that this provides no way of setting any parse options like C<store_comments> (for that, call C<new>, and then set options, before calling C<parse_file>). See the notes (below) on parameters to L</parse_file>. If HTML::TreeBuilder is unable to read the file, then C<new_from_file> dies. The error can also be found in C<$!>. (This behavior is new in HTML-Tree 5. Previous versions returned a tree with only implicit elements.) =head2 new_from_content $root = HTML::TreeBuilder->new_from_content(...); This "shortcut" constructor merely combines constructing a new object (with the L</new> method, below), and calling C<< for(...){$new->parse($_)} >> and C<< $new->eof >> on it. Returns the new object. Note that this provides no way of setting any parse options like C<store_comments> (for that, call C<new>, and then set options, before calling C<parse>). Example usages: C<< HTML::TreeBuilder->new_from_content(@lines) >>, or C<< HTML::TreeBuilder->new_from_content($content) >>. =head2 new_from_url $root = HTML::TreeBuilder->new_from_url($url) This "shortcut" constructor combines constructing a new object (with the L</new> method, below), loading L<LWP::UserAgent>, fetching the specified URL, and calling C<< $new->parse( $response->decoded_content) >> and C<< $new->eof >> on it. Returns the new object. Note that this provides no way of setting any parse options like C<store_comments>. If LWP is unable to fetch the URL, or the response is not HTML (as determined by L<HTTP::Headers/content_is_html>), then C<new_from_url> dies, and the HTTP::Response object is found in C<$HTML::TreeBuilder::lwp_response>. You must have installed LWP::UserAgent for this method to work. LWP is not installed automatically, because it's a large set of modules and you might not need it. =head2 new $root = HTML::TreeBuilder->new(); This creates a new HTML::TreeBuilder object. This method takes no attributes. =head2 parse_file $root->parse_file(...) [An important method inherited from L<HTML::Parser|HTML::Parser>, which see. Current versions of HTML::Parser can take a filespec, or a filehandle object, like *FOO, or some object from class IO::Handle, IO::File, IO::Socket) or the like. I think you should check that a given file exists I<before> calling C<< $root->parse_file($filespec) >>.] When you pass a filename to C<parse_file>, HTML::Parser opens it in binary mode, which means it's interpreted as Latin-1 (ISO-8859-1). If the file is in another encoding, like UTF-8 or UTF-16, this will not do the right thing. One solution is to open the file yourself using the proper C<:encoding> layer, and pass the filehandle to C<parse_file>. You can automate this process by using L<IO::HTML/html_file>, which will use the HTML5 encoding sniffing algorithm to automatically determine the proper C<:encoding> layer and apply it. In the next major release of HTML-Tree, I plan to have it use IO::HTML automatically. If you really want your file opened in binary mode, you should open it yourself and pass the filehandle to C<parse_file>. The return value is C<undef> if there's an error opening the file. In that case, the error will be in C<$!>. =head2 parse $root->parse(...) [A important method inherited from L<HTML::Parser|HTML::Parser>, which see. See the note below for C<< $root->eof() >>.] =head2 eof $root->eof(); This signals that you're finished parsing content into this tree; this runs various kinds of crucial cleanup on the tree. This is called I<for you> when you call C<< $root->parse_file(...) >>, but not when you call C<< $root->parse(...) >>. So if you call C<< $root->parse(...) >>, then you I<must> call C<< $root->eof() >> once you've finished feeding all the chunks to C<parse(...)>, and before you actually start doing anything else with the tree in C<$root>. =head2 parse_content $root->parse_content(...); Basically a handy alias for C<< $root->parse(...); $root->eof >>. Takes the exact same arguments as C<< $root->parse() >>. =head2 delete $root->delete(); [A previously important method inherited from L<HTML::Element|HTML::Element>, which see.] =head2 elementify $root->elementify(); This changes the class of the object in C<$root> from HTML::TreeBuilder to the class used for all the rest of the elements in that tree (generally HTML::Element). Returns C<$root>. For most purposes, this is unnecessary, but if you call this after (after!!) you've finished building a tree, then it keeps you from accidentally trying to call anything but HTML::Element methods on it. (I.e., if you accidentally call C<$root-E<gt>parse_file(...)> on the already-complete and elementified tree, then instead of charging ahead and I<wreaking havoc>, it'll throw a fatal error -- since C<$root> is now an object just of class HTML::Element which has no C<parse_file> method. Note that C<elementify> currently deletes all the private attributes of C<$root> except for "_tag", "_parent", "_content", "_pos", and "_implicit". If anyone requests that I change this to leave in yet more private attributes, I might do so, in future versions. =head2 guts @nodes = $root->guts(); $parent_for_nodes = $root->guts(); In list context (as in the first case), this method returns the topmost non-implicit nodes in a tree. This is useful when you're parsing HTML code that you know doesn't expect an HTML document, but instead just a fragment of an HTML document. For example, if you wanted the parse tree for a file consisting of just this: <li>I like pie! Then you would get that with C<< @nodes = $root->guts(); >>. It so happens that in this case, C<@nodes> will contain just one element object, representing the C<< <li> >> node (with "I like pie!" being its text child node). However, consider if you were parsing this: <hr>Hooboy!<hr> In that case, C<< $root->guts() >> would return three items: an element object for the first C<< <hr> >>, a text string "Hooboy!", and another C<< <hr> >> element object. For cases where you want definitely one element (so you can treat it as a "document fragment", roughly speaking), call C<guts()> in scalar context, as in C<< $parent_for_nodes = $root->guts() >>. That works like C<guts()> in list context; in fact, C<guts()> in list context would have returned exactly one value, and if it would have been an object (as opposed to a text string), then that's what C<guts> in scalar context will return. Otherwise, if C<guts()> in list context would have returned no values at all, then C<guts()> in scalar context returns undef. In all other cases, C<guts()> in scalar context returns an implicit C<< <div> >> element node, with children consisting of whatever nodes C<guts()> in list context would have returned. Note that that may detach those nodes from C<$root>'s tree. =head2 disembowel @nodes = $root->disembowel(); $parent_for_nodes = $root->disembowel(); The C<disembowel()> method works just like the C<guts()> method, except that disembowel definitively destroys the tree above the nodes that are returned. Usually when you want the guts from a tree, you're just going to toss out the rest of the tree anyway, so this saves you the bother. (Remember, "disembowel" means "remove the guts from".) =head1 INTERNAL METHODS You should not need to call any of the following methods directly. =head2 element_class $classname = $h->element_class; This method returns the class which will be used for new elements. It defaults to HTML::Element, but can be overridden by subclassing or esoteric means best left to those will will read the source and then not complain when those esoteric means change. (Just subclass.) =head2 comment Accept a "here's a comment" signal from HTML::Parser. =head2 declaration Accept a "here's a markup declaration" signal from HTML::Parser. =head2 done TODO: document =head2 end Either: Accept an end-tag signal from HTML::Parser Or: Method for closing currently open elements in some fairly complex way, as used by other methods in this class. TODO: Why is this hidden? =head2 process Accept a "here's a PI" signal from HTML::Parser. =head2 start Accept a signal from HTML::Parser for start-tags. TODO: Why is this hidden? =head2 stunt TODO: document =head2 stunted TODO: document =head2 text Accept a "here's a text token" signal from HTML::Parser. TODO: Why is this hidden? =head2 tighten_up Legacy Redirects to L<HTML::Element/delete_ignorable_whitespace>. =head2 warning Wrapper for CORE::warn TODO: why not just use carp? =head1 SUBROUTINES =head2 DEBUG Are we in Debug mode? This is a constant subroutine, to allow compile-time optimizations. To control debug mode, set C<$HTML::TreeBuilder::DEBUG> I<before> loading HTML::TreeBuilder. =head1 HTML AND ITS DISCONTENTS HTML is rather harder to parse than people who write it generally suspect. Here's the problem: HTML is a kind of SGML that permits "minimization" and "implication". In short, this means that you don't have to close every tag you open (because the opening of a subsequent tag may implicitly close it), and if you use a tag that can't occur in the context you seem to using it in, under certain conditions the parser will be able to realize you mean to leave the current context and enter the new one, that being the only one that your code could correctly be interpreted in. Now, this would all work flawlessly and unproblematically if: 1) all the rules that both prescribe and describe HTML were (and had been) clearly set out, and 2) everyone was aware of these rules and wrote their code in compliance to them. However, it didn't happen that way, and so most HTML pages are difficult if not impossible to correctly parse with nearly any set of straightforward SGML rules. That's why the internals of HTML::TreeBuilder consist of lots and lots of special cases -- instead of being just a generic SGML parser with HTML DTD rules plugged in. =head1 TRANSLATIONS? The techniques that HTML::TreeBuilder uses to perform what I consider very robust parses on everyday code are not things that can work only in Perl. To date, the algorithms at the center of HTML::TreeBuilder have been implemented only in Perl, as far as I know; and I don't foresee getting around to implementing them in any other language any time soon. If, however, anyone is looking for a semester project for an applied programming class (or if they merely enjoy I<extra-curricular> masochism), they might do well to see about choosing as a topic the implementation/adaptation of these routines to any other interesting programming language that you feel currently suffers from a lack of robust HTML-parsing. I welcome correspondence on this subject, and point out that one can learn a great deal about languages by trying to translate between them, and then comparing the result. The HTML::TreeBuilder source may seem long and complex, but it is rather well commented, and symbol names are generally self-explanatory. (You are encouraged to read the Mozilla HTML parser source for comparison.) Some of the complexity comes from little-used features, and some of it comes from having the HTML tokenizer (HTML::Parser) being a separate module, requiring somewhat of a different interface than you'd find in a combined tokenizer and tree-builder. But most of the length of the source comes from the fact that it's essentially a long list of special cases, with lots and lots of sanity-checking, and sanity-recovery -- because, as Roseanne Rosannadanna once said, "it's always I<something>". Users looking to compare several HTML parsers should look at the source for Raggett's Tidy (C<E<lt>http://www.w3.org/People/Raggett/tidy/E<gt>>), Mozilla (C<E<lt>http://www.mozilla.org/E<gt>>), and possibly root around the browsers section of Yahoo to find the various open-source ones (C<E<lt>http://dir.yahoo.com/Computers_and_Internet/Software/Internet/World_Wide_Web/Browsers/E<gt>>). =head1 BUGS * Framesets seem to work correctly now. Email me if you get a strange parse from a document with framesets. * Really bad HTML code will, often as not, make for a somewhat objectionable parse tree. Regrettable, but unavoidably true. * If you're running with C<implicit_tags> off (God help you!), consider that C<< $tree->content_list >> probably contains the tree or grove from the parse, and not C<$tree> itself (which will, oddly enough, be an implicit C<< <html> >> element). This seems counter-intuitive and problematic; but seeing as how almost no HTML ever parses correctly with C<implicit_tags> off, this interface oddity seems the least of your problems. =head1 BUG REPORTS When a document parses in a way different from how you think it should, I ask that you report this to me as a bug. The first thing you should do is copy the document, trim out as much of it as you can while still producing the bug in question, and I<then> email me that mini-document I<and> the code you're using to parse it, to the HTML::Tree bug queue at S<C<< <bug-html-tree at rt.cpan.org> >>>. Include a note as to how it parses (presumably including its C<< $tree->dump >> output), and then a I<careful and clear> explanation of where you think the parser is going astray, and how you would prefer that it work instead. =head1 SEE ALSO For more information about the HTML-Tree distribution: L<HTML::Tree>. Modules used by HTML::TreeBuilder: L<HTML::Parser>, L<HTML::Element>, L<HTML::Tagset>. For converting between L<XML::DOM::Node>, L<HTML::Element>, and L<XML::Element> trees: L<HTML::DOMbo>. For opening a HTML file with automatic charset detection: L<IO::HTML>. =head1 AUTHOR Current maintainers: =over =item * Christopher J. Madsen S<C<< <perl AT cjmweb.net> >>> =item * Jeff Fearn S<C<< <jfearn AT cpan.org> >>> =back Original HTML-Tree author: =over =item * Gisle Aas =back Former maintainers: =over =item * Sean M. Burke =item * Andy Lester =item * Pete Krawczyk S<C<< <petek AT cpan.org> >>> =back You can follow or contribute to HTML-Tree's development at L<< https://github.com/kentfredric/HTML-Tree >>. =head1 COPYRIGHT AND LICENSE Copyright 1995-1998 Gisle Aas, 1999-2004 Sean M. Burke, 2005 Andy Lester, 2006 Pete Krawczyk, 2010 Jeff Fearn, 2012 Christopher J. Madsen. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The programs in this library are distributed in the hope that they will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for a particular purpose. =cut FormatText.pm 0000644 00000016300 00000000000 0007140 0 ustar 00 package HTML::FormatText; # ABSTRACT: Format HTML as plaintext use 5.006_001; use strict; use warnings; # We now use Smart::Comments in place of the old DEBUG framework. # this should be commented out in release versions.... ##use Smart::Comments; use base 'HTML::Formatter'; our $VERSION = '2.12'; # VERSION our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY # ------------------------------------------------------------------------ sub default_values { ( shift->SUPER::default_values(), lm => 3, # left margin rm => 72, # right margin (actually, maximum text width) ); } # ------------------------------------------------------------------------ sub configure { my ( $self, $hash ) = @_; my $lm = $self->{lm}; my $rm = $self->{rm}; $lm = delete $hash->{lm} if exists $hash->{lm}; $lm = delete $hash->{leftmargin} if exists $hash->{leftmargin}; $rm = delete $hash->{rm} if exists $hash->{rm}; $rm = delete $hash->{rightmargin} if exists $hash->{rightmargin}; my $width = $rm - $lm; if ( $width < 1 ) { warn "Bad margins, ignored" if $^W; return; } if ( $width < 20 ) { warn "Page probably too narrow" if $^W; } for ( keys %$hash ) { warn "Unknown configure option '$_'" if $^W; } $self->{lm} = $lm; $self->{rm} = $rm; $self; } # ------------------------------------------------------------------------ sub begin { my $self = shift; $self->SUPER::begin; $self->{curpos} = 0; # current output position. $self->{maxpos} = 0; # highest value of $pos (used by header underliner) $self->{hspace} = 0; # horizontal space pending flag } # ------------------------------------------------------------------------ sub end { shift->collect("\n"); } # ------------------------------------------------------------------------ sub header_start { my ( $self, $level ) = @_; $self->vspace( 1 + ( 6 - $level ) * 0.4 ); $self->{maxpos} = 0; 1; } # ------------------------------------------------------------------------ sub header_end { my ( $self, $level ) = @_; if ( $level <= 2 ) { my $line; $line = '=' if $level == 1; $line = '-' if $level == 2; $self->vspace(0); $self->out( $line x ( $self->{maxpos} - $self->{lm} ) ); } $self->vspace(1); 1; } # ------------------------------------------------------------------------ sub bullet { my $self = shift; $self->SUPER::bullet( $_[0] . ' ' ); } # ------------------------------------------------------------------------ sub hr_start { my $self = shift; $self->vspace(1); $self->out( '-' x ( $self->{rm} - $self->{lm} ) ); $self->vspace(1); } # ------------------------------------------------------------------------ sub pre_out { my $self = shift; # should really handle bold/italic etc. if ( defined $self->{vspace} ) { if ( $self->{out} ) { $self->nl() while $self->{vspace}-- >= 0; $self->{vspace} = undef; } } my $indent = ' ' x $self->{lm}; my $pre = shift; $pre =~ s/^/$indent/mg; $self->collect($pre); $self->{out}++; } # ------------------------------------------------------------------------ sub out { my $self = shift; my $text = shift; $text =~ tr/\xA0\xAD/ /d; if ( $text =~ /^\s*$/ ) { $self->{hspace} = 1; return; } if ( defined $self->{vspace} ) { if ( $self->{out} ) { $self->nl while $self->{vspace}-- >= 0; } $self->goto_lm; $self->{vspace} = undef; $self->{hspace} = 0; } if ( $self->{hspace} ) { if ( $self->{curpos} + length($text) > $self->{rm} ) { # word will not fit on line; do a line break $self->nl; $self->goto_lm; } else { # word fits on line; use a space $self->collect(' '); ++$self->{curpos}; } $self->{hspace} = 0; } $self->collect($text); my $pos = $self->{curpos} += length $text; $self->{maxpos} = $pos if $self->{maxpos} < $pos; $self->{'out'}++; } # ------------------------------------------------------------------------ sub goto_lm { my $self = shift; my $pos = $self->{curpos}; my $lm = $self->{lm}; if ( $pos < $lm ) { $self->{curpos} = $lm; $self->collect( " " x ( $lm - $pos ) ); } } # ------------------------------------------------------------------------ sub nl { my $self = shift; $self->{'out'}++; $self->{curpos} = 0; $self->collect("\n"); } # ------------------------------------------------------------------------ sub adjust_lm { my $self = shift; $self->{lm} += $_[0]; $self->goto_lm; } # ------------------------------------------------------------------------ sub adjust_rm { shift->{rm} += $_[0]; } 1; __END__ =pod =for test_synopsis 1; __END__ =for stopwords latin1 leftmargin lm plaintext rightmargin rm CPAN homepage =head1 NAME HTML::FormatText - Format HTML as plaintext =head1 VERSION version 2.12 =head1 SYNOPSIS use HTML::TreeBuilder; $tree = HTML::TreeBuilder->new->parse_file("test.html"); use HTML::FormatText; $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 50); print $formatter->format($tree); or, more simply: use HTML::FormatText; my $string = HTML::FormatText->format_file( 'test.html', leftmargin => 0, rightmargin => 50 ); =head1 DESCRIPTION HTML::FormatText is a formatter that outputs plain latin1 text. All character attributes (bold/italic/underline) are ignored. Formatting of HTML tables and forms is not implemented. HTML::FormatText is built on L<HTML::Formatter> and documentation for that module applies to this - especially L<HTML::Formatter/new>, L<HTML::Formatter/format_file> and L<HTML::Formatter/format_string>. You might specify the following parameters when constructing the formatter: =over 4 =item I<leftmargin> (alias I<lm>) The column of the left margin. The default is 3. =item I<rightmargin> (alias I<rm>) The column of the right margin. The default is 72. =back =head1 SEE ALSO L<HTML::Formatter> =head1 INSTALLATION See perlmodinstall for information and options on installing Perl modules. =head1 BUGS AND LIMITATIONS You can make new bug reports, and view existing ones, through the web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Format>. =head1 AVAILABILITY The project homepage is L<https://metacpan.org/release/HTML-Format>. The latest version of this module is available from the Comprehensive Perl Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN site near you, or see L<https://metacpan.org/module/HTML::Format/>. =head1 AUTHORS =over 4 =item * Nigel Metheringham <nigelm@cpan.org> =item * Sean M Burke <sburke@cpan.org> =item * Gisle Aas <gisle@ActiveState.com> =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2015 by Nigel Metheringham, 2002-2005 Sean M Burke, 1999-2002 Gisle Aas. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut Parse.pm 0000644 00000010572 00000000000 0006122 0 ustar 00 package HTML::Parse; use 5.008; #ABSTRACT: Deprecated, a wrapper around HTML::TreeBuilder use warnings; use strict; our $VERSION = '5.07'; # VERSION from OurPkgVersion use vars qw(@ISA @EXPORT $IMPLICIT_TAGS $IGNORE_UNKNOWN $IGNORE_TEXT $WARN ); require Exporter; @ISA = qw(Exporter); @EXPORT = qw(parse_html parse_htmlfile); # Backwards compatability $IMPLICIT_TAGS = 1; $IGNORE_UNKNOWN = 1; $IGNORE_TEXT = 0; $WARN = 0; require HTML::TreeBuilder; sub parse_html { my $p = $_[1]; $p = _new_tree_maker() unless $p; $p->parse( $_[0] ); } sub parse_htmlfile { my ( $file, $p ) = @_; my ($HTML); open( $HTML, "<", $file ) or return; $p = _new_tree_maker() unless $p; $p->parse_file($HTML); } sub _new_tree_maker { my $p = HTML::TreeBuilder->new( implicit_tags => $IMPLICIT_TAGS, ignore_unknown => $IGNORE_UNKNOWN, ignore_text => $IGNORE_TEXT, 'warn' => $WARN, ); $p->strict_comment(1); $p; } 1; __END__ =pod =head1 NAME HTML::Parse - Deprecated, a wrapper around HTML::TreeBuilder =head1 VERSION This document describes version 5.07 of HTML::Parse, released August 31, 2017 as part of L<HTML-Tree|HTML::Tree>. =head1 SYNOPSIS See the documentation for HTML::TreeBuilder =head1 DESCRIPTION Disclaimer: This module is provided only for backwards compatibility with earlier versions of this library. New code should I<not> use this module, and should really use the HTML::Parser and HTML::TreeBuilder modules directly, instead. The C<HTML::Parse> module provides functions to parse HTML documents. There are two functions exported by this module: =over 4 =item parse_html($html) or parse_html($html, $obj) This function is really just a synonym for $obj->parse($html) and $obj is assumed to be a subclass of C<HTML::Parser>. Refer to L<HTML::Parser> for more documentation. If $obj is not specified, the $obj will default to an internally created new C<HTML::TreeBuilder> object configured with strict_comment() turned on. That class implements a parser that builds (and is) a HTML syntax tree with HTML::Element objects as nodes. The return value from parse_html() is $obj. =item parse_htmlfile($file, [$obj]) Same as parse_html(), but pulls the HTML to parse, from the named file. Returns C<undef> if the file could not be opened, or $obj otherwise. =back When a C<HTML::TreeBuilder> object is created, the following variables control how parsing takes place: =over 4 =item $HTML::Parse::IMPLICIT_TAGS Setting this variable to true will instruct the parser to try to deduce implicit elements and implicit end tags. If this variable is false you get a parse tree that just reflects the text as it stands. Might be useful for quick & dirty parsing. Default is true. Implicit elements have the implicit() attribute set. =item $HTML::Parse::IGNORE_UNKNOWN This variable contols whether unknown tags should be represented as elements in the parse tree. Default is true. =item $HTML::Parse::IGNORE_TEXT Do not represent the text content of elements. This saves space if all you want is to examine the structure of the document. Default is false. =item $HTML::Parse::WARN Call warn() with an appropriate message for syntax errors. Default is false. =back =head1 REMEMBER! HTML::TreeBuilder objects should be explicitly destroyed when you're finished with them. See L<HTML::TreeBuilder>. =head1 SEE ALSO L<HTML::Parser>, L<HTML::TreeBuilder>, L<HTML::Element> =head1 AUTHOR Current maintainers: =over =item * Christopher J. Madsen S<C<< <perl AT cjmweb.net> >>> =item * Jeff Fearn S<C<< <jfearn AT cpan.org> >>> =back Original HTML-Tree author: =over =item * Gisle Aas =back Former maintainers: =over =item * Sean M. Burke =item * Andy Lester =item * Pete Krawczyk S<C<< <petek AT cpan.org> >>> =back You can follow or contribute to HTML-Tree's development at L<< https://github.com/kentfredric/HTML-Tree >>. =head1 COPYRIGHT AND LICENSE Copyright 1995-1998 Gisle Aas, 1999-2004 Sean M. Burke, 2005 Andy Lester, 2006 Pete Krawczyk, 2010 Jeff Fearn, 2012 Christopher J. Madsen. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The programs in this library are distributed in the hope that they will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for a particular purpose. =cut Element/traverse.pm 0000644 00000023756 00000000000 0010304 0 ustar 00 # This is a .pm just to (try to) make some CPAN document converters # convert it happily as part of the dist's documentation tree. package HTML::Element::traverse; # ABSTRACT: discussion of HTML::Element's traverse method use warnings; use strict; our $VERSION = '5.07'; # VERSION from OurPkgVersion use HTML::Element (); 1; __END__ =pod =head1 NAME HTML::Element::traverse - discussion of HTML::Element's traverse method =head1 VERSION This document describes version 5.07 of HTML::Element::traverse, released August 31, 2017 as part of L<HTML-Tree|HTML::Tree>. =head1 SYNOPSIS # $element->traverse is unnecessary and obscure. # Don't use it in new code. =head1 DESCRIPTION C<HTML::Element> provides a method C<traverse> that traverses the tree and calls user-specified callbacks for each node, in pre- or post-order. However, use of the method is quite superfluous: if you want to recursively visit every node in the tree, it's almost always simpler to write a subroutine does just that, than it is to bundle up the pre- and/or post-order code in callbacks for the C<traverse> method. =head1 EXAMPLES Suppose you want to traverse at/under a node $tree and give elements an 'id' attribute unless they already have one. You can use the C<traverse> method: { my $counter = 'x0000'; $start_node->traverse( [ # Callbacks; # pre-order callback: sub { my $x = $_[0]; $x->attr('id', $counter++) unless defined $x->attr('id'); return HTML::Element::OK; # keep traversing }, # post-order callback: undef ], 1, # don't call the callbacks for text nodes ); } or you can just be simple and clear (and not have to understand the calling format for C<traverse>) by writing a sub that traverses the tree by just calling itself: { my $counter = 'x0000'; sub give_id { my $x = $_[0]; $x->attr('id', $counter++) unless defined $x->attr('id'); foreach my $c ($x->content_list) { give_id($c) if ref $c; # ignore text nodes } }; give_id($start_node); } See, isn't that nice and clear? But, if you really need to know: =head1 THE TRAVERSE METHOD The C<traverse()> method is a general object-method for traversing a tree or subtree and calling user-specified callbacks. It accepts the following syntaxes: =over =item $h->traverse(\&callback) =item or $h->traverse(\&callback, $ignore_text) =item or $h->traverse( [\&pre_callback,\&post_callback] , $ignore_text) =back These all mean to traverse the element and all of its children. That is, this method starts at node $h, "pre-order visits" $h, traverses its children, and then will "post-order visit" $h. "Visiting" means that the callback routine is called, with these arguments: $_[0] : the node (element or text segment), $_[1] : a startflag, and $_[2] : the depth If the $ignore_text parameter is given and true, then the pre-order call I<will not> be happen for text content. The startflag is 1 when we enter a node (i.e., in pre-order calls) and 0 when we leave the node (in post-order calls). Note, however, that post-order calls don't happen for nodes that are text segments or are elements that are prototypically empty (like "br", "hr", etc.). If we visit text nodes (i.e., unless $ignore_text is given and true), then when text nodes are visited, we will also pass two extra arguments to the callback: $_[3] : the element that's the parent of this text node $_[4] : the index of this text node in its parent's content list Note that you can specify that the pre-order routine can be a different routine from the post-order one: $h->traverse( [\&pre_callback,\&post_callback], ...); You can also specify that no post-order calls are to be made, by providing a false value as the post-order routine: $h->traverse([ \&pre_callback,0 ], ...); And similarly for suppressing pre-order callbacks: $h->traverse([ 0,\&post_callback ], ...); Note that these two syntaxes specify the same operation: $h->traverse([\&foo,\&foo], ...); $h->traverse( \&foo , ...); The return values from calls to your pre- or post-order routines are significant, and are used to control recursion into the tree. These are the values you can return, listed in descending order of my estimation of their usefulness: =over =item HTML::Element::OK, 1, or any other true value ...to keep on traversing. Note that C<HTML::Element::OK> et al are constants. So if you're running under C<use strict> (as I hope you are), and you say: C<return HTML::Element::PRUEN> the compiler will flag this as an error (an unallowable bareword, specifically), whereas if you spell PRUNE correctly, the compiler will not complain. =item undef, 0, '0', '', or HTML::Element::PRUNE ...to block traversing under the current element's content. (This is ignored if received from a post-order callback, since by then the recursion has already happened.) If this is returned by a pre-order callback, no post-order callback for the current node will happen. (Recall that if your callback exits with just C<return;>, it is returning undef -- at least in scalar context, and C<traverse> always calls your callbacks in scalar context.) =item HTML::Element::ABORT ...to abort the whole traversal immediately. This is often useful when you're looking for just the first node in the tree that meets some criterion of yours. =item HTML::Element::PRUNE_UP ...to abort continued traversal into this node and its parent node. No post-order callback for the current or parent node will happen. =item HTML::Element::PRUNE_SOFTLY Like PRUNE, except that the post-order call for the current node is not blocked. =back Almost every task to do with extracting information from a tree can be expressed in terms of traverse operations (usually in only one pass, and usually paying attention to only pre-order, or to only post-order), or operations based on traversing. (In fact, many of the other methods in this class are basically calls to traverse() with particular arguments.) The source code for HTML::Element and HTML::TreeBuilder contain several examples of the use of the "traverse" method to gather information about the content of trees and subtrees. (Note: you should not change the structure of a tree I<while> you are traversing it.) [End of documentation for the C<traverse()> method] =head2 Traversing with Recursive Anonymous Routines Now, if you've been reading I<Structure and Interpretation of Computer Programs> too much, maybe you even want a recursive lambda. Go ahead: { my $counter = 'x0000'; my $give_id; $give_id = sub { my $x = $_[0]; $x->attr('id', $counter++) unless defined $x->attr('id'); foreach my $c ($x->content_list) { $give_id->($c) if ref $c; # ignore text nodes } }; $give_id->($start_node); undef $give_id; } It's a bit nutty, and it's I<still> more concise than a call to the C<traverse> method! It is left as an exercise to the reader to figure out how to do the same thing without using a C<$give_id> symbol at all. It is also left as an exercise to the reader to figure out why I undefine C<$give_id>, above; and why I could achieved the same effect with any of: $give_id = 'I like pie!'; # or... $give_id = []; # or even; $give_id = sub { print "Mmmm pie!\n" }; But not: $give_id = sub { print "I'm $give_id and I like pie!\n" }; # nor... $give_id = \$give_id; # nor... $give_id = { 'pie' => \$give_id, 'mode' => 'a la' }; =head2 Doing Recursive Things Iteratively Note that you may at times see an iterative implementation of pre-order traversal, like so: { my @to_do = ($tree); # start-node while(@to_do) { my $this = shift @to_do; # "Visit" the node: $this->attr('id', $counter++) unless defined $this->attr('id'); unshift @to_do, grep ref $_, $this->content_list; # Put children on the stack -- they'll be visited next } } This can I<under certain circumstances> be more efficient than just a normal recursive routine, but at the cost of being rather obscure. It gains efficiency by avoiding the overhead of function-calling, but since there are several method dispatches however you do it (to C<attr> and C<content_list>), the overhead for a simple function call is insignificant. =head2 Pruning and Whatnot The C<traverse> method does have the fairly neat features of the C<ABORT>, C<PRUNE_UP> and C<PRUNE_SOFTLY> signals. None of these can be implemented I<totally> straightforwardly with recursive routines, but it is quite possible. C<ABORT>-like behavior can be implemented either with using non-local returning with C<eval>/C<die>: my $died_on; # if you need to know where... sub thing { ... visits $_[0]... ... maybe set $died_on to $_[0] and die "ABORT_TRAV" ... ... else call thing($child) for each child... ...any post-order visiting $_[0]... } eval { thing($node) }; if($@) { if($@ =~ m<^ABORT_TRAV>) { ...it died (aborted) on $died_on... } else { die $@; # some REAL error happened } } or you can just do it with flags: my($abort_flag, $died_on); sub thing { ... visits $_[0]... ... maybe set $abort_flag = 1; $died_on = $_[0]; return; foreach my $c ($_[0]->content_list) { thing($c); return if $abort_flag; } ...any post-order visiting $_[0]... return; } $abort_flag = $died_on = undef; thing($node); ...if defined $abort_flag, it died on $died_on =head1 SEE ALSO L<HTML::Element> =head1 AUTHOR Current maintainers: =over =item * Christopher J. Madsen S<C<< <perl AT cjmweb.net> >>> =item * Jeff Fearn S<C<< <jfearn AT cpan.org> >>> =back Original HTML-Tree author: =over =item * Gisle Aas =back Former maintainers: =over =item * Sean M. Burke =item * Andy Lester =item * Pete Krawczyk S<C<< <petek AT cpan.org> >>> =back You can follow or contribute to HTML-Tree's development at L<< https://github.com/kentfredric/HTML-Tree >>. =head1 COPYRIGHT Copyright 2000,2001 Sean M. Burke =cut FormatRTF.pm 0000644 00000046424 00000000000 0006661 0 ustar 00 package HTML::FormatRTF; # ABSTRACT: Format HTML as RTF use 5.006_001; use strict; use warnings; # We now use Smart::Comments in place of the old DEBUG framework. # this should be commented out in release versions.... ##use Smart::Comments; use base 'HTML::Formatter'; our $VERSION = '2.12'; # VERSION our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY # ------------------------------------------------------------------------ my %Escape = ( map( ( chr($_), chr($_) ), # things not apparently needing escaping 0x20 .. 0x7E ), map( ( chr($_), sprintf( "\\'%02x", $_ ) ), # apparently escapeworthy things 0x00 .. 0x1F, 0x5c, 0x7b, 0x7d, 0x7f .. 0xFF, 0x46 ), # We get to escape out 'F' so that we can send RTF files thru the mail # without the slightest worry that paragraphs beginning with "From" # will get munged. # And some refinements: #"\n" => "\n\\line ", #"\cm" => "\n\\line ", #"\cj" => "\n\\line ", "\t" => "\\tab ", # Tabs (altho theoretically raw \t's are okay) # "\f" => "\n\\page\n", # Formfeed "-" => "\\_", # Turn plaintext '-' into a non-breaking hyphen "\xA0" => "\\~", # Latin-1 non-breaking space "\xAD" => "\\-", # Latin-1 soft (optional) hyphen # CRAZY HACKS: "\n" => "\\line\n", "\r" => "\n", # "\cb" => "{\n\\cs21\\lang1024\\noproof ", # \\cf1 # "\cc" => "}", ); # ------------------------------------------------------------------------ sub default_values { ( shift->SUPER::default_values(), 'lm' => 0, # left margin 'rm' => 0, # right margin (actually, maximum text width) 'head1_halfpoint_size' => 32, 'head2_halfpoint_size' => 28, 'head3_halfpoint_size' => 25, 'head4_halfpoint_size' => 22, 'head5_halfpoint_size' => 20, 'head6_halfpoint_size' => 18, 'codeblock_halfpoint_size' => 18, 'header_halfpoint_size' => 17, 'normal_halfpoint_size' => 22, ); } # ------------------------------------------------------------------------ sub configure { my ( $self, $hash ) = shift; $self->{lm} = 0; $self->{rm} = 0; # include the hash parameters into self - as RT#56278 map { $self->{$_} = $hash->{$_} } keys %$hash if ( ref($hash) ); $self; } # ------------------------------------------------------------------------ sub begin { my $self = shift; ### Start document... $self->SUPER::begin; $self->collect( $self->doc_init, $self->font_table, $self->stylesheet, $self->color_table, $self->doc_info, $self->doc_really_start, "\n" ) unless $self->{'no_prolog'}; $self->{'Para'} = ''; $self->{'quotelevel'} = 0; return; } # ------------------------------------------------------------------------ sub end { my $self = shift; $self->vspace(0); $self->out('THIS IS NEVER SEEN'); # just to force the previous para to be written out. $self->collect("}") unless $self->{'no_trailer'}; # ends the document ### End document... return; } # ------------------------------------------------------------------------ sub vspace { my $self = shift; #$self->emit_para if defined $self->{'vspace'}; my $rv = $self->SUPER::vspace(@_); $self->emit_para if defined $self->{'vspace'}; $rv; } # ------------------------------------------------------------------------ sub stylesheet { # TODO: maybe actually /use/ the character styles? return sprintf <<'END', # snazzy styles {\stylesheet {\snext0 Normal;} {\*\cs1 \additive Default Paragraph Font;} {\*\cs2 \additive \i\sbasedon1 html-ital;} {\*\cs3 \additive \b\sbasedon1 html-bold;} {\*\cs4 \additive \f1\sbasedon1 html-code;} {\s20\ql \f1\fs%s\lang1024\noproof\sbasedon0 \snext0 html-pre;} {\s31\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head1;} {\s32\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head2;} {\s33\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head3;} {\s34\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head4;} {\s35\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head5;} {\s36\ql \keepn\sb90\sa180\f2\fs%s\ul\sbasedon0 \snext0 html-head6;} } END @{ $_[0] }{ qw< codeblock_halfpoint_size head1_halfpoint_size head2_halfpoint_size head3_halfpoint_size head4_halfpoint_size head5_halfpoint_size head6_halfpoint_size > }; } # ------------------------------------------------------------------------ # Override these as necessary for further customization sub font_table { my $self = shift; return sprintf <<'END' , # text font, code font, heading font {\fonttbl {\f0\froman %s;} {\f1\fmodern %s;} {\f2\fswiss %s;} } END map { ; # custom-dumb escaper: my $x = $_; $x =~ s/([\x00-\x1F\\\{\}\x7F-\xFF])/sprintf("\\'%02x", $1)/g; $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; $x; } $self->{'fontname_body'} || 'Times', $self->{'fontname_code'} || 'Courier New', $self->{'fontname_headings'} || 'Arial', ; } # ------------------------------------------------------------------------ sub doc_init { return <<'END'; {\rtf1\ansi\deff0 END } # ------------------------------------------------------------------------ sub color_table { return <<'END'; {\colortbl;\red255\green0\blue0;\red0\green0\blue255;} END } # ------------------------------------------------------------------------ sub doc_info { my $self = $_[0]; return sprintf <<'END', $self->version_tag; {\info{\doccomm generated by %s} {\author [see doc]}{\company [see doc]}{\operator [see doc]} } END } # ------------------------------------------------------------------------ sub doc_really_start { my $self = $_[0]; return sprintf <<'END', \deflang%s\widowctrl {\header\pard\qr\plain\f2\fs%s p.\chpgn\par} \fs%s END $self->{'document_language'} || 1033, $self->{"header_halfpoint_size"}, $self->{"normal_halfpoint_size"},; } # ------------------------------------------------------------------------ sub emit_para { # rather like showline in FormatPS my $self = shift; my $para = $self->{'Para'}; $self->{'Para'} = undef; #### emit_para called by: (caller(1) )[3]; unless ( defined $para ) { #### emit_para with empty buffer... return; } $para =~ s/^ +//s; $para =~ s/ +$//s; # And now: a not terribly clever algorithm for inserting newlines # at a guaranteed harmless place: after a block of whitespace # after the 65th column. This was copied from RTF::Writer. $para =~ s/( [^\cm\cj\n]{65} # Snare 65 characters from a line [^\cm\cj\n\x20]{0,50} # and finish any current word ) (\x20{1,10})(?![\cm\cj\n]) # capture some spaces not at line-end /$1$2\n/gx # and put a NL before those spaces ; $self->collect( sprintf( '{\pard\sa%d\li%d\ri%d%s\plain' . "\n", #100 + 10 * $self->{'normal_halfpoint_size'} * ( $self->{'vspace'} || 0 ), $self->{'lm'}, $self->{'rm'}, $self->{'center'} ? '\qc' : '\ql', ), defined( $self->{'next_bullet'} ) ? do { my $bullet = $self->{'next_bullet'}; $self->{'next_bullet'} = undef; sprintf "\\fi-%d\n%s", 4.5 * $self->{'normal_halfpoint_size'}, ( $bullet eq '*' ) ? "\\'95 " : ( rtf_esc($bullet) . ". " ); } : (), $para, "\n\\par}\n\n", ); $self->{'vspace'} = undef; # we finally get to clear it here! return; } # ------------------------------------------------------------------------ sub new_font_size { my $self = $_[0]; $self->out( \sprintf "{\\fs%u\n", $self->scale_font_for( $self->{'normal_halfpoint_size'} ) ); } # ------------------------------------------------------------------------ sub restore_font_size { shift->out( \'}' ) } # ------------------------------------------------------------------------ sub hr_start { my $self = shift; # A bit of a hack: $self->vspace(.3); $self->out( \( '\qc\ul\f1\fs20\nocheck\lang1024 ' . ( '\~' x ( $self->{'hr_width'} || 50 ) ) ) ); $self->vspace(.7); 1; } # ------------------------------------------------------------------------ sub br_start { $_[0]->out( \"\\line\n" ); } # ------------------------------------------------------------------------ sub header_start { my ( $self, $level ) = @_; # for h1 ... h6's # This really should have been called heading_start, but it's too late # to change now. ### Heading of level: $level #$self->adjust_lm(0); # assert new paragraph $self->vspace(1.5); $self->out( \( sprintf '\s3%s\ql\keepn\f2\fs%s\ul' . "\n", $level, $self->{ 'head' . $level . '_halfpoint_size' }, $level, ) ); return 1; } # ------------------------------------------------------------------------ sub header_end { # This really should have been called heading_end but it's too late # to change now. $_[0]->vspace(1); 1; } # ------------------------------------------------------------------------ sub bullet { my ( $self, $bullet ) = @_; $self->{'next_bullet'} = $bullet; return; } # ------------------------------------------------------------------------ sub adjust_lm { $_[0]->emit_para(); $_[0]->{'lm'} += $_[1] * $_[0]->{'normal_halfpoint_size'} * 5; 1; } # ------------------------------------------------------------------------ sub adjust_rm { $_[0]->emit_para(); $_[0]->{'rm'} -= $_[1] * $_[0]->{'normal_halfpoint_size'} * 5; 1; } # Yes, flip the sign on the right margin! # BTW, halfpoints * 10 = twips # ------------------------------------------------------------------------ sub pre_start { my $self = shift; $self->SUPER::pre_start(@_); $self->out( \sprintf '\s20\f1\fs%s\noproof\lang1024\lang1076 ', $self->{'codeblock_halfpoint_size'}, ); return 1; } # ------------------------------------------------------------------------ sub b_start { shift->out( \'{\b ' ) } sub b_end { shift->out( \'}' ) } sub i_start { shift->out( \'{\i ' ) } sub i_end { shift->out( \'}' ) } sub tt_start { shift->out( \'{\f1\noproof\lang1024\lang1076 ' ) } sub tt_end { shift->out( \'}' ) } sub sub_start { shift->out( \'{\sub ' ) } sub sub_end { shift->out( \'}' ) } sub sup_start { shift->out( \'{\super ' ) } sub sup_end { shift->out( \'}' ) } sub strike_start { shift->out( \'{\strike ' ) } sub strike_end { shift->out( \'}' ) } # ------------------------------------------------------------------------ sub q_start { my $self = $_[0]; $self->out( ( ( ++$self->{'quotelevel'} ) % 2 ) ? \'\ldblquote ' : \'\lquote ' ); } # ------------------------------------------------------------------------ sub q_end { my $self = $_[0]; $self->out( ( ( --$self->{'quotelevel'} ) % 2 ) ? \'\rquote ' : \'\rdblquote ' ); } # ------------------------------------------------------------------------ sub pre_out { $_[0]->out( ref( $_[1] ) ? $_[1] : \rtf_esc_codely( $_[1] ) ) } # ------------------------------------------------------------------------ sub out { # output a word (or, if escaped, chunk of RTF) my $self = shift; #return $self->pre_out(@_) if $self->{pre}; #### out called by: $_[0], (caller(1) )[3] return unless defined $_[0]; # and length $_[0]; $self->{'Para'} = '' unless defined $self->{'Para'}; $self->{'Para'} .= ref( $_[0] ) ? ${ $_[0] } : rtf_esc( $_[0] ); return 1; } # ------------------------------------------------------------------------ use integer; sub rtf_esc { my $x; # scratch if ( !defined wantarray ) { # void context: alter in-place! for (@_) { s/([F\x00-\x1F\-\\\{\}\x7F-\xFF])/$Escape{$1}/g; # ESCAPER s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; } return; } elsif (wantarray) { # return an array return map { ; ( $x = $_ ) =~ s/([F\x00-\x1F\-\\\{\}\x7F-\xFF])/$Escape{$1}/g; # ESCAPER $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; # Hyper-escape all Unicode characters. $x; } @_; } else { # return a single scalar ( $x = ( ( @_ == 1 ) ? $_[0] : join '', @_ ) ) =~ s/([F\x00-\x1F\-\\\{\}\x7F-\xFF])/$Escape{$1}/g; # ESCAPER # Escape \, {, }, -, control chars, and 7f-ff. $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; # Hyper-escape all Unicode characters. return $x; } } # ------------------------------------------------------------------------ sub rtf_esc_codely { # Doesn't change "-" to hard-hyphen, nor apply computerese style my $x; # scratch if ( !defined wantarray ) { # void context: alter in-place! for (@_) { s/([F\x00-\x1F\\\{\}\x7F-\xFF])/$Escape{$1}/g; s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; # Hyper-escape all Unicode characters. } return; } elsif (wantarray) { # return an array return map { ; ( $x = $_ ) =~ s/([F\x00-\x1F\\\{\}\x7F-\xFF])/$Escape{$1}/g; $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; # Hyper-escape all Unicode characters. $x; } @_; } else { # return a single scalar ( $x = ( ( @_ == 1 ) ? $_[0] : join '', @_ ) ) =~ s/([F\x00-\x1F\\\{\}\x7F-\xFF])/$Escape{$1}/g; # Escape \, {, }, -, control chars, and 7f-ff. $x =~ s/([^\x00-\xFF])/'\\uc1\\u'.((ord($1)<32768)?ord($1):(ord($1)-65536)).'?'/eg; # Hyper-escape all Unicode characters. return $x; } } 1; __END__ =pod =for test_synopsis 1; __END__ =for stopwords arial bookman lm pagenumber prolog rtf tahoma verdana CPAN homepage rm sans serif twentieths =head1 NAME HTML::FormatRTF - Format HTML as RTF =head1 VERSION version 2.12 =head1 SYNOPSIS use HTML::FormatRTF; my $out_file = "test.rtf"; open(RTF, ">$out_file") or die "Can't write-open $out_file: $!\nAborting"; print RTF HTML::FormatRTF->format_file( 'test.html', 'fontname_headings' => "Verdana", ); close(RTF); =head1 DESCRIPTION HTML::FormatRTF is a class for objects that you use to convert HTML to RTF. There is currently no proper support for tables or forms. This is a subclass of L<HTML::Formatter>, whose documentation you should consult for more information on underlying methods such as C<new>, C<format>, C<format_file> etc You can specify any of the following parameters in the call to C<new>, C<format_file>, or C<format_string>: =over =item lm Amount of I<extra> indenting to apply to the left margin, in twips (I<tw>entI<i>eths of a I<p>oint). Default is 0. So if you wanted the left margin to be an additional half inch larger, you'd set C<< lm => 720 >> (since there's 1440 twips in an inch). If you wanted it to be about 1.5cm larger, you'd set C<< lw => 850 >> (since there's about 567 twips in a centimeter). =item rm Amount of I<extra> indenting to apply to the left margin, in twips (I<tw>entI<i>eths of a I<p>oint). Default is 0. =item normal_halfpoint_size This is the size of normal text in the document, in I<half>-points. The default value is 22, meaning that normal text is in 11 point. =item header_halfpoint_size This is the size of text used in the document's page-header, in I<half>-points. The default value is 17, meaning that normal text is in 7.5 point. Currently, the header consists just of "p. I<pagenumber>" in the upper-right-hand corner, and cannot be disabled. =item head1_halfpoint_size ... head6_halfpoint_size These control the font size of each heading level, in half-twips. For example, the default for head3_halfpoint_size is 25, meaning that HTML C<< <h3>...</h3> >> text will be in 12.5 point text (in addition to being underlined and in the heading font). =item codeblock_halfpoint_size This controls the font size (in half-points) of the text used for C<< <pre>...</pre> >> text. By default, it is 18, meaning 9 point. =item fontname_body This option controls what font is to be used for the body of the text -- that is, everything other than heading text and text in pre/code/tt elements. The default value is currently "Times". Other handy values I can suggest using are "Georgia" or "Bookman Old Style". =item fontname_code This option controls what font is to be used for text in pre/code/tt elements. The default value is currently "Courier New". =item fontname_headings This option controls what font name is to be used for headings. You can use the same font as fontname_body, but I prefer a sans-serif font, so the default value is currently "Arial". Also consider "Tahoma" and "Verdana". =item document_language This option controls what Microsoft language number will be specified as the language for this document. The current default value is 1033, for US English. Consult an RTF reference for other language numbers. =item hr_width This option controls how many underline characters will be used for rendering a "<hr>" tag. Its default value is currently 50. You can usually leave this alone, but under some circumstances you might want to use a smaller or larger number. =item no_prolog If this option is set to a true value, HTML::FormatRTF will make a point of I<not> emitting the RTF prolog before the document. By default, this is off, meaning that HTML::FormatRTF I<will> emit the prolog. This option is of interest only to advanced users. =item no_trailer If this option is set to a true value, HTML::FormatRTF will make a point of I<not> emitting the RTF trailer at the end of the document. By default, this is off, meaning that HTML::FormatRTF I<will> emit the bit of RTF that ends the document. This option is of interest only to advanced users. =back =head1 SEE ALSO L<HTML::Formatter>, L<RTF::Writer> =head1 INSTALLATION See perlmodinstall for information and options on installing Perl modules. =head1 BUGS AND LIMITATIONS You can make new bug reports, and view existing ones, through the web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Format>. =head1 AVAILABILITY The project homepage is L<https://metacpan.org/release/HTML-Format>. The latest version of this module is available from the Comprehensive Perl Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN site near you, or see L<https://metacpan.org/module/HTML::Format/>. =head1 AUTHORS =over 4 =item * Nigel Metheringham <nigelm@cpan.org> =item * Sean M Burke <sburke@cpan.org> =item * Gisle Aas <gisle@ActiveState.com> =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2015 by Nigel Metheringham, 2002-2005 Sean M Burke, 1999-2002 Gisle Aas. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut Form.pm 0000644 00000116537 00000000000 0005763 0 ustar 00 package HTML::Form; use strict; use URI; use Carp (); use Encode (); use vars qw($VERSION); our $VERSION = '6.07'; my %form_tags = map {$_ => 1} qw(input textarea button select option); my %type2class = ( text => "TextInput", password => "TextInput", hidden => "TextInput", textarea => "TextInput", "reset" => "IgnoreInput", radio => "ListInput", checkbox => "ListInput", option => "ListInput", button => "SubmitInput", submit => "SubmitInput", image => "ImageInput", file => "FileInput", keygen => "KeygenInput", ); # The new HTML5 input types %type2class = (%type2class, map { $_ => 'TextInput' } qw( tel search url email datetime date month week time datetime-local number range color )); # ABSTRACT: Class that represents an HTML form element sub parse { my $class = shift; my $html = shift; unshift(@_, "base") if @_ == 1; my %opt = @_; require HTML::TokeParser; my $p = HTML::TokeParser->new(ref($html) ? $html->decoded_content(ref => 1) : \$html); die "Failed to create HTML::TokeParser object" unless $p; my $base_uri = delete $opt{base}; my $charset = delete $opt{charset}; my $strict = delete $opt{strict}; my $verbose = delete $opt{verbose}; if ($^W) { Carp::carp("Unrecognized option $_ in HTML::Form->parse") for sort keys %opt; } unless (defined $base_uri) { if (ref($html)) { $base_uri = $html->base; } else { Carp::croak("HTML::Form::parse: No \$base_uri provided"); } } unless (defined $charset) { if (ref($html) and $html->can("content_charset")) { $charset = $html->content_charset; } unless ($charset) { $charset = "UTF-8"; } } my @forms; my $f; # current form my %openselect; # index to the open instance of a select while (my $t = $p->get_tag) { my($tag,$attr) = @$t; if ($tag eq "form") { my $action = delete $attr->{'action'}; $action = "" unless defined $action; $action = URI->new_abs($action, $base_uri); $f = $class->new($attr->{'method'}, $action, $attr->{'enctype'}); $f->accept_charset($attr->{'accept-charset'}) if $attr->{'accept-charset'}; $f->{default_charset} = $charset; $f->{attr} = $attr; $f->strict(1) if $strict; %openselect = (); push(@forms, $f); my(%labels, $current_label); while (my $t = $p->get_tag) { my($tag, $attr) = @$t; last if $tag eq "/form"; if ($tag ne 'textarea') { # if we are inside a label tag, then keep # appending any text to the current label if(defined $current_label) { $current_label = join " ", grep { defined and length } $current_label, $p->get_phrase; } } if ($tag eq "input") { $attr->{value_name} = exists $attr->{id} && exists $labels{$attr->{id}} ? $labels{$attr->{id}} : defined $current_label ? $current_label : $p->get_phrase; } if ($tag eq "label") { $current_label = $p->get_phrase; $labels{ $attr->{for} } = $current_label if exists $attr->{for}; } elsif ($tag eq "/label") { $current_label = undef; } elsif ($tag eq "input") { my $type = delete $attr->{type} || "text"; $f->push_input($type, $attr, $verbose); } elsif ($tag eq "button") { my $type = delete $attr->{type} || "submit"; $f->push_input($type, $attr, $verbose); } elsif ($tag eq "textarea") { $attr->{textarea_value} = $attr->{value} if exists $attr->{value}; my $text = $p->get_text("/textarea"); $attr->{value} = $text; $f->push_input("textarea", $attr, $verbose); } elsif ($tag eq "select") { # rename attributes reserved to come for the option tag for ("value", "value_name") { $attr->{"select_$_"} = delete $attr->{$_} if exists $attr->{$_}; } # count this new select option separately my $name = $attr->{name}; $name = "" unless defined $name; $openselect{$name}++; while ($t = $p->get_tag) { my $tag = shift @$t; last if $tag eq "/select"; next if $tag =~ m,/?optgroup,; next if $tag eq "/option"; if ($tag eq "option") { my %a = %{$t->[0]}; # rename keys so they don't clash with %attr for (keys %a) { next if $_ eq "value"; $a{"option_$_"} = delete $a{$_}; } while (my($k,$v) = each %$attr) { $a{$k} = $v; } $a{value_name} = $p->get_trimmed_text; $a{value} = delete $a{value_name} unless defined $a{value}; $a{idx} = $openselect{$name}; $f->push_input("option", \%a, $verbose); } else { warn("Bad <select> tag '$tag' in $base_uri\n") if $verbose; if ($tag eq "/form" || $tag eq "input" || $tag eq "textarea" || $tag eq "select" || $tag eq "keygen") { # MSIE implicitly terminates the <select> here, so we # try to do the same. Actually the MSIE behaviour # appears really strange: <input> and <textarea> # do implicitly close, but not <select>, <keygen> or # </form>. my $type = ($tag =~ s,^/,,) ? "E" : "S"; $p->unget_token([$type, $tag, @$t]); last; } } } } elsif ($tag eq "keygen") { $f->push_input("keygen", $attr, $verbose); } } } elsif ($form_tags{$tag}) { warn("<$tag> outside <form> in $base_uri\n") if $verbose; } } for (@forms) { $_->fixup; } wantarray ? @forms : $forms[0]; } sub new { my $class = shift; my $self = bless {}, $class; $self->{method} = uc(shift || "GET"); $self->{action} = shift || Carp::croak("No action defined"); $self->{enctype} = lc(shift || "application/x-www-form-urlencoded"); $self->{accept_charset} = "UNKNOWN"; $self->{default_charset} = "UTF-8"; $self->{inputs} = [@_]; $self; } sub push_input { my($self, $type, $attr, $verbose) = @_; $type = lc $type; my $class = $type2class{$type}; unless ($class) { Carp::carp("Unknown input type '$type'") if $verbose; $class = "TextInput"; } $class = "HTML::Form::$class"; my @extra; push(@extra, readonly => 1) if $type eq "hidden"; push(@extra, strict => 1) if $self->{strict}; if ($type eq "file" && exists $attr->{value}) { # it's not safe to trust the value set by the server # the user always needs to explicitly set the names of files to upload $attr->{orig_value} = delete $attr->{value}; } delete $attr->{type}; # don't confuse the type argument my $input = $class->new(type => $type, %$attr, @extra); $input->add_to_form($self); } BEGIN { # Set up some accessors for (qw(method action enctype accept_charset)) { my $m = $_; no strict 'refs'; *{$m} = sub { my $self = shift; my $old = $self->{$m}; $self->{$m} = shift if @_; $old; }; } *uri = \&action; # alias } sub attr { my $self = shift; my $name = shift; return undef unless defined $name; my $old = $self->{attr}{$name}; $self->{attr}{$name} = shift if @_; return $old; } sub strict { my $self = shift; my $old = $self->{strict}; if (@_) { $self->{strict} = shift; for my $input (@{$self->{inputs}}) { $input->strict($self->{strict}); } } return $old; } sub inputs { my $self = shift; @{$self->{'inputs'}}; } sub find_input { my($self, $name, $type, $no) = @_; die "Invalid index $no" if defined $no && $no < 1; if (wantarray) { warn "find_input called in list context with index specified\n" if defined $no; my @res; my $c; for (@{$self->{'inputs'}}) { next if defined($name) && !$_->selected($name); next if $type && $type ne $_->{type}; $c++; next if $no && $no != $c; push(@res, $_); } return @res; } else { $no ||= 1; for (@{$self->{'inputs'}}) { next if defined($name) && !$_->selected($name); next if $type && $type ne $_->{type}; next if --$no; return $_; } return undef; } } sub fixup { my $self = shift; for (@{$self->{'inputs'}}) { $_->fixup; } } sub value { my $self = shift; my $key = shift; my $input = $self->find_input($key); unless ($input) { Carp::croak("No such field '$key'") if $self->{strict}; return undef unless @_; $input = $self->push_input("text", { name => $key, value => "" }); } local $Carp::CarpLevel = 1; $input->value(@_); } sub param { my $self = shift; if (@_) { my $name = shift; my @inputs; for ($self->inputs) { my $n = $_->name; next if !defined($n) || $n ne $name; push(@inputs, $_); } if (@_) { # set die "No '$name' parameter exists" unless @inputs; my @v = @_; @v = @{$v[0]} if @v == 1 && ref($v[0]); while (@v) { my $v = shift @v; my $err; for my $i (0 .. @inputs-1) { eval { $inputs[$i]->value($v); }; unless ($@) { undef($err); splice(@inputs, $i, 1); last; } $err ||= $@; } die $err if $err; } # the rest of the input should be cleared for (@inputs) { $_->value(undef); } } else { # get my @v; for (@inputs) { if (defined(my $v = $_->value)) { push(@v, $v); } } return wantarray ? @v : $v[0]; } } else { # list parameter names my @n; my %seen; for ($self->inputs) { my $n = $_->name; next if !defined($n) || $seen{$n}++; push(@n, $n); } return @n; } } sub try_others { my($self, $cb) = @_; my @try; for (@{$self->{'inputs'}}) { my @not_tried_yet = $_->other_possible_values; next unless @not_tried_yet; push(@try, [\@not_tried_yet, $_]); } return unless @try; $self->_try($cb, \@try, 0); } sub _try { my($self, $cb, $try, $i) = @_; for (@{$try->[$i][0]}) { $try->[$i][1]->value($_); &$cb($self); $self->_try($cb, $try, $i+1) if $i+1 < @$try; } } sub make_request { my $self = shift; my $method = uc $self->{'method'}; my $uri = $self->{'action'}; my $enctype = $self->{'enctype'}; my @form = $self->form; my $charset = $self->accept_charset eq "UNKNOWN" ? $self->{default_charset} : $self->accept_charset; foreach my $fi (@form) { $fi = Encode::encode($charset, $fi) unless ref($fi); } if ($method eq "GET") { require HTTP::Request; $uri = URI->new($uri, "http"); $uri->query_form(@form); return HTTP::Request->new(GET => $uri); } elsif ($method eq "POST") { require HTTP::Request::Common; return HTTP::Request::Common::POST($uri, \@form, Content_Type => $enctype); } else { Carp::croak("Unknown method '$method'"); } } sub click { my $self = shift; my $name; $name = shift if (@_ % 2) == 1; # odd number of arguments # try to find first submit button to activate for (@{$self->{'inputs'}}) { next unless $_->can("click"); next if $name && !$_->selected($name); next if $_->disabled; return $_->click($self, @_); } Carp::croak("No clickable input with name $name") if $name; $self->make_request; } sub form { my $self = shift; map { $_->form_name_value($self) } @{$self->{'inputs'}}; } sub dump { my $self = shift; my $method = $self->{'method'}; my $uri = $self->{'action'}; my $enctype = $self->{'enctype'}; my $dump = "$method $uri"; $dump .= " ($enctype)" if $enctype ne "application/x-www-form-urlencoded"; $dump .= " [$self->{attr}{name}]" if exists $self->{attr}{name}; $dump .= "\n"; for ($self->inputs) { $dump .= " " . $_->dump . "\n"; } print STDERR $dump unless defined wantarray; $dump; } #--------------------------------------------------- package HTML::Form::Input; sub new { my $class = shift; my $self = bless {@_}, $class; $self; } sub add_to_form { my($self, $form) = @_; push(@{$form->{'inputs'}}, $self); $self; } sub strict { my $self = shift; my $old = $self->{strict}; if (@_) { $self->{strict} = shift; } $old; } sub fixup {} sub type { shift->{type}; } sub name { my $self = shift; my $old = $self->{name}; $self->{name} = shift if @_; $old; } sub id { my $self = shift; my $old = $self->{id}; $self->{id} = shift if @_; $old; } sub class { my $self = shift; my $old = $self->{class}; $self->{class} = shift if @_; $old; } sub selected { my($self, $sel) = @_; return undef unless defined $sel; my $attr = $sel =~ s/^\^// ? "name" : $sel =~ s/^#// ? "id" : $sel =~ s/^\.// ? "class" : "name"; return 0 unless defined $self->{$attr}; return $self->{$attr} eq $sel; } sub value { my $self = shift; my $old = $self->{value}; $self->{value} = shift if @_; $old; } sub autocomplete { my $self = shift; my $old = $self->{autocomplete}; $self->{autocomplete} = shift if @_; $old; } sub possible_values { return; } sub other_possible_values { return; } sub value_names { return } sub readonly { my $self = shift; my $old = $self->{readonly}; $self->{readonly} = shift if @_; $old; } sub disabled { my $self = shift; my $old = $self->{disabled}; $self->{disabled} = shift if @_; $old; } sub form_name_value { my $self = shift; my $name = $self->{'name'}; return unless defined $name; return if $self->disabled; my $value = $self->value; return unless defined $value; return ($name => $value); } sub dump { my $self = shift; my $name = $self->name; $name = "<NONAME>" unless defined $name; my $value = $self->value; $value = "<UNDEF>" unless defined $value; my $dump = "$name=$value"; my $type = $self->type; $type .= " disabled" if $self->disabled; $type .= " readonly" if $self->readonly; return sprintf "%-30s %s", $dump, "($type)" unless $self->{menu}; my @menu; my $i = 0; for (@{$self->{menu}}) { my $opt = $_->{value}; $opt = "<UNDEF>" unless defined $opt; $opt .= "/$_->{name}" if defined $_->{name} && length $_->{name} && $_->{name} ne $opt; substr($opt,0,0) = "-" if $_->{disabled}; if (exists $self->{current} && $self->{current} == $i) { substr($opt,0,0) = "!" unless $_->{seen}; substr($opt,0,0) = "*"; } else { substr($opt,0,0) = ":" if $_->{seen}; } push(@menu, $opt); $i++; } return sprintf "%-30s %-10s %s", $dump, "($type)", "[" . join("|", @menu) . "]"; } #--------------------------------------------------- package HTML::Form::TextInput; @HTML::Form::TextInput::ISA=qw(HTML::Form::Input); #input/text #input/password #input/hidden #textarea sub value { my $self = shift; my $old = $self->{value}; $old = "" unless defined $old; if (@_) { Carp::croak("Input '$self->{name}' is readonly") if $self->{strict} && $self->{readonly}; my $new = shift; my $n = exists $self->{maxlength} ? $self->{maxlength} : undef; Carp::croak("Input '$self->{name}' has maxlength '$n'") if $self->{strict} && defined($n) && defined($new) && length($new) > $n; $self->{value} = $new; } $old; } #--------------------------------------------------- package HTML::Form::IgnoreInput; @HTML::Form::IgnoreInput::ISA=qw(HTML::Form::Input); #input/button #input/reset sub value { return } #--------------------------------------------------- package HTML::Form::ListInput; @HTML::Form::ListInput::ISA=qw(HTML::Form::Input); #select/option (val1, val2, ....) #input/radio (undef, val1, val2,...) #input/checkbox (undef, value) #select-multiple/option (undef, value) sub new { my $class = shift; my $self = $class->SUPER::new(@_); my $value = delete $self->{value}; my $value_name = delete $self->{value_name}; my $type = $self->{type}; if ($type eq "checkbox") { $value = "on" unless defined $value; $self->{menu} = [ { value => undef, name => "off", }, { value => $value, name => $value_name, }, ]; $self->{current} = (delete $self->{checked}) ? 1 : 0; ; } else { $self->{option_disabled}++ if $type eq "radio" && delete $self->{disabled}; $self->{menu} = [ {value => $value, name => $value_name}, ]; my $checked = $self->{checked} || $self->{option_selected}; delete $self->{checked}; delete $self->{option_selected}; if (exists $self->{multiple}) { unshift(@{$self->{menu}}, { value => undef, name => "off"}); $self->{current} = $checked ? 1 : 0; } else { $self->{current} = 0 if $checked; } } $self; } sub add_to_form { my($self, $form) = @_; my $type = $self->type; return $self->SUPER::add_to_form($form) if $type eq "checkbox"; if ($type eq "option" && exists $self->{multiple}) { $self->{disabled} ||= delete $self->{option_disabled}; return $self->SUPER::add_to_form($form); } die "Assert" if @{$self->{menu}} != 1; my $m = $self->{menu}[0]; $m->{disabled}++ if delete $self->{option_disabled}; my $prev = $form->find_input($self->{name}, $self->{type}, $self->{idx}); return $self->SUPER::add_to_form($form) unless $prev; # merge menus $prev->{current} = @{$prev->{menu}} if exists $self->{current}; push(@{$prev->{menu}}, $m); } sub fixup { my $self = shift; if ($self->{type} eq "option" && !(exists $self->{current})) { $self->{current} = 0; } $self->{menu}[$self->{current}]{seen}++ if exists $self->{current}; } sub disabled { my $self = shift; my $type = $self->type; my $old = $self->{disabled} || _menu_all_disabled(@{$self->{menu}}); if (@_) { my $v = shift; $self->{disabled} = $v; for (@{$self->{menu}}) { $_->{disabled} = $v; } } return $old; } sub _menu_all_disabled { for (@_) { return 0 unless $_->{disabled}; } return 1; } sub value { my $self = shift; my $old; $old = $self->{menu}[$self->{current}]{value} if exists $self->{current}; $old = $self->{value} if exists $self->{value}; if (@_) { my $i = 0; my $val = shift; my $cur; my $disabled; for (@{$self->{menu}}) { if ((defined($val) && defined($_->{value}) && $val eq $_->{value}) || (!defined($val) && !defined($_->{value})) ) { $cur = $i; $disabled = $_->{disabled}; last unless $disabled; } $i++; } if (!(defined $cur) || $disabled) { if (defined $val) { # try to search among the alternative names as well my $i = 0; my $cur_ignorecase; my $lc_val = lc($val); for (@{$self->{menu}}) { if (defined $_->{name}) { if ($val eq $_->{name}) { $disabled = $_->{disabled}; $cur = $i; last unless $disabled; } if (!defined($cur_ignorecase) && $lc_val eq lc($_->{name})) { $cur_ignorecase = $i; } } $i++; } unless (defined $cur) { $cur = $cur_ignorecase; if (defined $cur) { $disabled = $self->{menu}[$cur]{disabled}; } elsif ($self->{strict}) { my $n = $self->name; Carp::croak("Illegal value '$val' for field '$n'"); } } } elsif ($self->{strict}) { my $n = $self->name; Carp::croak("The '$n' field can't be unchecked"); } } if ($self->{strict} && $disabled) { my $n = $self->name; Carp::croak("The value '$val' has been disabled for field '$n'"); } if (defined $cur) { $self->{current} = $cur; $self->{menu}[$cur]{seen}++; delete $self->{value}; } else { $self->{value} = $val; delete $self->{current}; } } $old; } sub check { my $self = shift; $self->{current} = 1; $self->{menu}[1]{seen}++; } sub possible_values { my $self = shift; map $_->{value}, grep !$_->{disabled}, @{$self->{menu}}; } sub other_possible_values { my $self = shift; map $_->{value}, grep !$_->{seen} && !$_->{disabled}, @{$self->{menu}}; } sub value_names { my $self = shift; my @names; for (@{$self->{menu}}) { my $n = $_->{name}; $n = $_->{value} unless defined $n; push(@names, $n); } @names; } #--------------------------------------------------- package HTML::Form::SubmitInput; @HTML::Form::SubmitInput::ISA=qw(HTML::Form::Input); #input/image #input/submit sub click { my($self,$form,$x,$y) = @_; for ($x, $y) { $_ = 1 unless defined; } local($self->{clicked}) = [$x,$y]; return $form->make_request; } sub form_name_value { my $self = shift; return unless $self->{clicked}; return $self->SUPER::form_name_value(@_); } #--------------------------------------------------- package HTML::Form::ImageInput; @HTML::Form::ImageInput::ISA=qw(HTML::Form::SubmitInput); sub form_name_value { my $self = shift; my $clicked = $self->{clicked}; return unless $clicked; return if $self->{disabled}; my $name = $self->{name}; $name = (defined($name) && length($name)) ? "$name." : ""; return ("${name}x" => $clicked->[0], "${name}y" => $clicked->[1] ); } #--------------------------------------------------- package HTML::Form::FileInput; @HTML::Form::FileInput::ISA=qw(HTML::Form::TextInput); sub file { my $self = shift; $self->value(@_); } sub filename { my $self = shift; my $old = $self->{filename}; $self->{filename} = shift if @_; $old = $self->file unless defined $old; $old; } sub content { my $self = shift; my $old = $self->{content}; $self->{content} = shift if @_; $old; } sub headers { my $self = shift; my $old = $self->{headers} || []; $self->{headers} = [@_] if @_; @$old; } sub form_name_value { my($self, $form) = @_; return $self->SUPER::form_name_value($form) if $form->method ne "POST" || $form->enctype ne "multipart/form-data"; my $name = $self->name; return unless defined $name; return if $self->{disabled}; my $file = $self->file; my $filename = $self->filename; my @headers = $self->headers; my $content = $self->content; if (defined $content) { $filename = $file unless defined $filename; $file = undef; unshift(@headers, "Content" => $content); } elsif (!defined($file) || length($file) == 0) { return; } # legacy (this used to be the way to do it) if (ref($file) eq "ARRAY") { my $f = shift @$file; my $fn = shift @$file; push(@headers, @$file); $file = $f; $filename = $fn unless defined $filename; } return ($name => [$file, $filename, @headers]); } package HTML::Form::KeygenInput; @HTML::Form::KeygenInput::ISA=qw(HTML::Form::Input); sub challenge { my $self = shift; return $self->{challenge}; } sub keytype { my $self = shift; return lc($self->{keytype} || 'rsa'); } 1; __END__ =pod =encoding UTF-8 =head1 NAME HTML::Form - Class that represents an HTML form element =head1 VERSION version 6.07 =head1 SYNOPSIS use HTML::Form; $form = HTML::Form->parse($html, $base_uri); $form->value(query => "Perl"); use LWP::UserAgent; $ua = LWP::UserAgent->new; $response = $ua->request($form->click); =head1 DESCRIPTION Objects of the C<HTML::Form> class represents a single HTML C<E<lt>formE<gt> ... E<lt>/formE<gt>> instance. A form consists of a sequence of inputs that usually have names, and which can take on various values. The state of a form can be tweaked and it can then be asked to provide L<HTTP::Request> objects that can be passed to the request() method of L<LWP::UserAgent>. The following methods are available: =over 4 =item @forms = HTML::Form->parse( $html_document, $base_uri ) =item @forms = HTML::Form->parse( $html_document, base => $base_uri, %opt ) =item @forms = HTML::Form->parse( $response, %opt ) The parse() class method will parse an HTML document and build up C<HTML::Form> objects for each <form> element found. If called in scalar context only returns the first <form>. Returns an empty list if there are no forms to be found. The required arguments is the HTML document to parse ($html_document) and the URI used to retrieve the document ($base_uri). The base URI is needed to resolve relative action URIs. The provided HTML document should be a Unicode string (or US-ASCII). By default HTML::Form assumes that the original document was UTF-8 encoded and thus encode forms that don't specify an explicit I<accept-charset> as UTF-8. The charset assumed can be overridden by providing the C<charset> option to parse(). It's a good idea to be explicit about this parameter as well, thus the recommended simplest invocation becomes: my @forms = HTML::Form->parse( Encode::decode($encoding, $html_document_bytes), base => $base_uri, charset => $encoding, ); If the document was retrieved with LWP then the response object provide methods to obtain a proper value for C<base> and C<charset>: my $ua = LWP::UserAgent->new; my $response = $ua->get("http://www.example.com/form.html"); my @forms = HTML::Form->parse($response->decoded_content, base => $response->base, charset => $response->content_charset, ); In fact, the parse() method can parse from an L<HTTP::Response> object directly, so the example above can be more conveniently written as: my $ua = LWP::UserAgent->new; my $response = $ua->get("http://www.example.com/form.html"); my @forms = HTML::Form->parse($response); Note that any object that implements a decoded_content(), base() and content_charset() method with similar behaviour as L<HTTP::Response> will do. Additional options might be passed in to control how the parse method behaves. The following are all the options currently recognized: =over =item C<< base => $uri >> This is the URI used to retrieve the original document. This option is not optional ;-) =item C<< charset => $str >> Specify what charset the original document was encoded in. This is used as the default for accept_charset. If not provided this defaults to "UTF-8". =item C<< verbose => $bool >> Warn (print messages to STDERR) about any bad HTML form constructs found. You can trap these with $SIG{__WARN__}. The default is not to issue warnings. =item C<< strict => $bool >> Initialize any form objects with the given strict attribute. If the strict is turned on the methods that change values of the form will croak if you try to set illegal values or modify readonly fields. The default is not to be strict. =back =item $form->push_input( $type, \%attr, $verbose ) This method adds additional inputs to the form. The first argument is the type of input (e.g. hidden, option, etc.). The second argument is a reference to a hash of the input attributes. The third argument is optional, and will issue warnings about unknown input types. Example: push_input( 'hidden', { name => 'NewFormElement', id => 'NewFormElementId', value => 'some value', }); =item $method = $form->method =item $form->method( $new_method ) This method is gets/sets the I<method> name used for the L<HTTP::Request> generated. It is a string like "GET" or "POST". =item $action = $form->action =item $form->action( $new_action ) This method gets/sets the URI which we want to apply the request I<method> to. =item $enctype = $form->enctype =item $form->enctype( $new_enctype ) This method gets/sets the encoding type for the form data. It is a string like "application/x-www-form-urlencoded" or "multipart/form-data". =item $accept = $form->accept_charset =item $form->accept_charset( $new_accept ) This method gets/sets the list of charset encodings that the server processing the form accepts. Current implementation supports only one-element lists. Default value is "UNKNOWN" which we interpret as a request to use document charset as specified by the 'charset' parameter of the parse() method. =item $value = $form->attr( $name ) =item $form->attr( $name, $new_value ) This method give access to the original HTML attributes of the <form> tag. The $name should always be passed in lower case. Example: @f = HTML::Form->parse( $html, $foo ); @f = grep $_->attr("id") eq "foo", @f; die "No form named 'foo' found" unless @f; $foo = shift @f; =item $bool = $form->strict =item $form->strict( $bool ) Gets/sets the strict attribute of a form. If the strict is turned on the methods that change values of the form will croak if you try to set illegal values or modify readonly fields. The default is not to be strict. =item @inputs = $form->inputs This method returns the list of inputs in the form. If called in scalar context it returns the number of inputs contained in the form. See L</INPUTS> for what methods are available for the input objects returned. =item $input = $form->find_input( $selector ) =item $input = $form->find_input( $selector, $type ) =item $input = $form->find_input( $selector, $type, $index ) =item @inputs = $form->find_input( $selector ) =item @inputs = $form->find_input( $selector, $type ) This method is used to locate specific inputs within the form. All inputs that match the arguments given are returned. In scalar context only the first is returned, or C<undef> if none match. If $selector is not C<undef>, then the input's name, id, class attribute must match. A selector prefixed with '#' must match the id attribute of the input. A selector prefixed with '.' matches the class attribute. A selector prefixed with '^' or with no prefix matches the name attribute. If $type is not C<undef>, then the input must have the specified type. The following type names are used: "text", "password", "hidden", "textarea", "file", "image", "submit", "radio", "checkbox" and "option". The $index is the sequence number of the input matched where 1 is the first. If combined with $name and/or $type, then it selects the I<n>th input with the given name and/or type. =item $value = $form->value( $selector ) =item $form->value( $selector, $new_value ) The value() method can be used to get/set the value of some input. If strict is enabled and no input has the indicated name, then this method will croak. If multiple inputs have the same name, only the first one will be affected. The call: $form->value('foo') is basically a short-hand for: $form->find_input('foo')->value; =item @names = $form->param =item @values = $form->param( $name ) =item $form->param( $name, $value, ... ) =item $form->param( $name, \@values ) Alternative interface to examining and setting the values of the form. If called without arguments then it returns the names of all the inputs in the form. The names will not repeat even if multiple inputs have the same name. In scalar context the number of different names is returned. If called with a single argument then it returns the value or values of inputs with the given name. If called in scalar context only the first value is returned. If no input exists with the given name, then C<undef> is returned. If called with 2 or more arguments then it will set values of the named inputs. This form will croak if no inputs have the given name or if any of the values provided does not fit. Values can also be provided as a reference to an array. This form will allow unsetting all values with the given name as well. This interface resembles that of the param() function of the CGI module. =item $form->try_others( \&callback ) This method will iterate over all permutations of unvisited enumerated values (<select>, <radio>, <checkbox>) and invoke the callback for each. The callback is passed the $form as argument. The return value from the callback is ignored and the try_others() method itself does not return anything. =item $request = $form->make_request Will return an L<HTTP::Request> object that reflects the current setting of the form. You might want to use the click() method instead. =item $request = $form->click =item $request = $form->click( $selector ) =item $request = $form->click( $x, $y ) =item $request = $form->click( $selector, $x, $y ) Will "click" on the first clickable input (which will be of type C<submit> or C<image>). The result of clicking is an L<HTTP::Request> object that can then be passed to L<LWP::UserAgent> if you want to obtain the server response. If a $selector is specified, we will click on the first clickable input matching the selector, and the method will croak if no matching clickable input is found. If $selector is I<not> specified, then it is ok if the form contains no clickable inputs. In this case the click() method returns the same request as the make_request() method would do. See description of the find_input() method above for how the $selector is specified. If there are multiple clickable inputs with the same name, then there is no way to get the click() method of the C<HTML::Form> to click on any but the first. If you need this you would have to locate the input with find_input() and invoke the click() method on the given input yourself. A click coordinate pair can also be provided, but this only makes a difference if you clicked on an image. The default coordinate is (1,1). The upper-left corner of the image is (0,0), but some badly coded CGI scripts are known to not recognize this. Therefore (1,1) was selected as a safer default. =item @kw = $form->form Returns the current setting as a sequence of key/value pairs. Note that keys might be repeated, which means that some values might be lost if the return values are assigned to a hash. In scalar context this method returns the number of key/value pairs generated. =item $form->dump Returns a textual representation of current state of the form. Mainly useful for debugging. If called in void context, then the dump is printed on STDERR. =back =head1 INPUTS An C<HTML::Form> objects contains a sequence of I<inputs>. References to the inputs can be obtained with the $form->inputs or $form->find_input methods. Note that there is I<not> a one-to-one correspondence between input I<objects> and E<lt>inputE<gt> I<elements> in the HTML document. An input object basically represents a name/value pair, so when multiple HTML elements contribute to the same name/value pair in the submitted form they are combined. The input elements that are mapped one-to-one are "text", "textarea", "password", "hidden", "file", "image", "submit" and "checkbox". For the "radio" and "option" inputs the story is not as simple: All E<lt>input type="radio"E<gt> elements with the same name will contribute to the same input radio object. The number of radio input objects will be the same as the number of distinct names used for the E<lt>input type="radio"E<gt> elements. For a E<lt>selectE<gt> element without the C<multiple> attribute there will be one input object of type of "option". For a E<lt>select multipleE<gt> element there will be one input object for each contained E<lt>optionE<gt> element. Each one of these option objects will have the same name. The following methods are available for the I<input> objects: =over 4 =item $input->type Returns the type of this input. The type is one of the following strings: "text", "password", "hidden", "textarea", "file", "image", "submit", "radio", "checkbox" or "option". =item $name = $input->name =item $input->name( $new_name ) This method can be used to get/set the current name of the input. =item $input->id =item $input->class These methods can be used to get/set the current id or class attribute for the input. =item $input->selected( $selector ) Returns TRUE if the given selector matched the input. See the description of the find_input() method above for a description of the selector syntax. =item $value = $input->value =item $input->value( $new_value ) This method can be used to get/set the current value of an input. If strict is enabled and the input only can take an enumerated list of values, then it is an error to try to set it to something else and the method will croak if you try. You will also be able to set the value of read-only inputs, but a warning will be generated if running under C<perl -w>. =item $autocomplete = $input->autocomplete =item $input->autocomplete( $new_autocomplete ) This method can be used to get/set the current value (if any) of C<autcomplete> for the input. =item $input->possible_values Returns a list of all values that an input can take. For inputs that do not have discrete values, this returns an empty list. =item $input->other_possible_values Returns a list of all values not tried yet. =item $input->value_names For some inputs the values can have names that are different from the values themselves. The number of names returned by this method will match the number of values reported by $input->possible_values. When setting values using the value() method it is also possible to use the value names in place of the value itself. =item $bool = $input->readonly =item $input->readonly( $bool ) This method is used to get/set the value of the readonly attribute. You are allowed to modify the value of readonly inputs, but setting the value will generate some noise when warnings are enabled. Hidden fields always start out readonly. =item $bool = $input->disabled =item $input->disabled( $bool ) This method is used to get/set the value of the disabled attribute. Disabled inputs do not contribute any key/value pairs for the form value. =item $input->form_name_value Returns a (possible empty) list of key/value pairs that should be incorporated in the form value from this input. =item $input->check Some input types represent toggles that can be turned on/off. This includes "checkbox" and "option" inputs. Calling this method turns this input on without having to know the value name. If the input is already on, then nothing happens. This has the same effect as: $input->value($input->possible_values[1]); The input can be turned off with: $input->value(undef); =item $input->click($form, $x, $y) Some input types (currently "submit" buttons and "images") can be clicked to submit the form. The click() method returns the corresponding L<HTTP::Request> object. =back If the input is of type C<file>, then it has these additional methods: =over 4 =item $input->file This is just an alias for the value() method. It sets the filename to read data from. For security reasons this field will never be initialized from the parsing of a form. This prevents the server from triggering stealth uploads of arbitrary files from the client machine. =item $filename = $input->filename =item $input->filename( $new_filename ) This get/sets the filename reported to the server during file upload. This attribute defaults to the value reported by the file() method. =item $content = $input->content =item $input->content( $new_content ) This get/sets the file content provided to the server during file upload. This method can be used if you do not want the content to be read from an actual file. =item @headers = $input->headers =item input->headers($key => $value, .... ) This get/set additional header fields describing the file uploaded. This can for instance be used to set the C<Content-Type> reported for the file. =back =head1 SEE ALSO L<LWP>, L<LWP::UserAgent>, L<HTML::Parser> =head1 AUTHOR Gisle Aas <gisle@activestate.com> =head1 COPYRIGHT AND LICENSE This software is copyright (c) 1998 by Gisle Aas. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut Template/FAQ.pm 0000644 00000012646 00000000000 0007236 0 ustar 00 use strict; use warnings; package HTML::Template::FAQ; # ABSTRACT: Frequently Asked Questions about HTML::Template use Carp (); Carp::confess "you're not meant to use the FAQ, just read it!"; 1; __END__ =pod =head1 NAME HTML::Template::FAQ - Frequently Asked Questions about HTML::Template =head1 SYNOPSIS In the interest of greater understanding I've started a FAQ section of the perldocs. Please look in here before you send me email. =head1 FREQUENTLY ASKED QUESTIONS =head2 Is there a place to go to discuss HTML::Template and/or get help? There's a mailing-list for discussing L<HTML::Template> at html-template-users@lists.sourceforge.net. Join at: http://lists.sourceforge.net/lists/listinfo/html-template-users If you just want to get email when new releases are available you can join the announcements mailing-list here: http://lists.sourceforge.net/lists/listinfo/html-template-announce =head2 Is there a searchable archive for the mailing-list? Yes, you can find an archive of the SourceForge list here: http://dir.gmane.org/gmane.comp.lang.perl.modules.html-template =head2 I want support for <TMPL_XXX>! How about it? Maybe. I definitely encourage people to discuss their ideas for L<HTML::Template> on the mailing list. Please be ready to explain to me how the new tag fits in with HTML::Template's mission to provide a fast, lightweight system for using HTML templates. NOTE: Offering to program said addition and provide it in the form of a patch to the most recent version of L<HTML::Template> will definitely have a softening effect on potential opponents! =head2 I found a bug, can you fix it? That depends. Did you send me the VERSION of L<HTML::Template>, a test script and a test template? If so, then almost certainly. If you're feeling really adventurous, L<HTML::Template> is publicly available on GitHub (https://github.com/mpeters/html-template). Please feel free to fork it and send me a pull request with any changes you have. =head2 <TMPL_VAR>s from the main template aren't working inside a <TMPL_LOOP>! Why? This is the intended behavior. C<< <TMPL_LOOP> >> introduces a separate scope for C<< <TMPL_VAR>s >> much like a subroutine call in Perl introduces a separate scope for C<my> variables. If you want your C<< <TMPL_VAR> >>s to be global you can set the C<global_vars> option when you call C<new()>. See above for documentation of the C<global_vars> C<new()> option. =head2 How can I pre-load my templates using cache-mode and mod_perl? Add something like this to your startup.pl: use HTML::Template; use File::Find; print STDERR "Pre-loading HTML Templates...\n"; find( sub { return unless /\.tmpl$/; HTML::Template->new( filename => "$File::Find::dir/$_", cache => 1, ); }, '/path/to/templates', '/another/path/to/templates/' ); Note that you'll need to modify the C<return unless> line to specify the extension you use for your template files - I use F<.tmpl>, as you can see. You'll also need to specify the path to your template files. One potential problem: the F</path/to/templates/> must be B<EXACTLY> the same path you use when you call C<< HTML::Template->new() >>. Otherwise the cache won't know they're the same file and will load a new copy - instead getting a speed increase, you'll double your memory usage. To find out if this is happening set C<cache_debug => 1> in your application code and look for "CACHE MISS" messages in the logs. =head2 What characters are allowed in TMPL_* names? Numbers, letters, '.', '/', '+', '-' and '_'. =head2 How can I execute a program from inside my template? Short answer: you can't. Longer answer: you shouldn't since this violates the fundamental concept behind L<HTML::Template> - that design and code should be separate. But, inevitably some people still want to do it. If that describes you then you should take a look at L<HTML::Template::Expr>. Using L<HTML::Template::Expr> it should be easy to write a C<run_program()> function. Then you can do awful stuff like: <tmpl_var expr="run_program('foo.pl')"> Just, please, don't tell me about it. I'm feeling guilty enough just for writing L<HTML::Template::Expr> in the first place. =head2 What's the best way to create a <select> form element using HTML::Template? There is much disagreement on this issue. My personal preference is to use L<CGI.pm>'s excellent C<popup_menu()> and C<scrolling_list()> functions to fill in a single C<< <tmpl_var select_foo> >> variable. To some people this smacks of mixing HTML and code in a way that they hoped L<HTML::Template> would help them avoid. To them I'd say that HTML is a violation of the principle of separating design from programming. There's no clear separation between the programmatic elements of the C<< <form> >> tags and the layout of the C<< <form> >> tags. You'll have to draw the line somewhere - clearly the designer can't be entirely in charge of form creation. It's a balancing act and you have to weigh the pros and cons on each side. It is certainly possible to produce a C<< <select> >> element entirely inside the template. What you end up with is a rat's nest of loops and conditionals. Alternately you can give up a certain amount of flexibility in return for vastly simplifying your templates. I generally choose the latter. Another option is to investigate L<HTML::FillInForm> which some have reported success using to solve this problem. Tree.pm 0000644 00000013636 00000000000 0005753 0 ustar 00 package HTML::Tree; # ABSTRACT: build and scan parse-trees of HTML # HTML::Tree is basically just a happy alias to HTML::TreeBuilder. use warnings; use strict; our $VERSION = '5.07'; # VERSION from OurPkgVersion use HTML::TreeBuilder (); sub new { shift; unshift @_, 'HTML::TreeBuilder'; goto &HTML::TreeBuilder::new; } sub new_from_file { shift; unshift @_, 'HTML::TreeBuilder'; goto &HTML::TreeBuilder::new_from_file; } sub new_from_content { shift; unshift @_, 'HTML::TreeBuilder'; goto &HTML::TreeBuilder::new_from_content; } sub new_from_url { shift; unshift @_, 'HTML::TreeBuilder'; goto &HTML::TreeBuilder::new_from_url; } 1; __END__ =pod =head1 NAME HTML::Tree - build and scan parse-trees of HTML =head1 VERSION This document describes version 5.07 of HTML::Tree, released August 31, 2017 as part of L<HTML-Tree|HTML::Tree>. =head1 SYNOPSIS use HTML::TreeBuilder; my $tree = HTML::TreeBuilder->new(); $tree->parse_file($filename); # Then do something with the tree, using HTML::Element # methods -- for example: $tree->dump # Finally: $tree->delete; =head1 DESCRIPTION HTML-Tree is a suite of Perl modules for making parse trees out of HTML source. It consists of mainly two modules, whose documentation you should refer to: L<HTML::TreeBuilder|HTML::TreeBuilder> and L<HTML::Element|HTML::Element>. HTML::TreeBuilder is the module that builds the parse trees. (It uses HTML::Parser to do the work of breaking the HTML up into tokens.) The tree that TreeBuilder builds for you is made up of objects of the class HTML::Element. If you find that you do not properly understand the documentation for HTML::TreeBuilder and HTML::Element, it may be because you are unfamiliar with tree-shaped data structures, or with object-oriented modules in general. Sean Burke has written some articles for I<The Perl Journal> (C<www.tpj.com>) that seek to provide that background. The full text of those articles is contained in this distribution, as: =over 4 =item L<HTML::Tree::AboutObjects|HTML::Tree::AboutObjects> "User's View of Object-Oriented Modules" from TPJ17. =item L<HTML::Tree::AboutTrees|HTML::Tree::AboutTrees> "Trees" from TPJ18 =item L<HTML::Tree::Scanning|HTML::Tree::Scanning> "Scanning HTML" from TPJ19 =back Readers already familiar with object-oriented modules and tree-shaped data structures should read just the last article. Readers without that background should read the first, then the second, and then the third. =head1 METHODS All these methods simply redirect to the corresponding method in HTML::TreeBuilder. It's more efficient to use HTML::TreeBuilder directly, and skip loading HTML::Tree at all. =head2 new Redirects to L<HTML::TreeBuilder/new>. =head2 new_from_file Redirects to L<HTML::TreeBuilder/new_from_file>. =head2 new_from_content Redirects to L<HTML::TreeBuilder/new_from_content>. =head2 new_from_url Redirects to L<HTML::TreeBuilder/new_from_url>. =head1 SUPPORT You can find documentation for this module with the perldoc command. perldoc HTML::Tree You can also look for information at: =over 4 =item * AnnoCPAN: Annotated CPAN documentation L<http://annocpan.org/dist/HTML-Tree> =item * CPAN Ratings L<http://cpanratings.perl.org/d/HTML-Tree> =item * RT: CPAN's request tracker L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Tree> =item * Search CPAN L<http://search.cpan.org/dist/HTML-Tree> =item * Stack Overflow L<http://stackoverflow.com/questions/tagged/html-tree> If you have a question about how to use HTML-Tree, Stack Overflow is the place to ask it. Make sure you tag it both C<perl> and C<html-tree>. =back =head1 SEE ALSO L<HTML::TreeBuilder>, L<HTML::Element>, L<HTML::Tagset>, L<HTML::Parser>, L<HTML::DOMbo> The book I<Perl & LWP> by Sean M. Burke published by O'Reilly and Associates, 2002. ISBN: 0-596-00178-9 It has several chapters to do with HTML processing in general, and HTML-Tree specifically. There's more info at: http://www.oreilly.com/catalog/perllwp/ http://www.amazon.com/exec/obidos/ASIN/0596001789 =head1 SOURCE REPOSITORY HTML-Tree is now maintained using Git. The main public repository is L<< https://github.com/kentfredric/HTML-Tree >>. The best way to send a patch is to make a pull request there. =head1 ACKNOWLEDGEMENTS Thanks to Gisle Aas, Sean Burke and Andy Lester for their original work. Thanks to Chicago Perl Mongers (http://chicago.pm.org) for their patches submitted to HTML::Tree as part of the Phalanx project (http://qa.perl.org/phalanx). Thanks to the following people for additional patches and documentation: Terrence Brannon, Gordon Lack, Chris Madsen and Ricardo Signes. =head1 AUTHOR Current maintainers: =over =item * Christopher J. Madsen S<C<< <perl AT cjmweb.net> >>> =item * Jeff Fearn S<C<< <jfearn AT cpan.org> >>> =back Original HTML-Tree author: =over =item * Gisle Aas =back Former maintainers: =over =item * Sean M. Burke =item * Andy Lester =item * Pete Krawczyk S<C<< <petek AT cpan.org> >>> =back You can follow or contribute to HTML-Tree's development at L<< https://github.com/kentfredric/HTML-Tree >>. =head1 COPYRIGHT AND LICENSE Copyright 1995-1998 Gisle Aas, 1999-2004 Sean M. Burke, 2005 Andy Lester, 2006 Pete Krawczyk, 2010 Jeff Fearn, 2012 Christopher J. Madsen. (Except the articles contained in HTML::Tree::AboutObjects, HTML::Tree::AboutTrees, and HTML::Tree::Scanning, which are all copyright 2000 The Perl Journal.) Except for those three TPJ articles, the whole HTML-Tree distribution, of which this file is a part, is free software; you can redistribute it and/or modify it under the same terms as Perl itself. Those three TPJ articles may be distributed under the same terms as Perl itself. The programs in this library are distributed in the hope that they will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for a particular purpose. =cut Tagset.pm 0000644 00000031224 00000000000 0006274 0 ustar 00 package HTML::Tagset; use strict; =head1 NAME HTML::Tagset - data tables useful in parsing HTML =head1 VERSION Version 3.20 =cut use vars qw( $VERSION ); $VERSION = '3.20'; =head1 SYNOPSIS use HTML::Tagset; # Then use any of the items in the HTML::Tagset package # as need arises =head1 DESCRIPTION This module contains several data tables useful in various kinds of HTML parsing operations. Note that all tag names used are lowercase. In the following documentation, a "hashset" is a hash being used as a set -- the hash conveys that its keys are there, and the actual values associated with the keys are not significant. (But what values are there, are always true.) =cut use vars qw( $VERSION %emptyElement %optionalEndTag %linkElements %boolean_attr %isHeadElement %isBodyElement %isPhraseMarkup %is_Possible_Strict_P_Content %isHeadOrBodyElement %isList %isTableElement %isFormElement %isKnown %canTighten @p_closure_barriers %isCDATA_Parent ); =head1 VARIABLES Note that none of these variables are exported. =head2 hashset %HTML::Tagset::emptyElement This hashset has as values the tag-names (GIs) of elements that cannot have content. (For example, "base", "br", "hr".) So C<$HTML::Tagset::emptyElement{'hr'}> exists and is true. C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true. =cut %emptyElement = map {; $_ => 1 } qw(base link meta isindex img br hr wbr input area param embed bgsound spacer basefont col frame ~comment ~literal ~declaration ~pi ); # The "~"-initial names are for pseudo-elements used by HTML::Entities # and TreeBuilder =head2 hashset %HTML::Tagset::optionalEndTag This hashset lists tag-names for elements that can have content, but whose end-tags are generally, "safely", omissible. Example: C<$HTML::Tagset::emptyElement{'li'}> exists and is true. =cut %optionalEndTag = map {; $_ => 1 } qw(p li dt dd); # option th tr td); =head2 hash %HTML::Tagset::linkElements Values in this hash are tagnames for elements that might contain links, and the value for each is a reference to an array of the names of attributes whose values can be links. =cut %linkElements = ( 'a' => ['href'], 'applet' => ['archive', 'codebase', 'code'], 'area' => ['href'], 'base' => ['href'], 'bgsound' => ['src'], 'blockquote' => ['cite'], 'body' => ['background'], 'del' => ['cite'], 'embed' => ['pluginspage', 'src'], 'form' => ['action'], 'frame' => ['src', 'longdesc'], 'iframe' => ['src', 'longdesc'], 'ilayer' => ['background'], 'img' => ['src', 'lowsrc', 'longdesc', 'usemap'], 'input' => ['src', 'usemap'], 'ins' => ['cite'], 'isindex' => ['action'], 'head' => ['profile'], 'layer' => ['background', 'src'], 'link' => ['href'], 'object' => ['classid', 'codebase', 'data', 'archive', 'usemap'], 'q' => ['cite'], 'script' => ['src', 'for'], 'table' => ['background'], 'td' => ['background'], 'th' => ['background'], 'tr' => ['background'], 'xmp' => ['href'], ); =head2 hash %HTML::Tagset::boolean_attr This hash (not hashset) lists what attributes of what elements can be printed without showing the value (for example, the "noshade" attribute of "hr" elements). For elements with only one such attribute, its value is simply that attribute name. For elements with many such attributes, the value is a reference to a hashset containing all such attributes. =cut %boolean_attr = ( # TODO: make these all hashes 'area' => 'nohref', 'dir' => 'compact', 'dl' => 'compact', 'hr' => 'noshade', 'img' => 'ismap', 'input' => { 'checked' => 1, 'readonly' => 1, 'disabled' => 1 }, 'menu' => 'compact', 'ol' => 'compact', 'option' => 'selected', 'select' => 'multiple', 'td' => 'nowrap', 'th' => 'nowrap', 'ul' => 'compact', ); #========================================================================== # List of all elements from Extensible HTML version 1.0 Transitional DTD: # # a abbr acronym address applet area b base basefont bdo big # blockquote body br button caption center cite code col colgroup # dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 # head hr html i iframe img input ins isindex kbd label legend li # link map menu meta noframes noscript object ol optgroup option p # param pre q s samp script select small span strike strong style # sub sup table tbody td textarea tfoot th thead title tr tt u ul # var # # Varia from Mozilla source internal table of tags: # Implemented: # xmp listing wbr nobr frame frameset noframes ilayer # layer nolayer spacer embed multicol # But these are unimplemented: # sound?? keygen?? server?? # Also seen here and there: # marquee?? app?? (both unimplemented) #========================================================================== =head2 hashset %HTML::Tagset::isPhraseMarkup This hashset contains all phrasal-level elements. =cut %isPhraseMarkup = map {; $_ => 1 } qw( span abbr acronym q sub sup cite code em kbd samp strong var dfn strike b i u s tt small big a img br wbr nobr blink font basefont bdo spacer embed noembed ); # had: center, hr, table =head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content This hashset contains all phrasal-level elements that be content of a P element, for a strict model of HTML. =cut %is_Possible_Strict_P_Content = ( %isPhraseMarkup, %isFormElement, map {; $_ => 1} qw( object script map ) # I've no idea why there's these latter exceptions. # I'm just following the HTML4.01 DTD. ); #from html4 strict: #<!ENTITY % fontstyle "TT | I | B | BIG | SMALL"> # #<!ENTITY % phrase "EM | STRONG | DFN | CODE | # SAMP | KBD | VAR | CITE | ABBR | ACRONYM" > # #<!ENTITY % special # "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO"> # #<!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON"> # #<!-- %inline; covers inline or "text-level" elements --> #<!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;"> =head2 hashset %HTML::Tagset::isHeadElement This hashset contains all elements that elements that should be present only in the 'head' element of an HTML document. =cut %isHeadElement = map {; $_ => 1 } qw(title base link meta isindex script style object bgsound); =head2 hashset %HTML::Tagset::isList This hashset contains all elements that can contain "li" elements. =cut %isList = map {; $_ => 1 } qw(ul ol dir menu); =head2 hashset %HTML::Tagset::isTableElement This hashset contains all elements that are to be found only in/under a "table" element. =cut %isTableElement = map {; $_ => 1 } qw(tr td th thead tbody tfoot caption col colgroup); =head2 hashset %HTML::Tagset::isFormElement This hashset contains all elements that are to be found only in/under a "form" element. =cut %isFormElement = map {; $_ => 1 } qw(input select option optgroup textarea button label); =head2 hashset %HTML::Tagset::isBodyElement This hashset contains all elements that are to be found only in/under the "body" element of an HTML document. =cut %isBodyElement = map {; $_ => 1 } qw( h1 h2 h3 h4 h5 h6 p div pre plaintext address blockquote xmp listing center multicol iframe ilayer nolayer bgsound hr ol ul dir menu li dl dt dd ins del fieldset legend map area applet param object isindex script noscript table center form ), keys %isFormElement, keys %isPhraseMarkup, # And everything phrasal keys %isTableElement, ; =head2 hashset %HTML::Tagset::isHeadOrBodyElement This hashset includes all elements that I notice can fall either in the head or in the body. =cut %isHeadOrBodyElement = map {; $_ => 1 } qw(script isindex style object map area param noscript bgsound); # i.e., if we find 'script' in the 'body' or the 'head', don't freak out. =head2 hashset %HTML::Tagset::isKnown This hashset lists all known HTML elements. =cut %isKnown = (%isHeadElement, %isBodyElement, map{; $_=>1 } qw( head body html frame frameset noframes ~comment ~pi ~directive ~literal )); # that should be all known tags ever ever =head2 hashset %HTML::Tagset::canTighten This hashset lists elements that might have ignorable whitespace as children or siblings. =cut %canTighten = %isKnown; delete @canTighten{ keys(%isPhraseMarkup), 'input', 'select', 'xmp', 'listing', 'plaintext', 'pre', }; # xmp, listing, plaintext, and pre are untightenable, and # in a really special way. @canTighten{'hr','br'} = (1,1); # exceptional 'phrasal' things that ARE subject to tightening. # The one case where I can think of my tightening rules failing is: # <p>foo bar<center> <em>baz quux</em> ... # ^-- that would get deleted. # But that's pretty gruesome code anyhow. You gets what you pays for. #========================================================================== =head2 array @HTML::Tagset::p_closure_barriers This array has a meaning that I have only seen a need for in C<HTML::TreeBuilder>, but I include it here on the off chance that someone might find it of use: When we see a "E<lt>pE<gt>" token, we go lookup up the lineage for a p element we might have to minimize. At first sight, we might say that if there's a p anywhere in the lineage of this new p, it should be closed. But that's wrong. Consider this document: <html> <head> <title>foo</title> </head> <body> <p>foo <table> <tr> <td> foo <p>bar </td> </tr> </table> </p> </body> </html> The second p is quite legally inside a much higher p. My formalization of the reason why this is legal, but this: <p>foo<p>bar</p></p> isn't, is that something about the table constitutes a "barrier" to the application of the rule about what p must minimize. So C<@HTML::Tagset::p_closure_barriers> is the list of all such barrier-tags. =cut @p_closure_barriers = qw( li blockquote ul ol menu dir dl dt dd td th tr table caption div ); # In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this # monkey business of barriers to minimization! =head2 hashset %isCDATA_Parent This hashset includes all elements whose content is CDATA. =cut %isCDATA_Parent = map {; $_ => 1 } qw(script style xmp listing plaintext); # TODO: there's nothing else that takes CDATA children, right? # As the HTML3 DTD (Raggett 1995-04-24) noted: # The XMP, LISTING and PLAINTEXT tags are incompatible with SGML # and derive from very early versions of HTML. They require non- # standard parsers and will cause problems for processing # documents with standard SGML tools. =head1 CAVEATS You may find it useful to alter the behavior of modules (like C<HTML::Element> or C<HTML::TreeBuilder>) that use C<HTML::Tagset>'s data tables by altering the data tables themselves. You are welcome to try, but be careful; and be aware that different modules may or may react differently to the data tables being changed. Note that it may be inappropriate to use these tables for I<producing> HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames for all elements that can appear either in the head or in the body, such as "script". That doesn't mean that I am saying your code that produces HTML should feel free to put script elements in either place! If you are producing programs that spit out HTML, you should be I<intimately> familiar with the DTDs for HTML or XHTML (available at C<http://www.w3.org/>), and you should slavishly obey them, not the data tables in this document. =head1 SEE ALSO L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor> =head1 COPYRIGHT & LICENSE Copyright 1995-2000 Gisle Aas. Copyright 2000-2005 Sean M. Burke. Copyright 2005-2008 Andy Lester. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 ACKNOWLEDGEMENTS Most of the code/data in this module was adapted from code written by Gisle Aas for C<HTML::Element>, C<HTML::TreeBuilder>, and C<HTML::LinkExtor>. Then it was maintained by Sean M. Burke. =head1 AUTHOR Current maintainer: Andy Lester, C<< <andy at petdance.com> >> =head1 BUGS Please report any bugs or feature requests to C<bug-html-tagset at rt.cpan.org>, or through the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Tagset>. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. =cut 1; Element.pm 0000644 00000425350 00000000000 0006445 0 ustar 00 package HTML::Element; # ABSTRACT: Class for objects that represent HTML elements use strict; use warnings; our $VERSION = '5.07'; # VERSION from OurPkgVersion use Carp (); use HTML::Entities (); use HTML::Tagset (); use integer; # vroom vroom! # This controls encoding entities on output. # When set entities won't be re-encoded. # Defaulting off because parser defaults to unencoding entities our $encoded_content = 0; use vars qw($html_uc $Debug $ID_COUNTER $VERSION %list_type_to_sub); # Set up support for weak references, if possible: my $using_weaken; #=head1 CLASS METHODS sub Use_Weak_Refs { my $self_or_class = shift; if (@_) { # set $using_weaken = !! shift; # Normalize boolean value Carp::croak("The installed Scalar::Util lacks support for weak references") if $using_weaken and not defined &Scalar::Util::weaken; no warnings 'redefine'; *_weaken = $using_weaken ? \&Scalar::Util::weaken : sub ($) {}; } # end if setting value return $using_weaken; } # end Use_Weak_Refs BEGIN { # Attempt to import weaken from Scalar::Util, but don't complain # if we can't. Also, rename it to _weaken. require Scalar::Util; __PACKAGE__->Use_Weak_Refs(defined &Scalar::Util::weaken); } sub import { my $class = shift; for (@_) { if (/^-(no_?)?weak$/) { $class->Use_Weak_Refs(not $1); } else { Carp::croak("$_ is not exported by the $class module"); } } } # end import $Debug = 0 unless defined $Debug; #=head1 SUBROUTINES sub Version { Carp::carp("Deprecated subroutine HTML::Element::Version called"); $VERSION; } my $nillio = []; *HTML::Element::emptyElement = \%HTML::Tagset::emptyElement; # legacy *HTML::Element::optionalEndTag = \%HTML::Tagset::optionalEndTag; # legacy *HTML::Element::linkElements = \%HTML::Tagset::linkElements; # legacy *HTML::Element::boolean_attr = \%HTML::Tagset::boolean_attr; # legacy *HTML::Element::canTighten = \%HTML::Tagset::canTighten; # legacy # Constants for signalling back to the traverser: my $travsignal_package = __PACKAGE__ . '::_travsignal'; my ( $ABORT, $PRUNE, $PRUNE_SOFTLY, $OK, $PRUNE_UP ) = map { my $x = $_; bless \$x, $travsignal_package; } qw( ABORT PRUNE PRUNE_SOFTLY OK PRUNE_UP ); ## Comments from Father Chrysostomos RT #58880 ## The sole purpose for empty parentheses after a sub name is to make it ## parse as a 0-ary (nihilary?) function. I.e., ABORT+1 should parse as ## ABORT()+1, not ABORT(+1). The parentheses also tell perl that it can ### be inlined. ##Deparse is really useful for demonstrating this: ##$ perl -MO=Deparse,-p -e 'sub ABORT {7} print ABORT+8' # Vs # perl -MO=Deparse,-p -e 'sub ABORT() {7} print ABORT+8' # # With the parentheses, it not only makes it parse as a term. # It even resolves the constant at compile-time, making the code run faster. ## no critic sub ABORT () {$ABORT} sub PRUNE () {$PRUNE} sub PRUNE_SOFTLY () {$PRUNE_SOFTLY} sub OK () {$OK} sub PRUNE_UP () {$PRUNE_UP} ## use critic $html_uc = 0; # set to 1 if you want tag and attribute names from starttag and endtag # to be uc'd # regexs for XML names # http://www.w3.org/TR/2006/REC-xml11-20060816/NT-NameStartChar my $START_CHAR = qr/(?:\:|[A-Z]|_|[a-z]|[\x{C0}-\x{D6}]|[\x{D8}-\x{F6}]|[\x{F8}-\x{2FF}]|[\x{370}-\x{37D}]|[\x{37F}-\x{1FFF}]|[\x{200C}-\x{200D}]|[\x{2070}-\x{218F}]|[\x{2C00}-\x{2FEF}]|[\x{3001}-\x{D7FF}]|[\x{F900}-\x{FDCF}]|[\x{FDF0}-\x{FFFD}]|[\x{10000}-\x{EFFFF}])/; # http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameChar my $NAME_CHAR = qr/(?:$START_CHAR|-|\.|[0-9]|\x{B7}|[\x{0300}-\x{036F}]|[\x{203F}-\x{2040}])/; # Elements that does not have corresponding end tags (i.e. are empty) #========================================================================== #=head1 BASIC METHODS # # An HTML::Element is represented by blessed hash reference, much like # Tree::DAG_Node objects. Key-names not starting with '_' are reserved # for the SGML attributes of the element. # The following special keys are used: # # '_tag': The tag name (i.e., the generic identifier) # '_parent': A reference to the HTML::Element above (when forming a tree) # '_pos': The current position (a reference to a HTML::Element) is # where inserts will be placed (look at the insert_element # method) If not set, the implicit value is the object itself. # '_content': A ref to an array of nodes under this. # It might not be set. # # Example: <img src="gisle.jpg" alt="Gisle's photo"> is represented like this: # # bless { # _tag => 'img', # src => 'gisle.jpg', # alt => "Gisle's photo", # }, 'HTML::Element'; # sub new { my $class = shift; $class = ref($class) || $class; my $tag = shift; Carp::croak("No tagname") unless defined $tag and length $tag; Carp::croak "\"$tag\" isn't a good tag name!" if $tag =~ m/[<>\/\x00-\x20]/; # minimal sanity, certainly! my $self = bless { _tag => scalar( $class->_fold_case($tag) ) }, $class; my ( $attr, $val ); while ( ( $attr, $val ) = splice( @_, 0, 2 ) ) { ## RT #42209 why does this default to the attribute name and not remain unset or the empty string? $val = $attr unless defined $val; $self->{ $class->_fold_case($attr) } = $val; } if ( $tag eq 'html' ) { $self->{'_pos'} = undef; } _weaken($self->{'_parent'}) if $self->{'_parent'}; return $self; } sub attr { my $self = shift; my $attr = scalar( $self->_fold_case(shift) ); if (@_) { # set if ( defined $_[0] ) { my $old = $self->{$attr}; $self->{$attr} = $_[0]; return $old; } else { # delete, actually return delete $self->{$attr}; } } else { # get return $self->{$attr}; } } sub tag { my $self = shift; if (@_) { # set $self->{'_tag'} = $self->_fold_case( $_[0] ); } else { # get $self->{'_tag'}; } } sub parent { my $self = shift; if (@_) { # set Carp::croak "an element can't be made its own parent" if defined $_[0] and ref $_[0] and $self eq $_[0]; # sanity _weaken($self->{'_parent'} = $_[0]); } else { $self->{'_parent'}; # get } } sub content_list { return wantarray ? @{ shift->{'_content'} || return () } : scalar @{ shift->{'_content'} || return 0 }; } # a read-only method! can't say $h->content( [] )! sub content { return shift->{'_content'}; } sub content_array_ref { return shift->{'_content'} ||= []; } sub content_refs_list { return \( @{ shift->{'_content'} || return () } ); } sub implicit { return shift->attr( '_implicit', @_ ); } sub pos { my $self = shift; my $pos = $self->{'_pos'}; if (@_) { # set my $parm = shift; if ( defined $parm and $parm ne $self ) { $self->{'_pos'} = $parm; # means that element } else { $self->{'_pos'} = undef; # means $self } } return $pos if defined($pos); return $self; } sub all_attr { return %{ $_[0] }; # Yes, trivial. But no other way for the user to do the same # without breaking encapsulation. # And if our object representation changes, this method's behavior # should stay the same. } sub all_attr_names { return keys %{ $_[0] }; } sub all_external_attr { my $self = $_[0]; return map( ( length($_) && substr( $_, 0, 1 ) eq '_' ) ? () : ( $_, $self->{$_} ), keys %$self ); } sub all_external_attr_names { return grep !( length($_) && substr( $_, 0, 1 ) eq '_' ), keys %{ $_[0] }; } sub id { if ( @_ == 1 ) { return $_[0]{'id'}; } elsif ( @_ == 2 ) { if ( defined $_[1] ) { return $_[0]{'id'} = $_[1]; } else { return delete $_[0]{'id'}; } } else { Carp::croak '$node->id can\'t take ' . scalar(@_) . ' parameters!'; } } sub _gensym { unless ( defined $ID_COUNTER ) { # start it out... $ID_COUNTER = sprintf( '%04x', rand(0x1000) ); $ID_COUNTER =~ tr<0-9a-f><J-NP-Z>; # yes, skip letter "oh" $ID_COUNTER .= '00000'; } ++$ID_COUNTER; } sub idf { my $nparms = scalar @_; if ( $nparms == 1 ) { my $x; if ( defined( $x = $_[0]{'id'} ) and length $x ) { return $x; } else { return $_[0]{'id'} = _gensym(); } } if ( $nparms == 2 ) { if ( defined $_[1] ) { return $_[0]{'id'} = $_[1]; } else { return delete $_[0]{'id'}; } } Carp::croak '$node->idf can\'t take ' . scalar(@_) . ' parameters!'; } sub push_content { my $self = shift; return $self unless @_; my $content = ( $self->{'_content'} ||= [] ); for (@_) { if ( ref($_) eq 'ARRAY' ) { # magically call new_from_lol push @$content, $self->new_from_lol($_); _weaken($content->[-1]->{'_parent'} = $self); } elsif ( ref($_) ) { # insert an element $_->detach if $_->{'_parent'}; _weaken($_->{'_parent'} = $self); push( @$content, $_ ); } else { # insert text segment if ( @$content && !ref $content->[-1] ) { # last content element is also text segment -- append $content->[-1] .= $_; } else { push( @$content, $_ ); } } } return $self; } sub unshift_content { my $self = shift; return $self unless @_; my $content = ( $self->{'_content'} ||= [] ); for ( reverse @_ ) { # so they get added in the order specified if ( ref($_) eq 'ARRAY' ) { # magically call new_from_lol unshift @$content, $self->new_from_lol($_); _weaken($content->[0]->{'_parent'} = $self); } elsif ( ref $_ ) { # insert an element $_->detach if $_->{'_parent'}; _weaken($_->{'_parent'} = $self); unshift( @$content, $_ ); } else { # insert text segment if ( @$content && !ref $content->[0] ) { # last content element is also text segment -- prepend $content->[0] = $_ . $content->[0]; } else { unshift( @$content, $_ ); } } } return $self; } # Cf. splice ARRAY,OFFSET,LENGTH,LIST sub splice_content { my ( $self, $offset, $length, @to_add ) = @_; Carp::croak "splice_content requires at least one argument" if @_ < 2; # at least $h->splice_content($offset); my $content = ( $self->{'_content'} ||= [] ); # prep the list my @out; if ( @_ > 2 ) { # self, offset, length, ... foreach my $n (@to_add) { if ( ref($n) eq 'ARRAY' ) { $n = $self->new_from_lol($n); _weaken($n->{'_parent'} = $self); } elsif ( ref($n) ) { $n->detach; _weaken($n->{'_parent'} = $self); } } @out = splice @$content, $offset, $length, @to_add; } else { # self, offset @out = splice @$content, $offset; } foreach my $n (@out) { $n->{'_parent'} = undef if ref $n; } return @out; } sub detach { my $self = $_[0]; return undef unless ( my $parent = $self->{'_parent'} ); $self->{'_parent'} = undef; my $cohort = $parent->{'_content'} || return $parent; @$cohort = grep { not( ref($_) and $_ eq $self ) } @$cohort; # filter $self out, if parent has any evident content return $parent; } sub detach_content { my $c = $_[0]->{'_content'} || return (); # in case of no content for (@$c) { $_->{'_parent'} = undef if ref $_; } return splice @$c; } sub replace_with { my ( $self, @replacers ) = @_; Carp::croak "the target node has no parent" unless my ($parent) = $self->{'_parent'}; my $parent_content = $parent->{'_content'}; Carp::croak "the target node's parent has no content!?" unless $parent_content and @$parent_content; my $replacers_contains_self; for (@replacers) { if ( !ref $_ ) { # noop } elsif ( $_ eq $self ) { # noop, but check that it's there just once. Carp::croak "Replacement list contains several copies of target!" if $replacers_contains_self++; } elsif ( $_ eq $parent ) { Carp::croak "Can't replace an item with its parent!"; } elsif ( ref($_) eq 'ARRAY' ) { $_ = $self->new_from_lol($_); _weaken($_->{'_parent'} = $parent); } else { $_->detach; _weaken($_->{'_parent'} = $parent); # each of these are necessary } } # for @replacers @$parent_content = map { ( ref($_) and $_ eq $self ) ? @replacers : $_ } @$parent_content; $self->{'_parent'} = undef unless $replacers_contains_self; # if replacers does contain self, then the parent attribute is fine as-is return $self; } sub preinsert { my $self = shift; return $self unless @_; return $self->replace_with( @_, $self ); } sub postinsert { my $self = shift; return $self unless @_; return $self->replace_with( $self, @_ ); } sub replace_with_content { my $self = $_[0]; Carp::croak "the target node has no parent" unless my ($parent) = $self->{'_parent'}; my $parent_content = $parent->{'_content'}; Carp::croak "the target node's parent has no content!?" unless $parent_content and @$parent_content; my $content_r = $self->{'_content'} || []; @$parent_content = map { ( ref($_) and $_ eq $self ) ? @$content_r : $_ } @$parent_content; $self->{'_parent'} = undef; # detach $self from its parent # Update parentage link, removing from $self's content list for ( splice @$content_r ) { _weaken($_->{'_parent'} = $parent) if ref $_ } return $self; # note: doesn't destroy it. } sub delete_content { for ( splice @{ delete( $_[0]->{'_content'} ) # Deleting it here (while holding its value, for the moment) # will keep calls to detach() from trying to uselessly filter # the list (as they won't be able to see it once it's been # deleted) || return ( $_[0] ) # in case of no content }, 0 # the splice is so we can null the array too, just in case # something somewhere holds a ref to it ) { $_->delete if ref $_; } $_[0]; } # two handy aliases sub destroy { shift->delete(@_) } sub destroy_content { shift->delete_content(@_) } sub delete { my $self = $_[0]; $self->delete_content # recurse down if $self->{'_content'} && @{ $self->{'_content'} }; $self->detach if $self->{'_parent'} and $self->{'_parent'}{'_content'}; # not the typical case %$self = (); # null out the whole object on the way out return; } sub clone { #print "Cloning $_[0]\n"; my $it = shift; Carp::croak "clone() can be called only as an object method" unless ref $it; Carp::croak "clone() takes no arguments" if @_; my $new = bless {%$it}, ref($it); # COPY!!! HOOBOY! delete @$new{ '_content', '_parent', '_pos', '_head', '_body' }; # clone any contents if ( $it->{'_content'} and @{ $it->{'_content'} } ) { $new->{'_content'} = [ ref($it)->clone_list( @{ $it->{'_content'} } ) ]; for ( @{ $new->{'_content'} } ) { _weaken($_->{'_parent'} = $new) if ref $_; } } return $new; } sub clone_list { Carp::croak "clone_list can be called only as a class method" if ref shift @_; # all that does is get me here return map { ref($_) ? $_->clone # copy by method : $_ # copy by evaluation } @_; } sub normalize_content { my $start = $_[0]; my $c; return unless $c = $start->{'_content'} and ref $c and @$c; # nothing to do # TODO: if we start having text elements, deal with catenating those too? my @stretches = (undef); # start with a barrier # I suppose this could be rewritten to treat stretches as it goes, instead # of at the end. But feh. # Scan: for ( my $i = 0; $i < @$c; ++$i ) { if ( defined $c->[$i] and ref $c->[$i] ) { # not a text segment if ( $stretches[0] ) { # put in a barrier if ( $stretches[0][1] == 1 ) { #print "Nixing stretch at ", $i-1, "\n"; undef $stretches[0]; # nix the previous one-node "stretch" } else { #print "End of stretch at ", $i-1, "\n"; unshift @stretches, undef; } } # else no need for a barrier } else { # text segment $c->[$i] = '' unless defined $c->[$i]; if ( $stretches[0] ) { ++$stretches[0][1]; # increase length } else { #print "New stretch at $i\n"; unshift @stretches, [ $i, 1 ]; # start and length } } } # Now combine. Note that @stretches is in reverse order, so the indexes # still make sense as we work our way thru (i.e., backwards thru $c). foreach my $s (@stretches) { if ( $s and $s->[1] > 1 ) { #print "Stretch at ", $s->[0], " for ", $s->[1], "\n"; $c->[ $s->[0] ] .= join( '', splice( @$c, $s->[0] + 1, $s->[1] - 1 ) ) # append the subsequent ones onto the first one. } } return; } sub delete_ignorable_whitespace { # This doesn't delete all sorts of whitespace that won't actually # be used in rendering, tho -- that's up to the rendering application. # For example: # <input type='text' name='foo'> # [some whitespace] # <input type='text' name='bar'> # The WS between the two elements /will/ get used by the renderer. # But here: # <input type='hidden' name='foo' value='1'> # [some whitespace] # <input type='text' name='bar' value='2'> # the WS between them won't be rendered in any way, presumably. #my $Debug = 4; die "delete_ignorable_whitespace can be called only as an object method" unless ref $_[0]; print "About to tighten up...\n" if $Debug > 2; my (@to_do) = ( $_[0] ); # Start off. my ( $i, $sibs, $ptag, $this ); # scratch for the loop... while (@to_do) { if ( ( $ptag = ( $this = shift @to_do )->{'_tag'} ) eq 'pre' or $ptag eq 'textarea' or $HTML::Tagset::isCDATA_Parent{$ptag} ) { # block the traversal under those print "Blocking traversal under $ptag\n" if $Debug; next; } next unless ( $sibs = $this->{'_content'} and @$sibs ); for ( $i = $#$sibs; $i >= 0; --$i ) { # work backwards thru the list if ( ref $sibs->[$i] ) { unshift @to_do, $sibs->[$i]; # yes, this happens in pre order -- we're going backwards # thru this sibling list. I doubt it actually matters, tho. next; } next if $sibs->[$i] =~ m<[^\n\r\f\t ]>s; # it's /all/ whitespace print "Under $ptag whose canTighten ", "value is ", 0 + $HTML::Element::canTighten{$ptag}, ".\n" if $Debug > 3; # It's all whitespace... if ( $i == 0 ) { if ( @$sibs == 1 ) { # I'm an only child next unless $HTML::Element::canTighten{$ptag}; # parent } else { # I'm leftmost of many # if either my parent or sib are eligible, I'm good. next unless $HTML::Element::canTighten{$ptag} # parent or (ref $sibs->[1] and $HTML::Element::canTighten{ $sibs->[1] {'_tag'} } # right sib ); } } elsif ( $i == $#$sibs ) { # I'm rightmost of many # if either my parent or sib are eligible, I'm good. next unless $HTML::Element::canTighten{$ptag} # parent or (ref $sibs->[ $i - 1 ] and $HTML::Element::canTighten{ $sibs->[ $i - 1 ] {'_tag'} } # left sib ); } else { # I'm the piggy in the middle # My parent doesn't matter -- it all depends on my sibs next unless ref $sibs->[ $i - 1 ] or ref $sibs->[ $i + 1 ]; # if NEITHER sib is a node, quit next if # bailout condition: if BOTH are INeligible nodes # (as opposed to being text, or being eligible nodes) ref $sibs->[ $i - 1 ] and ref $sibs->[ $i + 1 ] and !$HTML::Element::canTighten{ $sibs->[ $i - 1 ] {'_tag'} } # left sib and !$HTML::Element::canTighten{ $sibs->[ $i + 1 ] {'_tag'} } # right sib ; } # Unknown tags aren't in canTighten and so AREN'T subject to tightening print " delendum: child $i of $ptag\n" if $Debug > 3; splice @$sibs, $i, 1; } # end of the loop-over-children } # end of the while loop. return; } sub insert_element { my ( $self, $tag, $implicit ) = @_; return $self->pos() unless $tag; # noop if nothing to insert my $e; if ( ref $tag ) { $e = $tag; $tag = $e->tag; } else { # just a tag name -- so make the element $e = $self->element_class->new($tag); ++( $self->{'_element_count'} ) if exists $self->{'_element_count'}; # undocumented. see TreeBuilder. } $e->{'_implicit'} = 1 if $implicit; my $pos = $self->{'_pos'}; $pos = $self unless defined $pos; $pos->push_content($e); $self->{'_pos'} = $pos = $e unless $self->_empty_element_map->{$tag} || $e->{'_empty_element'}; $pos; } #========================================================================== # Some things to override in XML::Element sub _empty_element_map { \%HTML::Element::emptyElement; } sub _fold_case_LC { if (wantarray) { shift; map lc($_), @_; } else { return lc( $_[1] ); } } sub _fold_case_NOT { if (wantarray) { shift; @_; } else { return $_[1]; } } *_fold_case = \&_fold_case_LC; #========================================================================== #=head1 DUMPING METHODS sub dump { my ( $self, $fh, $depth ) = @_; $fh = *STDOUT{IO} unless defined $fh; $depth = 0 unless defined $depth; print $fh " " x $depth, $self->starttag, " \@", $self->address, $self->{'_implicit'} ? " (IMPLICIT)\n" : "\n"; for ( @{ $self->{'_content'} } ) { if ( ref $_ ) { # element $_->dump( $fh, $depth + 1 ); # recurse } else { # text node print $fh " " x ( $depth + 1 ); if ( length($_) > 65 or m<[\x00-\x1F]> ) { # it needs prettyin' up somehow or other my $x = ( length($_) <= 65 ) ? $_ : ( substr( $_, 0, 65 ) . '...' ); $x =~ s<([\x00-\x1F])> <'\\x'.(unpack("H2",$1))>eg; print $fh qq{"$x"\n}; } else { print $fh qq{"$_"\n}; } } } } sub as_HTML { my ( $self, $entities, $indent, $omissible_map ) = @_; #my $indent_on = defined($indent) && length($indent); my @html = (); $omissible_map ||= \%HTML::Element::optionalEndTag; my $empty_element_map = $self->_empty_element_map; my $last_tag_tightenable = 0; my $this_tag_tightenable = 0; my $nonindentable_ancestors = 0; # count of nonindentible tags over us. my ( $tag, $node, $start, $depth ); # per-iteration scratch if ( defined($indent) && length($indent) ) { $self->traverse( sub { ( $node, $start, $depth ) = @_; if ( ref $node ) { # it's an element # detect bogus classes. RT #35948, #61673 $node->can('starttag') or Carp::confess( "Object of class " . ref($node) . " cannot be processed by HTML::Element" ); $tag = $node->{'_tag'}; if ($start) { # on the way in if (( $this_tag_tightenable = $HTML::Element::canTighten{$tag} ) and !$nonindentable_ancestors and $last_tag_tightenable ) { push @html, "\n", $indent x $depth, $node->starttag($entities), ; } else { push( @html, $node->starttag($entities) ); } $last_tag_tightenable = $this_tag_tightenable; ++$nonindentable_ancestors if $tag eq 'pre' or $tag eq 'textarea' or $HTML::Tagset::isCDATA_Parent{$tag}; } elsif ( not( $empty_element_map->{$tag} or $omissible_map->{$tag} ) ) { # on the way out if ( $tag eq 'pre' or $tag eq 'textarea' or $HTML::Tagset::isCDATA_Parent{$tag} ) { --$nonindentable_ancestors; $last_tag_tightenable = $HTML::Element::canTighten{$tag}; push @html, $node->endtag; } else { # general case if (( $this_tag_tightenable = $HTML::Element::canTighten{$tag} ) and !$nonindentable_ancestors and $last_tag_tightenable ) { push @html, "\n", $indent x $depth, $node->endtag, ; } else { push @html, $node->endtag; } $last_tag_tightenable = $this_tag_tightenable; #print "$tag tightenable: $this_tag_tightenable\n"; } } } else { # it's a text segment $last_tag_tightenable = 0; # I guess this is right HTML::Entities::encode_entities( $node, $entities ) # That does magic things if $entities is undef. unless ( ( defined($entities) && !length($entities) ) # If there's no entity to encode, don't call it || $HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} } # To keep from amp-escaping children of script et al. # That doesn't deal with descendants; but then, CDATA # parents shouldn't /have/ descendants other than a # text children (or comments?) || $encoded_content ); if ($nonindentable_ancestors) { push @html, $node; # say no go } else { if ($last_tag_tightenable) { $node =~ s<[\n\r\f\t ]+>< >s; #$node =~ s< $><>s; $node =~ s<^ ><>s; push @html, "\n", $indent x $depth, $node, #Text::Wrap::wrap($indent x $depth, $indent x $depth, "\n" . $node) ; } else { push @html, $node, #Text::Wrap::wrap('', $indent x $depth, $node) ; } } } 1; # keep traversing } ); # End of parms to traverse() } else { # no indenting -- much simpler code $self->traverse( sub { ( $node, $start ) = @_; if ( ref $node ) { # detect bogus classes. RT #35948 $node->isa( $self->element_class ) or Carp::confess( "Object of class " . ref($node) . " cannot be processed by HTML::Element" ); $tag = $node->{'_tag'}; if ($start) { # on the way in push( @html, $node->starttag($entities) ); } elsif ( not( $empty_element_map->{$tag} or $omissible_map->{$tag} ) ) { # on the way out push( @html, $node->endtag ); } } else { # simple text content HTML::Entities::encode_entities( $node, $entities ) # That does magic things if $entities is undef. unless ( ( defined($entities) && !length($entities) ) # If there's no entity to encode, don't call it || $HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} } # To keep from amp-escaping children of script et al. # That doesn't deal with descendants; but then, CDATA # parents shouldn't /have/ descendants other than a # text children (or comments?) || $encoded_content ); push( @html, $node ); } 1; # keep traversing } ); # End of parms to traverse() } if ( $self->{_store_declarations} && defined $self->{_decl} ) { unshift @html, sprintf "<!%s>\n", $self->{_decl}->{text}; } return join( '', @html ); } sub as_text { # Yet another iteratively implemented traverser my ( $this, %options ) = @_; my $skip_dels = $options{'skip_dels'} || 0; my (@pile) = ($this); my $tag; my $text = ''; while (@pile) { if ( !defined( $pile[0] ) ) { # undef! # no-op } elsif ( !ref( $pile[0] ) ) { # text bit! save it! $text .= shift @pile; } else { # it's a ref -- traverse under it unshift @pile, @{ $this->{'_content'} || $nillio } unless ( $tag = ( $this = shift @pile )->{'_tag'} ) eq 'style' or $tag eq 'script' or ( $skip_dels and $tag eq 'del' ); } } return $text; } # extra_chars added for RT #26436 sub as_trimmed_text { my ( $this, %options ) = @_; my $text = $this->as_text(%options); my $extra_chars = defined $options{'extra_chars'} ? $options{'extra_chars'} : ''; $text =~ s/[\n\r\f\t$extra_chars ]+$//s; $text =~ s/^[\n\r\f\t$extra_chars ]+//s; $text =~ s/[\n\r\f\t$extra_chars ]+/ /g; return $text; } sub as_text_trimmed { shift->as_trimmed_text(@_) } # alias, because I forget # TODO: make it wrap, if not indent? sub as_XML { # based an as_HTML my ($self) = @_; #my $indent_on = defined($indent) && length($indent); my @xml = (); my $empty_element_map = $self->_empty_element_map; my ( $tag, $node, $start ); # per-iteration scratch $self->traverse( sub { ( $node, $start ) = @_; if ( ref $node ) { # it's an element $tag = $node->{'_tag'}; if ($start) { # on the way in foreach my $attr ( $node->all_attr_names() ) { Carp::croak( "$tag has an invalid attribute name '$attr'") unless ( $attr eq '/' || $self->_valid_name($attr) ); } if ( $empty_element_map->{$tag} and !@{ $node->{'_content'} || $nillio } ) { push( @xml, $node->starttag_XML( undef, 1 ) ); } else { push( @xml, $node->starttag_XML(undef) ); } } else { # on the way out unless ( $empty_element_map->{$tag} and !@{ $node->{'_content'} || $nillio } ) { push( @xml, $node->endtag_XML() ); } # otherwise it will have been an <... /> tag. } } else { # it's just text _xml_escape($node); push( @xml, $node ); } 1; # keep traversing } ); join( '', @xml, "\n" ); } sub _xml_escape { # DESTRUCTIVE (a.k.a. "in-place") # Five required escapes: http://www.w3.org/TR/2006/REC-xml11-20060816/#syntax # We allow & if it's part of a valid escape already: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-references foreach my $x (@_) { # In strings with no encoded entities all & should be encoded. if ($encoded_content) { $x =~ s/&(?! # An ampersand that isn't followed by... (\#\d+; | # A hash mark, digits and semicolon, or \#x[\da-f]+; | # A hash mark, "x", hex digits and semicolon, or $START_CHAR$NAME_CHAR+; ) # A valid unicode entity name and semicolon )/&/gx; # Needs to be escaped to amp } else { $x =~ s/&/&/g; } # simple character escapes $x =~ s/</</g; $x =~ s/>/>/g; $x =~ s/"/"/g; $x =~ s/'/'/g; } return; } # NOTES: # # It's been suggested that attribute names be made :-keywords: # (:_tag "img" :border 0 :src "pie.png" :usemap "#main.map") # However, it seems that Scheme has no such data type as :-keywords. # So, for the moment at least, I tend toward simplicity, uniformity, # and universality, where everything a string or a list. sub as_Lisp_form { my @out; my $sub; my $depth = 0; my ( @list, $val ); $sub = sub { # Recursor my $self = $_[0]; @list = ( '_tag', $self->{'_tag'} ); @list = () unless defined $list[-1]; # unlikely for ( sort keys %$self ) { # predictable ordering next if $_ eq '_content' or $_ eq '_tag' or $_ eq '_parent' or $_ eq '/'; # Leave the other private attributes, I guess. push @list, $_, $val if defined( $val = $self->{$_} ); # and !ref $val; } for (@list) { # octal-escape it s<([^\x20\x21\x23\x27-\x5B\x5D-\x7E])> <sprintf('\\%03o',ord($1))>eg; $_ = qq{"$_"}; } push @out, ( ' ' x $depth ) . '(' . join ' ', splice @list; if ( @{ $self->{'_content'} || $nillio } ) { $out[-1] .= " \"_content\" (\n"; ++$depth; foreach my $c ( @{ $self->{'_content'} } ) { if ( ref($c) ) { # an element -- recurse $sub->($c); } else { # a text segment -- stick it in and octal-escape it push @out, $c; $out[-1] =~ s<([^\x20\x21\x23\x27-\x5B\x5D-\x7E])> <sprintf('\\%03o',ord($1))>eg; # And quote and indent it. $out[-1] .= "\"\n"; $out[-1] = ( ' ' x $depth ) . '"' . $out[-1]; } } --$depth; substr( $out[-1], -1 ) = "))\n"; # end of _content and of the element } else { $out[-1] .= ")\n"; } return; }; $sub->( $_[0] ); undef $sub; return join '', @out; } sub format { my ( $self, $formatter ) = @_; unless ( defined $formatter ) { # RECOMMEND PREREQ: HTML::FormatText require HTML::FormatText; $formatter = HTML::FormatText->new(); } $formatter->format($self); } sub starttag { my ( $self, $entities ) = @_; my $name = $self->{'_tag'}; return $self->{'text'} if $name eq '~literal'; return "<!" . $self->{'text'} . ">" if $name eq '~declaration'; return "<?" . $self->{'text'} . ">" if $name eq '~pi'; if ( $name eq '~comment' ) { if ( ref( $self->{'text'} || '' ) eq 'ARRAY' ) { # Does this ever get used? And is this right? return "<!" . join( ' ', map( "--$_--", @{ $self->{'text'} } ) ) . ">"; } else { return "<!--" . $self->{'text'} . "-->"; } } my $tag = $html_uc ? "<\U$name" : "<\L$name"; my $val; for ( sort keys %$self ) { # predictable ordering next if !length $_ or m/^_/s or $_ eq '/'; $val = $self->{$_}; next if !defined $val; # or ref $val; if ($_ eq $val && # if attribute is boolean, for this element exists( $HTML::Element::boolean_attr{$name} ) && (ref( $HTML::Element::boolean_attr{$name} ) ? $HTML::Element::boolean_attr{$name}{$_} : $HTML::Element::boolean_attr{$name} eq $_ ) ) { $tag .= $html_uc ? " \U$_" : " \L$_"; } else { # non-boolean attribute if ( ref $val eq 'HTML::Element' and $val->{_tag} eq '~literal' ) { $val = $val->{text}; } else { HTML::Entities::encode_entities( $val, $entities ) unless ( defined($entities) && !length($entities) || $encoded_content ); } $val = qq{"$val"}; $tag .= $html_uc ? qq{ \U$_\E=$val} : qq{ \L$_\E=$val}; } } # for keys if ( scalar $self->content_list == 0 && $self->_empty_element_map->{ $self->tag } ) { return $tag . " />"; } else { return $tag . ">"; } } sub starttag_XML { my ($self) = @_; # and a third parameter to signal emptiness? my $name = $self->{'_tag'}; return $self->{'text'} if $name eq '~literal'; return '<!' . $self->{'text'} . '>' if $name eq '~declaration'; return "<?" . $self->{'text'} . "?>" if $name eq '~pi'; if ( $name eq '~comment' ) { if ( ref( $self->{'text'} || '' ) eq 'ARRAY' ) { # Does this ever get used? And is this right? $name = join( ' ', @{ $self->{'text'} } ); } else { $name = $self->{'text'}; } $name =~ s/--/--/g; # can't have double --'s in XML comments return "<!--$name-->"; } my $tag = "<$name"; my $val; for ( sort keys %$self ) { # predictable ordering next if !length $_ or m/^_/s or $_ eq '/'; # Hm -- what to do if val is undef? # I suppose that shouldn't ever happen. next if !defined( $val = $self->{$_} ); # or ref $val; _xml_escape($val); $tag .= qq{ $_="$val"}; } @_ == 3 ? "$tag />" : "$tag>"; } sub endtag { $html_uc ? "</\U$_[0]->{'_tag'}>" : "</\L$_[0]->{'_tag'}>"; } sub endtag_XML { "</$_[0]->{'_tag'}>"; } #========================================================================== # This, ladies and germs, is an iterative implementation of a # recursive algorithm. DON'T TRY THIS AT HOME. # Basically, the algorithm says: # # To traverse: # 1: pre-order visit this node # 2: traverse any children of this node # 3: post-order visit this node, unless it's a text segment, # or a prototypically empty node (like "br", etc.) # Add to that the consideration of the callbacks' return values, # so you can block visitation of the children, or siblings, or # abort the whole excursion, etc. # # So, why all this hassle with making the code iterative? # It makes for real speed, because it eliminates the whole # hassle of Perl having to allocate scratch space for each # instance of the recursive sub. Since the algorithm # is basically simple (and not all recursive ones are!) and # has few necessary lexicals (basically just the current node's # content list, and the current position in it), it was relatively # straightforward to store that information not as the frame # of a sub, but as a stack, i.e., a simple Perl array (well, two # of them, actually: one for content-listrefs, one for indexes of # current position in each of those). my $NIL = []; sub traverse { my ( $start, $callback, $ignore_text ) = @_; Carp::croak "traverse can be called only as an object method" unless ref $start; Carp::croak('must provide a callback for traverse()!') unless defined $callback and ref $callback; # Elementary type-checking: my ( $c_pre, $c_post ); if ( UNIVERSAL::isa( $callback, 'CODE' ) ) { $c_pre = $c_post = $callback; } elsif ( UNIVERSAL::isa( $callback, 'ARRAY' ) ) { ( $c_pre, $c_post ) = @$callback; Carp::croak( "pre-order callback \"$c_pre\" is true but not a coderef!") if $c_pre and not UNIVERSAL::isa( $c_pre, 'CODE' ); Carp::croak( "pre-order callback \"$c_post\" is true but not a coderef!") if $c_post and not UNIVERSAL::isa( $c_post, 'CODE' ); return $start unless $c_pre or $c_post; # otherwise there'd be nothing to actually do! } else { Carp::croak("$callback is not a known kind of reference") unless ref($callback); } my $empty_element_map = $start->_empty_element_map; my (@C) = [$start]; # a stack containing lists of children my (@I) = (-1); # initial value must be -1 for each list # a stack of indexes to current position in corresponding lists in @C # In each of these, 0 is the active point # scratch: my ($rv, # return value of callback $this, # current node $content_r, # child list of $this ); # THE BIG LOOP while (@C) { # Move to next item in this frame if ( !defined( $I[0] ) or ++$I[0] >= @{ $C[0] } ) { # We either went off the end of this list, or aborted the list # So call the post-order callback: if ( $c_post and defined $I[0] and @C > 1 # to keep the next line from autovivifying and defined( $this = $C[1][ $I[1] ] ) # sanity, and # suppress callbacks on exiting the fictional top frame and ref($this) # sanity and not( $this->{'_empty_element'} || ( $empty_element_map->{ $this->{'_tag'} || '' } && !@{ $this->{'_content'} } ) # RT #49932 ) # things that don't get post-order callbacks ) { shift @I; shift @C; #print "Post! at depth", scalar(@I), "\n"; $rv = $c_post->( #map $_, # copy to avoid any messiness $this, # 0: this 0, # 1: startflag (0 for post-order call) @I - 1, # 2: depth ); if ( defined($rv) and ref($rv) eq $travsignal_package ) { $rv = $$rv; #deref if ( $rv eq 'ABORT' ) { last; # end of this excursion! } elsif ( $rv eq 'PRUNE' ) { # NOOP on post!! } elsif ( $rv eq 'PRUNE_SOFTLY' ) { # NOOP on post!! } elsif ( $rv eq 'OK' ) { # noop } elsif ( $rv eq 'PRUNE_UP' ) { $I[0] = undef; } else { die "Unknown travsignal $rv\n"; # should never happen } } } else { shift @I; shift @C; } next; } $this = $C[0][ $I[0] ]; if ($c_pre) { if ( defined $this and ref $this ) { # element $rv = $c_pre->( #map $_, # copy to avoid any messiness $this, # 0: this 1, # 1: startflag (1 for pre-order call) @I - 1, # 2: depth ); } else { # text segment next if $ignore_text; $rv = $c_pre->( #map $_, # copy to avoid any messiness $this, # 0: this 1, # 1: startflag (1 for pre-order call) @I - 1, # 2: depth $C[1][ $I[1] ], # 3: parent # And there will always be a $C[1], since # we can't start traversing at a text node $I[0] # 4: index of self in parent's content list ); } if ( not $rv ) { # returned false. Same as PRUNE. next; # prune } elsif ( ref($rv) eq $travsignal_package ) { $rv = $$rv; # deref if ( $rv eq 'ABORT' ) { last; # end of this excursion! } elsif ( $rv eq 'PRUNE' ) { next; } elsif ( $rv eq 'PRUNE_SOFTLY' ) { if (ref($this) and not( $this->{'_empty_element'} || $empty_element_map->{ $this->{'_tag'} || '' } ) ) { # push a dummy empty content list just to trigger a post callback unshift @I, -1; unshift @C, $NIL; } next; } elsif ( $rv eq 'OK' ) { # noop } elsif ( $rv eq 'PRUNE_UP' ) { $I[0] = undef; next; # equivalent of last'ing out of the current child list. # Used to have PRUNE_UP_SOFTLY and ABORT_SOFTLY here, but the code # for these was seriously upsetting, served no particularly clear # purpose, and could not, I think, be easily implemented with a # recursive routine. All bad things! } else { die "Unknown travsignal $rv\n"; # should never happen } } # else fall thru to meaning same as \'OK'. } # end of pre-order calling # Now queue up content list for the current element... if (ref $this and not( # ...except for those which... not( $content_r = $this->{'_content'} and @$content_r ) # ...have empty content lists... and $this->{'_empty_element'} || $empty_element_map->{ $this->{'_tag'} || '' } # ...and that don't get post-order callbacks ) ) { unshift @I, -1; unshift @C, $content_r || $NIL; #print $this->{'_tag'}, " ($this) adds content_r ", $C[0], "\n"; } } return $start; } sub is_inside { my $self = shift; return 0 unless @_; # if no items specified, I guess this is right. my $current = $self; # the loop starts by looking at the given element if (scalar @_ == 1) { while ( defined $current and ref $current ) { return 1 if $current eq $_[0] || $current->{'_tag'} eq $_[0]; $current = $current->{'_parent'}; } return 0; } else { my %elements = map { $_ => 1 } @_; while ( defined $current and ref $current ) { return 1 if $elements{$current} || $elements{ $current->{'_tag'} }; $current = $current->{'_parent'}; } } return 0; } sub is_empty { my $self = shift; !$self->{'_content'} || !@{ $self->{'_content'} }; } sub pindex { my $self = shift; my $parent = $self->{'_parent'} || return undef; my $pc = $parent->{'_content'} || return undef; for ( my $i = 0; $i < @$pc; ++$i ) { return $i if ref $pc->[$i] and $pc->[$i] eq $self; } return undef; # we shouldn't ever get here } #-------------------------------------------------------------------------- sub left { Carp::croak "left() is supposed to be an object method" unless ref $_[0]; my $pc = ( $_[0]->{'_parent'} || return )->{'_content'} || die "parent is childless?"; die "parent is childless" unless @$pc; return if @$pc == 1; # I'm an only child if (wantarray) { my @out; foreach my $j (@$pc) { return @out if ref $j and $j eq $_[0]; push @out, $j; } } else { for ( my $i = 0; $i < @$pc; ++$i ) { return $i ? $pc->[ $i - 1 ] : undef if ref $pc->[$i] and $pc->[$i] eq $_[0]; } } die "I'm not in my parent's content list?"; return; } sub right { Carp::croak "right() is supposed to be an object method" unless ref $_[0]; my $pc = ( $_[0]->{'_parent'} || return )->{'_content'} || die "parent is childless?"; die "parent is childless" unless @$pc; return if @$pc == 1; # I'm an only child if (wantarray) { my ( @out, $seen ); foreach my $j (@$pc) { if ($seen) { push @out, $j; } else { $seen = 1 if ref $j and $j eq $_[0]; } } die "I'm not in my parent's content list?" unless $seen; return @out; } else { for ( my $i = 0; $i < @$pc; ++$i ) { return +( $i == $#$pc ) ? undef : $pc->[ $i + 1 ] if ref $pc->[$i] and $pc->[$i] eq $_[0]; } die "I'm not in my parent's content list?"; return; } } #-------------------------------------------------------------------------- sub address { if ( @_ == 1 ) { # report-address form return join( '.', reverse( # so it starts at the top map( $_->pindex() || '0', # so that root's undef -> '0' $_[0], # self and... $_[0]->lineage ) ) ); } else { # get-node-at-address my @stack = split( /\./, $_[1] ); my $here; if ( @stack and !length $stack[0] ) { # relative addressing $here = $_[0]; shift @stack; } else { # absolute addressing return undef unless 0 == shift @stack; # pop the initial 0-for-root $here = $_[0]->root; } while (@stack) { return undef unless $here->{'_content'} and @{ $here->{'_content'} } > $stack[0]; # make sure the index isn't too high $here = $here->{'_content'}[ shift @stack ]; return undef if @stack and not ref $here; # we hit a text node when we expected a non-terminal element node } return $here; } } sub depth { my $here = $_[0]; my $depth = 0; while ( defined( $here = $here->{'_parent'} ) and ref($here) ) { ++$depth; } return $depth; } sub root { my $here = my $root = shift; while ( defined( $here = $here->{'_parent'} ) and ref($here) ) { $root = $here; } return $root; } sub lineage { my $here = shift; my @lineage; while ( defined( $here = $here->{'_parent'} ) and ref($here) ) { push @lineage, $here; } return @lineage; } sub lineage_tag_names { my $here = my $start = shift; my @lineage_names; while ( defined( $here = $here->{'_parent'} ) and ref($here) ) { push @lineage_names, $here->{'_tag'}; } return @lineage_names; } sub descendents { shift->descendants(@_) } sub descendants { my $start = shift; if (wantarray) { my @descendants; $start->traverse( [ # pre-order sub only sub { push( @descendants, $_[0] ); return 1; }, undef # no post ], 1, # ignore text ); shift @descendants; # so $self doesn't appear in the list return @descendants; } else { # just returns a scalar my $descendants = -1; # to offset $self being counted $start->traverse( [ # pre-order sub only sub { ++$descendants; return 1; }, undef # no post ], 1, # ignore text ); return $descendants; } } sub find { shift->find_by_tag_name(@_) } # yup, a handy alias sub find_by_tag_name { my (@pile) = shift(@_); # start out the to-do stack for the traverser Carp::croak "find_by_tag_name can be called only as an object method" unless ref $pile[0]; return () unless @_; my (@tags) = $pile[0]->_fold_case(@_); my ( @matching, $this, $this_tag ); while (@pile) { $this_tag = ( $this = shift @pile )->{'_tag'}; foreach my $t (@tags) { if ( $t eq $this_tag ) { if (wantarray) { push @matching, $this; last; } else { return $this; } } } unshift @pile, grep ref($_), @{ $this->{'_content'} || next }; } return @matching if wantarray; return; } sub find_by_attribute { # We could limit this to non-internal attributes, but hey. my ( $self, $attribute, $value ) = @_; Carp::croak "Attribute must be a defined value!" unless defined $attribute; $attribute = $self->_fold_case($attribute); my @matching; my $wantarray = wantarray; my $quit; $self->traverse( [ # pre-order only sub { if ( exists $_[0]{$attribute} and $_[0]{$attribute} eq $value ) { push @matching, $_[0]; return HTML::Element::ABORT unless $wantarray; # only take the first } 1; # keep traversing }, undef # no post ], 1, # yes, ignore text nodes. ); if ($wantarray) { return @matching; } else { return $matching[0]; } } #-------------------------------------------------------------------------- sub look_down { ref( $_[0] ) or Carp::croak "look_down works only as an object method"; my @criteria; for ( my $i = 1; $i < @_; ) { Carp::croak "Can't use undef as an attribute name" unless defined $_[$i]; if ( ref $_[$i] ) { Carp::croak "A " . ref( $_[$i] ) . " value is not a criterion" unless ref $_[$i] eq 'CODE'; push @criteria, $_[ $i++ ]; } else { Carp::croak "param list to look_down ends in a key!" if $i == $#_; push @criteria, [ scalar( $_[0]->_fold_case( $_[$i] ) ), defined( $_[ $i + 1 ] ) ? ( ( ref $_[ $i + 1 ] ? $_[ $i + 1 ] : lc( $_[ $i + 1 ] ) ), ref( $_[ $i + 1 ] ) ) # yes, leave that LC! : undef ]; $i += 2; } } Carp::croak "No criteria?" unless @criteria; my (@pile) = ( $_[0] ); my ( @matching, $val, $this ); Node: while ( defined( $this = shift @pile ) ) { # Yet another traverser implemented with merely iterative code. foreach my $c (@criteria) { if ( ref($c) eq 'CODE' ) { next Node unless $c->($this); # jump to the continue block } else { # it's an attr-value pair next Node # jump to the continue block if # two values are unequal if: ( defined( $val = $this->{ $c->[0] } ) ) ? ( !defined $c->[ 1 ] # actual is def, critval is undef => fail # allow regex matching # allow regex matching or ( $c->[2] eq 'Regexp' ? $val !~ $c->[1] : ( ref $val ne $c->[2] # have unequal ref values => fail or lc($val) ne lc( $c->[1] ) # have unequal lc string values => fail ) ) ) : ( defined $c->[1] ) # actual is undef, critval is def => fail } } # We make it this far only if all the criteria passed. return $this unless wantarray; push @matching, $this; } continue { unshift @pile, grep ref($_), @{ $this->{'_content'} || $nillio }; } return @matching if wantarray; return; } sub look_up { ref( $_[0] ) or Carp::croak "look_up works only as an object method"; my @criteria; for ( my $i = 1; $i < @_; ) { Carp::croak "Can't use undef as an attribute name" unless defined $_[$i]; if ( ref $_[$i] ) { Carp::croak "A " . ref( $_[$i] ) . " value is not a criterion" unless ref $_[$i] eq 'CODE'; push @criteria, $_[ $i++ ]; } else { Carp::croak "param list to look_up ends in a key!" if $i == $#_; push @criteria, [ scalar( $_[0]->_fold_case( $_[$i] ) ), defined( $_[ $i + 1 ] ) ? ( ( ref $_[ $i + 1 ] ? $_[ $i + 1 ] : lc( $_[ $i + 1 ] ) ), ref( $_[ $i + 1 ] ) ) : undef # Yes, leave that LC! ]; $i += 2; } } Carp::croak "No criteria?" unless @criteria; my ( @matching, $val ); my $this = $_[0]; Node: while (1) { # You'll notice that the code here is almost the same as for look_down. foreach my $c (@criteria) { if ( ref($c) eq 'CODE' ) { next Node unless $c->($this); # jump to the continue block } else { # it's an attr-value pair next Node # jump to the continue block if # two values are unequal if: ( defined( $val = $this->{ $c->[0] } ) ) ? ( !defined $c->[ 1 ] # actual is def, critval is undef => fail or ( $c->[2] eq 'Regexp' ? $val !~ $c->[1] : ( ref $val ne $c->[2] # have unequal ref values => fail or lc($val) ne $c->[1] # have unequal lc string values => fail ) ) ) : ( defined $c->[1] ) # actual is undef, critval is def => fail } } # We make it this far only if all the criteria passed. return $this unless wantarray; push @matching, $this; } continue { last unless defined( $this = $this->{'_parent'} ) and ref $this; } return @matching if wantarray; return; } #-------------------------------------------------------------------------- sub attr_get_i { if ( @_ > 2 ) { my $self = shift; Carp::croak "No attribute names can be undef!" if grep !defined($_), @_; my @attributes = $self->_fold_case(@_); if (wantarray) { my @out; foreach my $x ( $self, $self->lineage ) { push @out, map { exists( $x->{$_} ) ? $x->{$_} : () } @attributes; } return @out; } else { foreach my $x ( $self, $self->lineage ) { foreach my $attribute (@attributes) { return $x->{$attribute} if exists $x->{$attribute}; # found } } return; # never found } } else { # Single-attribute search. Simpler, most common, so optimize # for the most common case Carp::croak "Attribute name must be a defined value!" unless defined $_[1]; my $self = $_[0]; my $attribute = $self->_fold_case( $_[1] ); if (wantarray) { # list context return map { exists( $_->{$attribute} ) ? $_->{$attribute} : () } $self, $self->lineage; } else { # scalar context foreach my $x ( $self, $self->lineage ) { return $x->{$attribute} if exists $x->{$attribute}; # found } return; # never found } } } sub tagname_map { my (@pile) = $_[0]; # start out the to-do stack for the traverser Carp::croak "find_by_tag_name can be called only as an object method" unless ref $pile[0]; my ( %map, $this_tag, $this ); while (@pile) { $this_tag = '' unless defined( $this_tag = ( $this = shift @pile )->{'_tag'} ) ; # dance around the strange case of having an undef tagname. push @{ $map{$this_tag} ||= [] }, $this; # add to map unshift @pile, grep ref($_), @{ $this->{'_content'} || next }; # traverse } return \%map; } sub extract_links { my $start = shift; my %wantType; @wantType{ $start->_fold_case(@_) } = (1) x @_; # if there were any my $wantType = scalar(@_); my @links; # TODO: add xml:link? my ( $link_attrs, $tag, $self, $val ); # scratch for each iteration $start->traverse( [ sub { # pre-order call only $self = $_[0]; $tag = $self->{'_tag'}; return 1 if $wantType && !$wantType{$tag}; # if we're selective if (defined( $link_attrs = $HTML::Element::linkElements{$tag} ) ) { # If this is a tag that has any link attributes, # look over possibly present link attributes, # saving the value, if found. for ( ref($link_attrs) ? @$link_attrs : $link_attrs ) { if ( defined( $val = $self->attr($_) ) ) { push( @links, [ $val, $self, $_, $tag ] ); } } } 1; # return true, so we keep recursing }, undef ], 1, # ignore text nodes ); \@links; } sub simplify_pres { my $pre = 0; my $sub; my $line; $sub = sub { ++$pre if $_[0]->{'_tag'} eq 'pre'; foreach my $it ( @{ $_[0]->{'_content'} || return } ) { if ( ref $it ) { $sub->($it); # recurse! } elsif ($pre) { #$it =~ s/(?:(?:\cm\cj*)|(?:\cj))/\n/g; $it = join "\n", map { ; $line = $_; while ( $line =~ s/^([^\t]*)(\t+)/$1.(" " x ((length($2)<<3)-(length($1)&7)))/e # Sort of adapted from Text::Tabs -- yes, it's hardwired-in that # tabs are at every EIGHTH column. ) { } $line; } split /(?:(?:\cm\cj*)|(?:\cj))/, $it, -1; } } --$pre if $_[0]->{'_tag'} eq 'pre'; return; }; $sub->( $_[0] ); undef $sub; return; } sub same_as { die 'same_as() takes only one argument: $h->same_as($i)' unless @_ == 2; my ( $h, $i ) = @_[ 0, 1 ]; die "same_as() can be called only as an object method" unless ref $h; return 0 unless defined $i and ref $i; # An element can't be same_as anything but another element! # They needn't be of the same class, tho. return 1 if $h eq $i; # special (if rare) case: anything is the same as... itself! # assumes that no content lists in/under $h or $i contain subsequent # text segments, like: ['foo', ' bar'] # compare attributes now. #print "Comparing tags of $h and $i...\n"; return 0 unless $h->{'_tag'} eq $i->{'_tag'}; # only significant attribute whose name starts with "_" #print "Comparing attributes of $h and $i...\n"; # Compare attributes, but only the real ones. { # Bear in mind that the average element has very few attributes, # and that element names are rather short. # (Values are a different story.) # XXX I would think that /^[^_]/ would be faster, at least easier to read. my @keys_h = sort grep { length $_ and substr( $_, 0, 1 ) ne '_' } keys %$h; my @keys_i = sort grep { length $_ and substr( $_, 0, 1 ) ne '_' } keys %$i; return 0 unless @keys_h == @keys_i; # different number of real attributes? they're different. for ( my $x = 0; $x < @keys_h; ++$x ) { return 0 unless $keys_h[$x] eq $keys_i[$x] and # same key name $h->{ $keys_h[$x] } eq $i->{ $keys_h[$x] }; # same value # Should this test for definedness on values? # People shouldn't be putting undef in attribute values, I think. } } #print "Comparing children of $h and $i...\n"; my $hcl = $h->{'_content'} || []; my $icl = $i->{'_content'} || []; return 0 unless @$hcl == @$icl; # different numbers of children? they're different. if (@$hcl) { # compare each of the children: for ( my $x = 0; $x < @$hcl; ++$x ) { if ( ref $hcl->[$x] ) { return 0 unless ref( $icl->[$x] ); # an element can't be the same as a text segment # Both elements: return 0 unless $hcl->[$x]->same_as( $icl->[$x] ); # RECURSE! } else { return 0 if ref( $icl->[$x] ); # a text segment can't be the same as an element # Both text segments: return 0 unless $hcl->[$x] eq $icl->[$x]; } } } return 1; # passed all the tests! } sub new_from_lol { my $class = shift; $class = ref($class) || $class; # calling as an object method is just the same as ref($h)->new_from_lol(...) my $lol = $_[1]; my @ancestor_lols; # So we can make sure there's no cyclicities in this lol. # That would be perverse, but one never knows. my ( $sub, $k, $v, $node ); # last three are scratch values $sub = sub { #print "Building for $_[0]\n"; my $lol = $_[0]; return unless @$lol; my ( @attributes, @children ); Carp::croak "Cyclicity detected in source LOL tree, around $lol?!?" if grep( $_ eq $lol, @ancestor_lols ); push @ancestor_lols, $lol; my $tag_name = 'null'; # Recursion in in here: for ( my $i = 0; $i < @$lol; ++$i ) { # Iterate over children if ( ref( $lol->[$i] ) eq 'ARRAY' ) { # subtree: most common thing in loltree push @children, $sub->( $lol->[$i] ); } elsif ( !ref( $lol->[$i] ) ) { if ( $i == 0 ) { # name $tag_name = $lol->[$i]; Carp::croak "\"$tag_name\" isn't a good tag name!" if $tag_name =~ m/[<>\/\x00-\x20]/ ; # minimal sanity, certainly! } else { # text segment child push @children, $lol->[$i]; } } elsif ( ref( $lol->[$i] ) eq 'HASH' ) { # attribute hashref keys %{ $lol->[$i] }; # reset the each-counter, just in case while ( ( $k, $v ) = each %{ $lol->[$i] } ) { push @attributes, $class->_fold_case($k), $v if defined $v and $k ne '_name' and $k ne '_content' and $k ne '_parent'; # enforce /some/ sanity! } } elsif ( UNIVERSAL::isa( $lol->[$i], __PACKAGE__ ) ) { if ( $lol->[$i]->{'_parent'} ) { # if claimed #print "About to clone ", $lol->[$i], "\n"; push @children, $lol->[$i]->clone(); } else { push @children, $lol->[$i]; # if unclaimed... #print "Claiming ", $lol->[$i], "\n"; $lol->[$i]->{'_parent'} = 1; # claim it NOW # This WILL be replaced by the correct value once we actually # construct the parent, just after the end of this loop... } } else { Carp::croak "new_from_lol doesn't handle references of type " . ref( $lol->[$i] ); } } pop @ancestor_lols; $node = $class->new($tag_name); #print "Children: @children\n"; if ( $class eq __PACKAGE__ ) { # Special-case it, for speed: %$node = ( %$node, @attributes ) if @attributes; #print join(' ', $node, ' ' , map("<$_>", %$node), "\n"); if (@children) { $node->{'_content'} = \@children; foreach my $c (@children) { _weaken($c->{'_parent'} = $node) if ref $c; } } } else { # Do it the clean way... #print "Done neatly\n"; while (@attributes) { $node->attr( splice @attributes, 0, 2 ) } $node->push_content( map { _weaken($_->{'_parent'} = $node) if ref $_; $_ } @children ) if @children; } return $node; }; # End of sub definition. if (wantarray) { my (@nodes) = map { ; ( ref($_) eq 'ARRAY' ) ? $sub->($_) : $_ } @_; # Let text bits pass thru, I guess. This makes this act more like # unshift_content et al. Undocumented. undef $sub; # so it won't be in its own frame, so its refcount can hit 0 return @nodes; } else { Carp::croak "new_from_lol in scalar context needs exactly one lol" unless @_ == 1; return $_[0] unless ref( $_[0] ) eq 'ARRAY'; # used to be a fatal error. still undocumented tho. $node = $sub->( $_[0] ); undef $sub; # so it won't be in its own frame, so its refcount can hit 0 return $node; } } sub objectify_text { my (@stack) = ( $_[0] ); my ($this); while (@stack) { foreach my $c ( @{ ( $this = shift @stack )->{'_content'} } ) { if ( ref($c) ) { unshift @stack, $c; # visit it later. } else { $c = $this->element_class->new( '~text', 'text' => $c, '_parent' => $this ); } } } return; } sub deobjectify_text { my (@stack) = ( $_[0] ); my ($old_node); if ( $_[0]{'_tag'} eq '~text' ) { # special case # Puts the $old_node variable to a different purpose if ( $_[0]{'_parent'} ) { $_[0]->replace_with( $old_node = delete $_[0]{'text'} )->delete; } else { # well, that's that, then! $old_node = delete $_[0]{'text'}; } if ( ref( $_[0] ) eq __PACKAGE__ ) { # common case %{ $_[0] } = (); # poof! } else { # play nice: delete $_[0]{'_parent'}; $_[0]->delete; } return '' unless defined $old_node; # sanity! return $old_node; } while (@stack) { foreach my $c ( @{ ( shift @stack )->{'_content'} } ) { if ( ref($c) ) { if ( $c->{'_tag'} eq '~text' ) { $c = ( $old_node = $c )->{'text'}; if ( ref($old_node) eq __PACKAGE__ ) { # common case %$old_node = (); # poof! } else { # play nice: delete $old_node->{'_parent'}; $old_node->delete; } } else { unshift @stack, $c; # visit it later. } } } } return undef; } { # The next three subs are basically copied from Number::Latin, # based on a one-liner by Abigail. Yes, I could simply require that # module, and a Roman numeral module too, but really, HTML-Tree already # has enough dependecies as it is; and anyhow, I don't need the functions # that do latin2int or roman2int. no integer; sub _int2latin { return unless defined $_[0]; return '0' if $_[0] < 1 and $_[0] > -1; return '-' . _i2l( abs int $_[0] ) if $_[0] <= -1; # tolerate negatives return _i2l( int $_[0] ); } sub _int2LATIN { # just the above plus uc return unless defined $_[0]; return '0' if $_[0] < 1 and $_[0] > -1; return '-' . uc( _i2l( abs int $_[0] ) ) if $_[0] <= -1; # tolerate negs return uc( _i2l( int $_[0] ) ); } my @alpha = ( 'a' .. 'z' ); sub _i2l { # the real work my $int = $_[0] || return ""; _i2l( int( ( $int - 1 ) / 26 ) ) . $alpha[ $int % 26 - 1 ]; # yes, recursive # Yes, 26 => is (26 % 26 - 1), which is -1 => Z! } } { # And now, some much less impressive Roman numerals code: my (@i) = ( '', qw(I II III IV V VI VII VIII IX) ); my (@x) = ( '', qw(X XX XXX XL L LX LXX LXXX XC) ); my (@c) = ( '', qw(C CC CCC CD D DC DCC DCCC CM) ); my (@m) = ( '', qw(M MM MMM) ); sub _int2ROMAN { my ( $i, $pref ); return '0' if 0 == ( $i = int( $_[0] || 0 ) ); # zero is a special case return $i + 0 if $i <= -4000 or $i >= 4000; # Because over 3999 would require non-ASCII chars, like D-with-)-inside if ( $i < 0 ) { # grumble grumble tolerate negatives grumble $pref = '-'; $i = abs($i); } else { $pref = ''; # normal case } my ( $x, $c, $m ) = ( 0, 0, 0 ); if ( $i >= 10 ) { $x = $i / 10; $i %= 10; if ( $x >= 10 ) { $c = $x / 10; $x %= 10; if ( $c >= 10 ) { $m = $c / 10; $c %= 10; } } } #print "m$m c$c x$x i$i\n"; return join( '', $pref, $m[$m], $c[$c], $x[$x], $i[$i] ); } sub _int2roman { lc( _int2ROMAN( $_[0] ) ) } } sub _int2int { $_[0] } # dummy %list_type_to_sub = ( 'I' => \&_int2ROMAN, 'i' => \&_int2roman, 'A' => \&_int2LATIN, 'a' => \&_int2latin, '1' => \&_int2int, ); sub number_lists { my (@stack) = ( $_[0] ); my ( $this, $tag, $counter, $numberer ); # scratch while (@stack) { # yup, pre-order-traverser idiom if ( ( $tag = ( $this = shift @stack )->{'_tag'} ) eq 'ol' ) { # Prep some things: $counter = ( ( $this->{'start'} || '' ) =~ m<^\s*(\d{1,7})\s*$>s ) ? $1 : 1; $numberer = $list_type_to_sub{ $this->{'type'} || '' } || $list_type_to_sub{'1'}; # Immeditately iterate over all children foreach my $c ( @{ $this->{'_content'} || next } ) { next unless ref $c; unshift @stack, $c; if ( $c->{'_tag'} eq 'li' ) { $counter = $1 if ( ( $c->{'value'} || '' ) =~ m<^\s*(\d{1,7})\s*$>s ); $c->{'_bullet'} = $numberer->($counter) . '.'; ++$counter; } } } elsif ( $tag eq 'ul' or $tag eq 'dir' or $tag eq 'menu' ) { # Immeditately iterate over all children foreach my $c ( @{ $this->{'_content'} || next } ) { next unless ref $c; unshift @stack, $c; $c->{'_bullet'} = '*' if $c->{'_tag'} eq 'li'; } } else { foreach my $c ( @{ $this->{'_content'} || next } ) { unshift @stack, $c if ref $c; } } } return; } sub has_insane_linkage { my @pile = ( $_[0] ); my ( $c, $i, $p, $this ); # scratch # Another iterative traverser; this time much simpler because # only in pre-order: my %parent_of = ( $_[0], 'TOP-OF-SCAN' ); while (@pile) { $this = shift @pile; $c = $this->{'_content'} || next; return ( $this, "_content attribute is true but nonref." ) unless ref($c) eq 'ARRAY'; next unless @$c; for ( $i = 0; $i < @$c; ++$i ) { return ( $this, "Child $i is undef" ) unless defined $c->[$i]; if ( ref( $c->[$i] ) ) { return ( $c->[$i], "appears in its own content list" ) if $c->[$i] eq $this; return ( $c->[$i], "appears twice in the tree: once under $this, once under $parent_of{$c->[$i]}" ) if exists $parent_of{ $c->[$i] }; $parent_of{ $c->[$i] } = '' . $this; # might as well just use the stringification of it. return ( $c->[$i], "_parent attribute is wrong (not defined)" ) unless defined( $p = $c->[$i]{'_parent'} ); return ( $c->[$i], "_parent attribute is wrong (nonref)" ) unless ref($p); return ( $c->[$i], "_parent attribute is wrong (is $p; should be $this)" ) unless $p eq $this; } } unshift @pile, grep ref($_), @$c; # queue up more things on the pile stack } return; #okay } sub _asserts_fail { # to be run on trusted documents only my (@pile) = ( $_[0] ); my ( @errors, $this, $id, $assert, $parent, $rv ); while (@pile) { $this = shift @pile; if ( defined( $assert = $this->{'assert'} ) ) { $id = ( $this->{'id'} ||= $this->address ) ; # don't use '0' as an ID, okay? unless ( ref($assert) ) { package main; ## no critic $assert = $this->{'assert'} = ( $assert =~ m/\bsub\b/ ? eval($assert) : eval("sub { $assert\n}") ); ## use critic if ($@) { push @errors, [ $this, "assertion at $id broke in eval: $@" ]; $assert = $this->{'assert'} = sub { }; } } $parent = $this->{'_parent'}; $rv = undef; eval { $rv = $assert->( $this, $this->{'_tag'}, $this->{'_id'}, # 0,1,2 $parent ? ( $parent, $parent->{'_tag'}, $parent->{'id'} ) : () # 3,4,5 ); }; if ($@) { push @errors, [ $this, "assertion at $id died: $@" ]; } elsif ( !$rv ) { push @errors, [ $this, "assertion at $id failed" ]; } # else OK } push @pile, grep ref($_), @{ $this->{'_content'} || next }; } return @errors; } ## _valid_name # validate XML style attribute names # http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name sub _valid_name { my $self = shift; my $attr = shift or Carp::croak("sub valid_name requires an attribute name"); return (0) unless ( $attr =~ /^$START_CHAR$NAME_CHAR+$/ ); return (1); } sub element_class { $_[0]->{_element_class} || __PACKAGE__; } 1; 1; __END__ =pod =head1 NAME HTML::Element - Class for objects that represent HTML elements =head1 VERSION This document describes version 5.07 of HTML::Element, released August 31, 2017 as part of L<HTML-Tree|HTML::Tree>. =head1 SYNOPSIS use HTML::Element; $a = HTML::Element->new('a', href => 'http://www.perl.com/'); $a->push_content("The Perl Homepage"); $tag = $a->tag; print "$tag starts out as:", $a->starttag, "\n"; print "$tag ends as:", $a->endtag, "\n"; print "$tag\'s href attribute is: ", $a->attr('href'), "\n"; $links_r = $a->extract_links(); print "Hey, I found ", scalar(@$links_r), " links.\n"; print "And that, as HTML, is: ", $a->as_HTML, "\n"; $a = $a->delete; =head1 DESCRIPTION (This class is part of the L<HTML::Tree|HTML::Tree> dist.) Objects of the HTML::Element class can be used to represent elements of HTML document trees. These objects have attributes, notably attributes that designates each element's parent and content. The content is an array of text segments and other HTML::Element objects. A tree with HTML::Element objects as nodes can represent the syntax tree for a HTML document. =head1 HOW WE REPRESENT TREES Consider this HTML document: <html lang='en-US'> <head> <title>Stuff</title> <meta name='author' content='Jojo'> </head> <body> <h1>I like potatoes!</h1> </body> </html> Building a syntax tree out of it makes a tree-structure in memory that could be diagrammed as: html (lang='en-US') / \ / \ / \ head body /\ \ / \ \ / \ \ title meta h1 | (name='author', | "Stuff" content='Jojo') "I like potatoes" This is the traditional way to diagram a tree, with the "root" at the top, and it's this kind of diagram that people have in mind when they say, for example, that "the meta element is under the head element instead of under the body element". (The same is also said with "inside" instead of "under" -- the use of "inside" makes more sense when you're looking at the HTML source.) Another way to represent the above tree is with indenting: html (attributes: lang='en-US') head title "Stuff" meta (attributes: name='author' content='Jojo') body h1 "I like potatoes" Incidentally, diagramming with indenting works much better for very large trees, and is easier for a program to generate. The C<< $tree->dump >> method uses indentation just that way. However you diagram the tree, it's stored the same in memory -- it's a network of objects, each of which has attributes like so: element #1: _tag: 'html' _parent: none _content: [element #2, element #5] lang: 'en-US' element #2: _tag: 'head' _parent: element #1 _content: [element #3, element #4] element #3: _tag: 'title' _parent: element #2 _content: [text segment "Stuff"] element #4 _tag: 'meta' _parent: element #2 _content: none name: author content: Jojo element #5 _tag: 'body' _parent: element #1 _content: [element #6] element #6 _tag: 'h1' _parent: element #5 _content: [text segment "I like potatoes"] The "treeness" of the tree-structure that these elements comprise is not an aspect of any particular object, but is emergent from the relatedness attributes (_parent and _content) of these element-objects and from how you use them to get from element to element. While you could access the content of a tree by writing code that says "access the 'src' attribute of the root's I<first> child's I<seventh> child's I<third> child", you're more likely to have to scan the contents of a tree, looking for whatever nodes, or kinds of nodes, you want to do something with. The most straightforward way to look over a tree is to "traverse" it; an HTML::Element method (C<< $h->traverse >>) is provided for this purpose; and several other HTML::Element methods are based on it. (For everything you ever wanted to know about trees, and then some, see Niklaus Wirth's I<Algorithms + Data Structures = Programs> or Donald Knuth's I<The Art of Computer Programming, Volume 1>.) =head2 Weak References TL;DR summary: S<C<use HTML::TreeBuilder 5 -weak;>> and forget about the C<delete> method (except for pruning a node from a tree). Because HTML::Element stores a reference to the parent element, Perl's reference-count garbage collection doesn't work properly with HTML::Element trees. Starting with version 5.00, HTML::Element uses weak references (if available) to prevent that problem. Weak references were introduced in Perl 5.6.0, but you also need a version of L<Scalar::Util> that provides the C<weaken> function. Weak references are enabled by default. If you want to be certain they're in use, you can say S<C<use HTML::Element 5 -weak;>>. You must include the version number; previous versions of HTML::Element ignored the import list entirely. To disable weak references, you can say S<C<use HTML::Element -noweak;>>. This is a global setting. B<This feature is deprecated> and is provided only as a quick fix for broken code. If your code does not work properly with weak references, you should fix it immediately, as weak references may become mandatory in a future version. Generally, all you need to do is keep a reference to the root of the tree until you're done working with it. Because HTML::TreeBuilder is a subclass of HTML::Element, you can also import C<-weak> or C<-noweak> from HTML::TreeBuilder: e.g. S<C<use HTML::TreeBuilder: 5 -weak;>>. =head1 BASIC METHODS =head2 new $h = HTML::Element->new('tag', 'attrname' => 'value', ... ); This constructor method returns a new HTML::Element object. The tag name is a required argument; it will be forced to lowercase. Optionally, you can specify other initial attributes at object creation time. =head2 attr $value = $h->attr('attr'); $old_value = $h->attr('attr', $new_value); Returns (optionally sets) the value of the given attribute of C<$h>. The attribute name (but not the value, if provided) is forced to lowercase. If trying to read the value of an attribute not present for this element, the return value is undef. If setting a new value, the old value of that attribute is returned. If methods are provided for accessing an attribute (like C<< $h->tag >> for "_tag", C<< $h->content_list >>, etc. below), use those instead of calling attr C<< $h->attr >>, whether for reading or setting. Note that setting an attribute to C<undef> (as opposed to "", the empty string) actually deletes the attribute. =head2 tag $tagname = $h->tag(); $h->tag('tagname'); Returns (optionally sets) the tag name (also known as the generic identifier) for the element C<$h>. In setting, the tag name is always converted to lower case. There are four kinds of "pseudo-elements" that show up as HTML::Element objects: =over =item Comment pseudo-elements These are element objects with a C<$h-E<gt>tag> value of "~comment", and the content of the comment is stored in the "text" attribute (C<$h-E<gt>attr("text")>). For example, parsing this code with HTML::TreeBuilder... <!-- I like Pie. Pie is good --> produces an HTML::Element object with these attributes: "_tag", "~comment", "text", " I like Pie.\n Pie is good\n " =item Declaration pseudo-elements Declarations (rarely encountered) are represented as HTML::Element objects with a tag name of "~declaration", and content in the "text" attribute. For example, this: <!DOCTYPE foo> produces an element whose attributes include: "_tag", "~declaration", "text", "DOCTYPE foo" =item Processing instruction pseudo-elements PIs (rarely encountered) are represented as HTML::Element objects with a tag name of "~pi", and content in the "text" attribute. For example, this: <?stuff foo?> produces an element whose attributes include: "_tag", "~pi", "text", "stuff foo?" (assuming a recent version of HTML::Parser) =item ~literal pseudo-elements These objects are not currently produced by HTML::TreeBuilder, but can be used to represent a "super-literal" -- i.e., a literal you want to be immune from escaping. (Yes, I just made that term up.) That is, this is useful if you want to insert code into a tree that you plan to dump out with C<as_HTML>, where you want, for some reason, to suppress C<as_HTML>'s normal behavior of amp-quoting text segments. For example, this: my $literal = HTML::Element->new('~literal', 'text' => 'x < 4 & y > 7' ); my $span = HTML::Element->new('span'); $span->push_content($literal); print $span->as_HTML; prints this: <span>x < 4 & y > 7</span> Whereas this: my $span = HTML::Element->new('span'); $span->push_content('x < 4 & y > 7'); # normal text segment print $span->as_HTML; prints this: <span>x < 4 & y > 7</span> Unless you're inserting lots of pre-cooked code into existing trees, and dumping them out again, it's not likely that you'll find C<~literal> pseudo-elements useful. =back =head2 parent $parent = $h->parent(); $h->parent($new_parent); Returns (optionally sets) the parent (aka "container") for this element. The parent should either be undef, or should be another element. You B<should not> use this to directly set the parent of an element. Instead use any of the other methods under "Structure-Modifying Methods", below. Note that C<< not($h->parent) >> is a simple test for whether C<$h> is the root of its subtree. =head2 content_list @content = $h->content_list(); $num_children = $h->content_list(); Returns a list of the child nodes of this element -- i.e., what nodes (elements or text segments) are inside/under this element. (Note that this may be an empty list.) In a scalar context, this returns the count of the items, as you may expect. =head2 content $content_array_ref = $h->content(); # may return undef This somewhat deprecated method returns the content of this element; but unlike content_list, this returns either undef (which you should understand to mean no content), or a I<reference to the array> of content items, each of which is either a text segment (a string, i.e., a defined non-reference scalar value), or an HTML::Element object. Note that even if an arrayref is returned, it may be a reference to an empty array. While older code should feel free to continue to use C<< $h->content >>, new code should use C<< $h->content_list >> in almost all conceivable cases. It is my experience that in most cases this leads to simpler code anyway, since it means one can say: @children = $h->content_list; instead of the inelegant: @children = @{$h->content || []}; If you do use C<< $h->content >> (or C<< $h->content_array_ref >>), you should not use the reference returned by it (assuming it returned a reference, and not undef) to directly set or change the content of an element or text segment! Instead use L<content_refs_list> or any of the other methods under "Structure-Modifying Methods", below. =head2 content_array_ref $content_array_ref = $h->content_array_ref(); # never undef This is like C<content> (with all its caveats and deprecations) except that it is guaranteed to return an array reference. That is, if the given node has no C<_content> attribute, the C<content> method would return that undef, but C<content_array_ref> would set the given node's C<_content> value to C<[]> (a reference to a new, empty array), and return that. =head2 content_refs_list @content_refs = $h->content_refs_list; This returns a list of scalar references to each element of C<$h>'s content list. This is useful in case you want to in-place edit any large text segments without having to get a copy of the current value of that segment value, modify that copy, then use the C<splice_content> to replace the old with the new. Instead, here you can in-place edit: foreach my $item_r ($h->content_refs_list) { next if ref $$item_r; $$item_r =~ s/honour/honor/g; } You I<could> currently achieve the same affect with: foreach my $item (@{ $h->content_array_ref }) { # deprecated! next if ref $item; $item =~ s/honour/honor/g; } ...except that using the return value of C<< $h->content >> or C<< $h->content_array_ref >> to do that is deprecated, and just might stop working in the future. =head2 implicit $is_implicit = $h->implicit(); $h->implicit($make_implicit); Returns (optionally sets) the "_implicit" attribute. This attribute is a flag that's used for indicating that the element was not originally present in the source, but was added to the parse tree (by HTML::TreeBuilder, for example) in order to conform to the rules of HTML structure. =head2 pos $pos = $h->pos(); $h->pos($element); Returns (and optionally sets) the "_pos" (for "current I<pos>ition") pointer of C<$h>. This attribute is a pointer used during some parsing operations, whose value is whatever HTML::Element element at or under C<$h> is currently "open", where C<< $h->insert_element(NEW) >> will actually insert a new element. (This has nothing to do with the Perl function called C<pos>, for controlling where regular expression matching starts.) If you set C<< $h->pos($element) >>, be sure that C<$element> is either C<$h>, or an element under C<$h>. If you've been modifying the tree under C<$h> and are no longer sure C<< $h->pos >> is valid, you can enforce validity with: $h->pos(undef) unless $h->pos->is_inside($h); =head2 all_attr %attr = $h->all_attr(); Returns all this element's attributes and values, as key-value pairs. This will include any "internal" attributes (i.e., ones not present in the original element, and which will not be represented if/when you call C<< $h->as_HTML >>). Internal attributes are distinguished by the fact that the first character of their key (not value! key!) is an underscore ("_"). Example output of C<< $h->all_attr() >> : C<'_parent', >I<[object_value]>C< , '_tag', 'em', 'lang', 'en-US', '_content', >I<[array-ref value]>. =head2 all_attr_names @names = $h->all_attr_names(); $num_attrs = $h->all_attr_names(); Like C<all_attr>, but only returns the names of the attributes. In scalar context, returns the number of attributes. Example output of C<< $h->all_attr_names() >> : C<'_parent', '_tag', 'lang', '_content', >. =head2 all_external_attr %attr = $h->all_external_attr(); Like C<all_attr>, except that internal attributes are not present. =head2 all_external_attr_names @names = $h->all_external_attr_names(); $num_attrs = $h->all_external_attr_names(); Like C<all_attr_names>, except that internal attributes' names are not present (or counted). =head2 id $id = $h->id(); $h->id($string); Returns (optionally sets to C<$string>) the "id" attribute. C<< $h->id(undef) >> deletes the "id" attribute. C<< $h->id(...) >> is basically equivalent to C<< $h->attr('id', ...) >>, except that when setting the attribute, this method returns the new value, not the old value. =head2 idf $id = $h->idf(); $h->idf($string); Just like the C<id> method, except that if you call C<< $h->idf() >> and no "id" attribute is defined for this element, then it's set to a likely-to-be-unique value, and returned. (The "f" is for "force".) =head1 STRUCTURE-MODIFYING METHODS These methods are provided for modifying the content of trees by adding or changing nodes as parents or children of other nodes. =head2 push_content $h->push_content($element_or_text, ...); Adds the specified items to the I<end> of the content list of the element C<$h>. The items of content to be added should each be either a text segment (a string), an HTML::Element object, or an arrayref. Arrayrefs are fed thru C<< $h->new_from_lol(that_arrayref) >> to convert them into elements, before being added to the content list of C<$h>. This means you can say things concise things like: $body->push_content( ['br'], ['ul', map ['li', $_], qw(Peaches Apples Pears Mangos) ] ); See the L</new_from_lol> method's documentation, far below, for more explanation. Returns C<$h> (the element itself). The push_content method will try to consolidate adjacent text segments while adding to the content list. That's to say, if C<$h>'s C<content_list> is ('foo bar ', $some_node, 'baz!') and you call $h->push_content('quack?'); then the resulting content list will be this: ('foo bar ', $some_node, 'baz!quack?') and not this: ('foo bar ', $some_node, 'baz!', 'quack?') If that latter is what you want, you'll have to override the feature of consolidating text by using splice_content, as in: $h->splice_content(scalar($h->content_list),0,'quack?'); Similarly, if you wanted to add 'Skronk' to the beginning of the content list, calling this: $h->unshift_content('Skronk'); then the resulting content list will be this: ('Skronkfoo bar ', $some_node, 'baz!') and not this: ('Skronk', 'foo bar ', $some_node, 'baz!') What you'd to do get the latter is: $h->splice_content(0,0,'Skronk'); =head2 unshift_content $h->unshift_content($element_or_text, ...) Just like C<push_content>, but adds to the I<beginning> of the C<$h> element's content list. The items of content to be added should each be either a text segment (a string), an HTML::Element object, or an arrayref (which is fed thru C<new_from_lol>). The unshift_content method will try to consolidate adjacent text segments while adding to the content list. See above for a discussion of this. Returns C<$h> (the element itself). =head2 splice_content @removed = $h->splice_content($offset, $length, $element_or_text, ...); Detaches the elements from C<$h>'s list of content-nodes, starting at C<$offset> and continuing for C<$length> items, replacing them with the elements of the following list, if any. Returns the elements (if any) removed from the content-list. If C<$offset> is negative, then it starts that far from the end of the array, just like Perl's normal C<splice> function. If C<$length> and the following list is omitted, removes everything from C<$offset> onward. The items of content to be added (if any) should each be either a text segment (a string), an arrayref (which is fed thru L</new_from_lol>), or an HTML::Element object that's not already a child of C<$h>. =head2 detach $old_parent = $h->detach(); This unlinks C<$h> from its parent, by setting its 'parent' attribute to undef, and by removing it from the content list of its parent (if it had one). The return value is the parent that was detached from (or undef, if C<$h> had no parent to start with). Note that neither C<$h> nor its parent are explicitly destroyed. =head2 detach_content @old_content = $h->detach_content(); This unlinks all of C<$h>'s children from C<$h>, and returns them. Note that these are not explicitly destroyed; for that, you can just use C<< $h->delete_content >>. =head2 replace_with $h->replace_with( $element_or_text, ... ) This replaces C<$h> in its parent's content list with the nodes specified. The element C<$h> (which by then may have no parent) is returned. This causes a fatal error if C<$h> has no parent. The list of nodes to insert may contain C<$h>, but at most once. Aside from that possible exception, the nodes to insert should not already be children of C<$h>'s parent. Also, note that this method does not destroy C<$h> if weak references are turned off -- use C<< $h->replace_with(...)->delete >> if you need that. =head2 preinsert $h->preinsert($element_or_text...); Inserts the given nodes right BEFORE C<$h> in C<$h>'s parent's content list. This causes a fatal error if C<$h> has no parent. None of the given nodes should be C<$h> or other children of C<$h>. Returns C<$h>. =head2 postinsert $h->postinsert($element_or_text...) Inserts the given nodes right AFTER C<$h> in C<$h>'s parent's content list. This causes a fatal error if C<$h> has no parent. None of the given nodes should be C<$h> or other children of C<$h>. Returns C<$h>. =head2 replace_with_content $h->replace_with_content(); This replaces C<$h> in its parent's content list with its own content. The element C<$h> (which by then has no parent or content of its own) is returned. This causes a fatal error if C<$h> has no parent. Also, note that this does not destroy C<$h> if weak references are turned off -- use C<< $h->replace_with_content->delete >> if you need that. =head2 delete_content $h->delete_content(); $h->destroy_content(); # alias Clears the content of C<$h>, calling C<< $h->delete >> for each content element. Compare with C<< $h->detach_content >>. Returns C<$h>. C<destroy_content> is an alias for this method. =head2 delete $h->delete(); $h->destroy(); # alias Detaches this element from its parent (if it has one) and explicitly destroys the element and all its descendants. The return value is the empty list (or C<undef> in scalar context). Before version 5.00 of HTML::Element, you had to call C<delete> when you were finished with the tree, or your program would leak memory. This is no longer necessary if weak references are enabled, see L</"Weak References">. =head2 destroy An alias for L</delete>. =head2 destroy_content An alias for L</delete_content>. =head2 clone $copy = $h->clone(); Returns a copy of the element (whose children are clones (recursively) of the original's children, if any). The returned element is parentless. Any '_pos' attributes present in the source element/tree will be absent in the copy. For that and other reasons, the clone of an HTML::TreeBuilder object that's in mid-parse (i.e, the head of a tree that HTML::TreeBuilder is elaborating) cannot (currently) be used to continue the parse. You are free to clone HTML::TreeBuilder trees, just as long as: 1) they're done being parsed, or 2) you don't expect to resume parsing into the clone. (You can continue parsing into the original; it is never affected.) =head2 clone_list @copies = HTML::Element->clone_list(...nodes...); Returns a list consisting of a copy of each node given. Text segments are simply copied; elements are cloned by calling C<< $it->clone >> on each of them. Note that this must be called as a class method, not as an instance method. C<clone_list> will croak if called as an instance method. You can also call it like so: ref($h)->clone_list(...nodes...) =head2 normalize_content $h->normalize_content Normalizes the content of C<$h> -- i.e., concatenates any adjacent text nodes. (Any undefined text segments are turned into empty-strings.) Note that this does not recurse into C<$h>'s descendants. =head2 delete_ignorable_whitespace $h->delete_ignorable_whitespace() This traverses under C<$h> and deletes any text segments that are ignorable whitespace. You should not use this if C<$h> is under a C<< <pre> >> element. =head2 insert_element $h->insert_element($element, $implicit); Inserts (via push_content) a new element under the element at C<< $h->pos() >>. Then updates C<< $h->pos() >> to point to the inserted element, unless $element is a prototypically empty element like C<< <br> >>, C<< <hr> >>, C<< <img> >>, etc. The new C<< $h->pos() >> is returned. This method is useful only if your particular tree task involves setting C<< $h->pos() >>. =head1 DUMPING METHODS =head2 dump $h->dump() $h->dump(*FH) ; # or *FH{IO} or $fh_obj Prints the element and all its children to STDOUT (or to a specified filehandle), in a format useful only for debugging. The structure of the document is shown by indentation (no end tags). =head2 as_HTML $s = $h->as_HTML(); $s = $h->as_HTML($entities); $s = $h->as_HTML($entities, $indent_char); $s = $h->as_HTML($entities, $indent_char, \%optional_end_tags); Returns a string representing in HTML the element and its descendants. The optional argument C<$entities> specifies a string of the entities to encode. For compatibility with previous versions, specify C<'E<lt>E<gt>&'> here. If omitted or undef, I<all> unsafe characters are encoded as HTML entities. See L<HTML::Entities> for details. If passed an empty string, no entities are encoded. If $indent_char is specified and defined, the HTML to be output is intented, using the string you specify (which you probably should set to "\t", or some number of spaces, if you specify it). If C<\%optional_end_tags> is specified and defined, it should be a reference to a hash that holds a true value for every tag name whose end tag is optional. Defaults to C<\%HTML::Element::optionalEndTag>, which is an alias to C<%HTML::Tagset::optionalEndTag>, which, at time of writing, contains true values for C<p, li, dt, dd>. A useful value to pass is an empty hashref, C<{}>, which means that no end-tags are optional for this dump. Otherwise, possibly consider copying C<%HTML::Tagset::optionalEndTag> to a hash of your own, adding or deleting values as you like, and passing a reference to that hash. =head2 as_text $s = $h->as_text(); $s = $h->as_text(skip_dels => 1); Returns a string consisting of only the text parts of the element's descendants. Any whitespace inside the element is included unchanged, but whitespace not in the tree is never added. But remember that whitespace may be ignored or compacted by HTML::TreeBuilder during parsing (depending on the value of the C<ignore_ignorable_whitespace> and C<no_space_compacting> attributes). Also, since whitespace is never added during parsing, HTML::TreeBuilder->new_from_content("<p>a</p><p>b</p>") ->as_text; returns C<"ab">, not C<"a b"> or C<"a\nb">. Text under C<< <script> >> or C<< <style> >> elements is never included in what's returned. If C<skip_dels> is true, then text content under C<< <del> >> nodes is not included in what's returned. =head2 as_trimmed_text $s = $h->as_trimmed_text(...); $s = $h->as_trimmed_text(extra_chars => '\xA0'); # remove $s = $h->as_text_trimmed(...); # alias This is just like C<as_text(...)> except that leading and trailing whitespace is deleted, and any internal whitespace is collapsed. This will not remove non-breaking spaces, Unicode spaces, or any other non-ASCII whitespace unless you supply the extra characters as a string argument (e.g. C<< $h->as_trimmed_text(extra_chars => '\xA0') >>). C<extra_chars> may be any string that can appear inside a character class, including ranges like C<a-z>, POSIX character classes like C<[:alpha:]>, and character class escapes like C<\p{Zs}>. =head2 as_XML $s = $h->as_XML() Returns a string representing in XML the element and its descendants. The XML is not indented. =head2 as_Lisp_form $s = $h->as_Lisp_form(); Returns a string representing the element and its descendants as a Lisp form. Unsafe characters are encoded as octal escapes. The Lisp form is indented, and contains external ("href", etc.) as well as internal attributes ("_tag", "_content", "_implicit", etc.), except for "_parent", which is omitted. Current example output for a given element: ("_tag" "img" "border" "0" "src" "pie.png" "usemap" "#main.map") =head2 format $s = $h->format; # use HTML::FormatText $s = $h->format($formatter); Formats text output. Defaults to HTML::FormatText. Takes a second argument that is a reference to a formatter. =head2 starttag $start = $h->starttag(); $start = $h->starttag($entities); Returns a string representing the complete start tag for the element. I.e., leading "<", tag name, attributes, and trailing ">". All values are surrounded with double-quotes, and appropriate characters are encoded. If C<$entities> is omitted or undef, I<all> unsafe characters are encoded as HTML entities. See L<HTML::Entities> for details. If you specify some value for C<$entities>, remember to include the double-quote character in it. (Previous versions of this module would basically behave as if C<'&"E<gt>'> were specified for C<$entities>.) If C<$entities> is an empty string, no entity is escaped. =head2 starttag_XML $start = $h->starttag_XML(); Returns a string representing the complete start tag for the element. =head2 endtag $end = $h->endtag(); Returns a string representing the complete end tag for this element. I.e., "</", tag name, and ">". =head2 endtag_XML $end = $h->endtag_XML(); Returns a string representing the complete end tag for this element. I.e., "</", tag name, and ">". =head1 SECONDARY STRUCTURAL METHODS These methods all involve some structural aspect of the tree; either they report some aspect of the tree's structure, or they involve traversal down the tree, or walking up the tree. =head2 is_inside $inside = $h->is_inside('tag', $element, ...); Returns true if the C<$h> element is, or is contained anywhere inside an element that is any of the ones listed, or whose tag name is any of the tag names listed. You can use any mix of elements and tag names. =head2 is_empty $empty = $h->is_empty(); Returns true if C<$h> has no content, i.e., has no elements or text segments under it. In other words, this returns true if C<$h> is a leaf node, AKA a terminal node. Do not confuse this sense of "empty" with another sense that it can have in SGML/HTML/XML terminology, which means that the element in question is of the type (like HTML's C<< <hr> >>, C<< <br> >>, C<< <img> >>, etc.) that I<can't> have any content. That is, a particular C<< <p> >> element may happen to have no content, so $that_p_element->is_empty will be true -- even though the prototypical C<< <p> >> element isn't "empty" (not in the way that the prototypical C<< <hr> >> element is). If you think this might make for potentially confusing code, consider simply using the clearer exact equivalent: C<< not($h->content_list) >>. =head2 pindex $index = $h->pindex(); Return the index of the element in its parent's contents array, such that C<$h> would equal $h->parent->content->[$h->pindex] # or ($h->parent->content_list)[$h->pindex] assuming C<$h> isn't root. If the element C<$h> is root, then C<< $h->pindex >> returns C<undef>. =head2 left $element = $h->left(); @elements = $h->left(); In scalar context: returns the node that's the immediate left sibling of C<$h>. If C<$h> is the leftmost (or only) child of its parent (or has no parent), then this returns undef. In list context: returns all the nodes that're the left siblings of C<$h> (starting with the leftmost). If C<$h> is the leftmost (or only) child of its parent (or has no parent), then this returns an empty list. (See also C<< $h->preinsert(LIST) >>.) =head2 right $element = $h->right(); @elements = $h->right(); In scalar context: returns the node that's the immediate right sibling of C<$h>. If C<$h> is the rightmost (or only) child of its parent (or has no parent), then this returns C<undef>. In list context: returns all the nodes that're the right siblings of C<$h>, starting with the leftmost. If C<$h> is the rightmost (or only) child of its parent (or has no parent), then this returns an empty list. (See also C<< $h->postinsert(LIST) >>.) =head2 address $address = $h->address(); $element_or_text = $h->address($address); The first form (with no parameter) returns a string representing the location of C<$h> in the tree it is a member of. The address consists of numbers joined by a '.', starting with '0', and followed by the pindexes of the nodes in the tree that are ancestors of C<$h>, starting from the top. So if the way to get to a node starting at the root is to go to child 2 of the root, then child 10 of that, and then child 0 of that, and then you're there -- then that node's address is "0.2.10.0". As a bit of a special case, the address of the root is simply "0". I forsee this being used mainly for debugging, but you may find your own uses for it. $element_or_text = $h->address($address); This form returns the node (whether element or text-segment) at the given address in the tree that C<$h> is a part of. (That is, the address is resolved starting from C<< $h->root >>.) If there is no node at the given address, this returns C<undef>. You can specify "relative addressing" (i.e., that indexing is supposed to start from C<$h> and not from C<< $h->root >>) by having the address start with a period -- e.g., C<< $h->address(".3.2") >> will look at child 3 of C<$h>, and child 2 of that. =head2 depth $depth = $h->depth(); Returns a number expressing C<$h>'s depth within its tree, i.e., how many steps away it is from the root. If C<$h> has no parent (i.e., is root), its depth is 0. =head2 root $root = $h->root(); Returns the element that's the top of C<$h>'s tree. If C<$h> is root, this just returns C<$h>. (If you want to test whether C<$h> I<is> the root, instead of asking what its root is, just test C<< not($h->parent) >>.) =head2 lineage @lineage = $h->lineage(); Returns the list of C<$h>'s ancestors, starting with its parent, and then that parent's parent, and so on, up to the root. If C<$h> is root, this returns an empty list. If you simply want a count of the number of elements in C<$h>'s lineage, use C<< $h->depth >>. =head2 lineage_tag_names @names = $h->lineage_tag_names(); Returns the list of the tag names of C<$h>'s ancestors, starting with its parent, and that parent's parent, and so on, up to the root. If C<$h> is root, this returns an empty list. Example output: C<('em', 'td', 'tr', 'table', 'body', 'html')> Equivalent to: map { $_->tag } $h->lineage; =head2 descendants @descendants = $h->descendants(); In list context, returns the list of all C<$h>'s descendant elements, listed in pre-order (i.e., an element appears before its content-elements). Text segments DO NOT appear in the list. In scalar context, returns a count of all such elements. =head2 descendents This is just an alias to the C<descendants> method, for people who can't spell. =head2 find_by_tag_name @elements = $h->find_by_tag_name('tag', ...); $first_match = $h->find_by_tag_name('tag', ...); In list context, returns a list of elements at or under C<$h> that have any of the specified tag names. In scalar context, returns the first (in pre-order traversal of the tree) such element found, or undef if none. =head2 find This is just an alias to C<find_by_tag_name>. (There was once going to be a whole find_* family of methods, but then C<look_down> filled that niche, so there turned out not to be much reason for the verboseness of the name "find_by_tag_name".) =head2 find_by_attribute @elements = $h->find_by_attribute('attribute', 'value'); $first_match = $h->find_by_attribute('attribute', 'value'); In a list context, returns a list of elements at or under C<$h> that have the specified attribute, and have the given value for that attribute. In a scalar context, returns the first (in pre-order traversal of the tree) such element found, or undef if none. This method is B<deprecated> in favor of the more expressive C<look_down> method, which new code should use instead. =head2 look_down @elements = $h->look_down( ...criteria... ); $first_match = $h->look_down( ...criteria... ); This starts at C<$h> and looks thru its element descendants (in pre-order), looking for elements matching the criteria you specify. In list context, returns all elements that match all the given criteria; in scalar context, returns the first such element (or undef, if nothing matched). There are three kinds of criteria you can specify: =over =item (attr_name, attr_value) This means you're looking for an element with that value for that attribute. Example: C<"alt", "pix!">. Consider that you can search on internal attribute values too: C<"_tag", "p">. =item (attr_name, qr/.../) This means you're looking for an element whose value for that attribute matches the specified Regexp object. =item a coderef This means you're looking for elements where coderef->(each_element) returns true. Example: my @wide_pix_images = $h->look_down( _tag => "img", alt => "pix!", sub { $_[0]->attr('width') > 350 } ); =back Note that C<(attr_name, attr_value)> and C<(attr_name, qr/.../)> criteria are almost always faster than coderef criteria, so should presumably be put before them in your list of criteria. That is, in the example above, the sub ref is called only for elements that have already passed the criteria of having a "_tag" attribute with value "img", and an "alt" attribute with value "pix!". If the coderef were first, it would be called on every element, and I<then> what elements pass that criterion (i.e., elements for which the coderef returned true) would be checked for their "_tag" and "alt" attributes. Note that comparison of string attribute-values against the string value in C<(attr_name, attr_value)> is case-INsensitive! A criterion of C<('align', 'right')> I<will> match an element whose "align" value is "RIGHT", or "right" or "rIGhT", etc. Note also that C<look_down> considers "" (empty-string) and undef to be different things, in attribute values. So this: $h->look_down("alt", "") will find elements I<with> an "alt" attribute, but where the value for the "alt" attribute is "". But this: $h->look_down("alt", undef) is the same as: $h->look_down(sub { !defined($_[0]->attr('alt')) } ) That is, it finds elements that do not have an "alt" attribute at all (or that do have an "alt" attribute, but with a value of undef -- which is not normally possible). Note that when you give several criteria, this is taken to mean you're looking for elements that match I<all> your criterion, not just I<any> of them. In other words, there is an implicit "and", not an "or". So if you wanted to express that you wanted to find elements with a "name" attribute with the value "foo" I<or> with an "id" attribute with the value "baz", you'd have to do it like: @them = $h->look_down( sub { # the lcs are to fold case lc($_[0]->attr('name')) eq 'foo' or lc($_[0]->attr('id')) eq 'baz' } ); Coderef criteria are more expressive than C<(attr_name, attr_value)> and C<(attr_name, qr/.../)> criteria, and all C<(attr_name, attr_value)> and C<(attr_name, qr/.../)> criteria could be expressed in terms of coderefs. However, C<(attr_name, attr_value)> and C<(attr_name, qr/.../)> criteria are a convenient shorthand. (In fact, C<look_down> itself is basically "shorthand" too, since anything you can do with C<look_down> you could do by traversing the tree, either with the C<traverse> method or with a routine of your own. However, C<look_down> often makes for very concise and clear code.) =head2 look_up @elements = $h->look_up( ...criteria... ); $first_match = $h->look_up( ...criteria... ); This is identical to C<< $h->look_down >>, except that whereas C<< $h->look_down >> basically scans over the list: ($h, $h->descendants) C<< $h->look_up >> instead scans over the list ($h, $h->lineage) So, for example, this returns all ancestors of C<$h> (possibly including C<$h> itself) that are C<< <td> >> elements with an "align" attribute with a value of "right" (or "RIGHT", etc.): $h->look_up("_tag", "td", "align", "right"); =head2 traverse $h->traverse(...options...) Lengthy discussion of HTML::Element's unnecessary and confusing C<traverse> method has been moved to a separate file: L<HTML::Element::traverse> =head2 attr_get_i @values = $h->attr_get_i('attribute'); $first_value = $h->attr_get_i('attribute'); In list context, returns a list consisting of the values of the given attribute for C<$h> and for all its ancestors starting from C<$h> and working its way up. Nodes with no such attribute are skipped. ("attr_get_i" stands for "attribute get, with inheritance".) In scalar context, returns the first such value, or undef if none. Consider a document consisting of: <html lang='i-klingon'> <head><title>Pati Pata</title></head> <body> <h1 lang='la'>Stuff</h1> <p lang='es-MX' align='center'> Foo bar baz <cite>Quux</cite>. </p> <p>Hooboy.</p> </body> </html> If C<$h> is the C<< <cite> >> element, C<< $h->attr_get_i("lang") >> in list context will return the list C<('es-MX', 'i-klingon')>. In scalar context, it will return the value C<'es-MX'>. If you call with multiple attribute names... @values = $h->attr_get_i('a1', 'a2', 'a3'); $first_value = $h->attr_get_i('a1', 'a2', 'a3'); ...in list context, this will return a list consisting of the values of these attributes which exist in C<$h> and its ancestors. In scalar context, this returns the first value (i.e., the value of the first existing attribute from the first element that has any of the attributes listed). So, in the above example, $h->attr_get_i('lang', 'align'); will return: ('es-MX', 'center', 'i-klingon') # in list context or 'es-MX' # in scalar context. But note that this: $h->attr_get_i('align', 'lang'); will return: ('center', 'es-MX', 'i-klingon') # in list context or 'center' # in scalar context. =head2 tagname_map $hash_ref = $h->tagname_map(); Scans across C<$h> and all its descendants, and makes a hash (a reference to which is returned) where each entry consists of a key that's a tag name, and a value that's a reference to a list to all elements that have that tag name. I.e., this method returns: { # Across $h and all descendants... 'a' => [ ...list of all <a> elements... ], 'em' => [ ...list of all <em> elements... ], 'img' => [ ...list of all <img> elements... ], } (There are entries in the hash for only those tagnames that occur at/under C<$h> -- so if there's no C<< <img> >> elements, there'll be no "img" entry in the returned hashref.) Example usage: my $map_r = $h->tagname_map(); my @heading_tags = sort grep m/^h\d$/s, keys %$map_r; if(@heading_tags) { print "Heading levels used: @heading_tags\n"; } else { print "No headings.\n" } =head2 extract_links $links_array_ref = $h->extract_links(); $links_array_ref = $h->extract_links(@wantedTypes); Returns links found by traversing the element and all of its children and looking for attributes (like "href" in an C<< <a> >> element, or "src" in an C<< <img> >> element) whose values represent links. The return value is a I<reference> to an array. Each element of the array is reference to an array with I<four> items: the link-value, the element that has the attribute with that link-value, and the name of that attribute, and the tagname of that element. (Example: C<['http://www.suck.com/',> I<$elem_obj> C<, 'href', 'a']>.) You may or may not end up using the element itself -- for some purposes, you may use only the link value. You might specify that you want to extract links from just some kinds of elements (instead of the default, which is to extract links from I<all> the kinds of elements known to have attributes whose values represent links). For instance, if you want to extract links from only C<< <a> >> and C<< <img> >> elements, you could code it like this: for (@{ $e->extract_links('a', 'img') }) { my($link, $element, $attr, $tag) = @$_; print "Hey, there's a $tag that links to ", $link, ", in its $attr attribute, at ", $element->address(), ".\n"; } =head2 simplify_pres $h->simplify_pres(); In text bits under PRE elements that are at/under C<$h>, this routine nativizes all newlines, and expands all tabs. That is, if you read a file with lines delimited by C<\cm\cj>'s, the text under PRE areas will have C<\cm\cj>'s instead of C<\n>'s. Calling C<< $h->simplify_pres >> on such a tree will turn C<\cm\cj>'s into C<\n>'s. Tabs are expanded to however many spaces it takes to get to the next 8th column -- the usual way of expanding them. =head2 same_as $equal = $h->same_as($i) Returns true if C<$h> and C<$i> are both elements representing the same tree of elements, each with the same tag name, with the same explicit attributes (i.e., not counting attributes whose names start with "_"), and with the same content (textual, comments, etc.). Sameness of descendant elements is tested, recursively, with C<< $child1->same_as($child_2) >>, and sameness of text segments is tested with C<$segment1 eq $segment2>. =head2 new_from_lol $h = HTML::Element->new_from_lol($array_ref); @elements = HTML::Element->new_from_lol($array_ref, ...); Recursively constructs a tree of nodes, based on the (non-cyclic) data structure represented by each C<$array_ref>, where that is a reference to an array of arrays (of arrays (of arrays (etc.))). In each arrayref in that structure, different kinds of values are treated as follows: =over =item * Arrayrefs Arrayrefs are considered to designate a sub-tree representing children for the node constructed from the current arrayref. =item * Hashrefs Hashrefs are considered to contain attribute-value pairs to add to the element to be constructed from the current arrayref =item * Text segments Text segments at the start of any arrayref will be considered to specify the name of the element to be constructed from the current arrayref; all other text segments will be considered to specify text segments as children for the current arrayref. =item * Elements Existing element objects are either inserted into the treelet constructed, or clones of them are. That is, when the lol-tree is being traversed and elements constructed based what's in it, if an existing element object is found, if it has no parent, then it is added directly to the treelet constructed; but if it has a parent, then C<$that_node-E<gt>clone> is added to the treelet at the appropriate place. =back An example will hopefully make this more obvious: my $h = HTML::Element->new_from_lol( ['html', ['head', [ 'title', 'I like stuff!' ], ], ['body', {'lang', 'en-JP', _implicit => 1}, 'stuff', ['p', 'um, p < 4!', {'class' => 'par123'}], ['div', {foo => 'bar'}, '123'], ] ] ); $h->dump; Will print this: <html> @0 <head> @0.0 <title> @0.0.0 "I like stuff!" <body lang="en-JP"> @0.1 (IMPLICIT) "stuff" <p class="par123"> @0.1.1 "um, p < 4!" <div foo="bar"> @0.1.2 "123" And printing $h->as_HTML will give something like: <html><head><title>I like stuff!</title></head> <body lang="en-JP">stuff<p class="par123">um, p < 4! <div foo="bar">123</div></body></html> You can even do fancy things with C<map>: $body->push_content( # push_content implicitly calls new_from_lol on arrayrefs... ['br'], ['blockquote', ['h2', 'Pictures!'], map ['p', $_], $body2->look_down("_tag", "img"), # images, to be copied from that other tree. ], # and more stuff: ['ul', map ['li', ['a', {'href'=>"$_.png"}, $_ ] ], qw(Peaches Apples Pears Mangos) ], ); In scalar context, you must supply exactly one arrayref. In list context, you can pass a list of arrayrefs, and L<new_from_lol> will return a list of elements, one for each arrayref. @elements = HTML::Element->new_from_lol( ['hr'], ['p', 'And there, on the door, was a hook!'], ); # constructs two elements. =head2 objectify_text $h->objectify_text(); This turns any text nodes under C<$h> from mere text segments (strings) into real objects, pseudo-elements with a tag-name of "~text", and the actual text content in an attribute called "text". (For a discussion of pseudo-elements, see the L</"tag"> method, far above.) This method is provided because, for some purposes, it is convenient or necessary to be able, for a given text node, to ask what element is its parent; and clearly this is not possible if a node is just a text string. Note that these "~text" objects are not recognized as text nodes by methods like L</as_text>. Presumably you will want to call C<< $h->objectify_text >>, perform whatever task that you needed that for, and then call C<< $h->deobjectify_text >> before calling anything like C<< $h->as_text >>. =head2 deobjectify_text $h->deobjectify_text(); This undoes the effect of C<< $h->objectify_text >>. That is, it takes any "~text" pseudo-elements in the tree at/under C<$h>, and deletes each one, replacing each with the content of its "text" attribute. Note that if C<$h> itself is a "~text" pseudo-element, it will be destroyed -- a condition you may need to treat specially in your calling code (since it means you can't very well do anything with C<$h> after that). So that you can detect that condition, if C<$h> is itself a "~text" pseudo-element, then this method returns the value of the "text" attribute, which should be a defined value; in all other cases, it returns undef. (This method assumes that no "~text" pseudo-element has any children.) =head2 number_lists $h->number_lists(); For every UL, OL, DIR, and MENU element at/under C<$h>, this sets a "_bullet" attribute for every child LI element. For LI children of an OL, the "_bullet" attribute's value will be something like "4.", "d.", "D.", "IV.", or "iv.", depending on the OL element's "type" attribute. LI children of a UL, DIR, or MENU get their "_bullet" attribute set to "*". There should be no other LIs (i.e., except as children of OL, UL, DIR, or MENU elements), and if there are, they are unaffected. =head2 has_insane_linkage $h->has_insane_linkage This method is for testing whether this element or the elements under it have linkage attributes (_parent and _content) whose values are deeply aberrant: if there are undefs in a content list; if an element appears in the content lists of more than one element; if the _parent attribute of an element doesn't match its actual parent; or if an element appears as its own descendant (i.e., if there is a cyclicity in the tree). This returns empty list (or false, in scalar context) if the subtree's linkage methods are sane; otherwise it returns two items (or true, in scalar context): the element where the error occurred, and a string describing the error. This method is provided is mainly for debugging and troubleshooting -- it should be I<quite impossible> for any document constructed via HTML::TreeBuilder to parse into a non-sane tree (since it's not the content of the tree per se that's in question, but whether the tree in memory was properly constructed); and it I<should> be impossible for you to produce an insane tree just thru reasonable use of normal documented structure-modifying methods. But if you're constructing your own trees, and your program is going into infinite loops as during calls to traverse() or any of the secondary structural methods, as part of debugging, consider calling C<has_insane_linkage> on the tree. =head2 element_class $classname = $h->element_class(); This method returns the class which will be used for new elements. It defaults to HTML::Element, but can be overridden by subclassing or esoteric means best left to those will will read the source and then not complain when those esoteric means change. (Just subclass.) =head1 CLASS METHODS =head2 Use_Weak_Refs $enabled = HTML::Element->Use_Weak_Refs; HTML::Element->Use_Weak_Refs( $enabled ); This method allows you to check whether weak reference support is enabled, and to enable or disable it. For details, see L</"Weak References">. C<$enabled> is true if weak references are enabled. You should not switch this in the middle of your program, and you probably shouldn't use it at all. Existing trees are not affected by this method (until you start modifying nodes in them). Throws an exception if you attempt to enable weak references and your Perl or Scalar::Util does not support them. Disabling weak reference support is deprecated. =head1 SUBROUTINES =head2 Version This subroutine is deprecated. Please use the standard VERSION method (e.g. C<< HTML::Element->VERSION >>) instead. =head2 ABORT OK PRUNE PRUNE_SOFTLY PRUNE_UP Constants for signalling back to the traverser =for Pod::Coverage as_text_trimmed =head1 BUGS * If you want to free the memory associated with a tree built of HTML::Element nodes, and you have disabled weak references, then you will have to delete it explicitly using the L</delete> method. See L</"Weak References">. * There's almost nothing to stop you from making a "tree" with cyclicities (loops) in it, which could, for example, make the traverse method go into an infinite loop. So don't make cyclicities! (If all you're doing is parsing HTML files, and looking at the resulting trees, this will never be a problem for you.) * There's no way to represent comments or processing directives in a tree with HTML::Elements. Not yet, at least. * There's (currently) nothing to stop you from using an undefined value as a text segment. If you're running under C<perl -w>, however, this may make HTML::Element's code produce a slew of warnings. =head1 NOTES ON SUBCLASSING You are welcome to derive subclasses from HTML::Element, but you should be aware that the code in HTML::Element makes certain assumptions about elements (and I'm using "element" to mean ONLY an object of class HTML::Element, or of a subclass of HTML::Element): * The value of an element's _parent attribute must either be undef or otherwise false, or must be an element. * The value of an element's _content attribute must either be undef or otherwise false, or a reference to an (unblessed) array. The array may be empty; but if it has items, they must ALL be either mere strings (text segments), or elements. * The value of an element's _tag attribute should, at least, be a string of printable characters. Moreover, bear these rules in mind: * Do not break encapsulation on objects. That is, access their contents only thru $obj->attr or more specific methods. * You should think twice before completely overriding any of the methods that HTML::Element provides. (Overriding with a method that calls the superclass method is not so bad, though.) =head1 SEE ALSO L<HTML::Tree>; L<HTML::TreeBuilder>; L<HTML::AsSubs>; L<HTML::Tagset>; and, for the morbidly curious, L<HTML::Element::traverse>. =head1 ACKNOWLEDGEMENTS Thanks to Mark-Jason Dominus for a POD suggestion. =head1 AUTHOR Current maintainers: =over =item * Christopher J. Madsen S<C<< <perl AT cjmweb.net> >>> =item * Jeff Fearn S<C<< <jfearn AT cpan.org> >>> =back Original HTML-Tree author: =over =item * Gisle Aas =back Former maintainers: =over =item * Sean M. Burke =item * Andy Lester =item * Pete Krawczyk S<C<< <petek AT cpan.org> >>> =back You can follow or contribute to HTML-Tree's development at L<< https://github.com/kentfredric/HTML-Tree >>. =head1 COPYRIGHT AND LICENSE Copyright 1995-1998 Gisle Aas, 1999-2004 Sean M. Burke, 2005 Andy Lester, 2006 Pete Krawczyk, 2010 Jeff Fearn, 2012 Christopher J. Madsen. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The programs in this library are distributed in the hope that they will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for a particular purpose. =cut AsSubs.pm 0000644 00000011623 00000000000 0006246 0 ustar 00 package HTML::AsSubs; # ABSTRACT: functions that construct a HTML syntax tree use warnings; use strict; use vars qw(@ISA @EXPORT); our $VERSION = '5.07'; # VERSION from OurPkgVersion require HTML::Element; require Exporter; @ISA = qw(Exporter); # Problem: exports so damned much. Has no concept of "export only HTML4 # elements". TODO:?? make something that make functions that just # wrap XML::Generator calls? use vars qw(@TAGS); @TAGS = qw(html head title base link meta isindex nextid script style body h1 h2 h3 h4 h5 h6 p pre div blockquote a img br hr ol ul dir menu li dl dt dd dfn cite code em kbd samp strong var address span b i u tt center font big small strike sub sup table tr td th caption form input select option textarea object applet param map area frame frameset noframe ); for (@TAGS) { my $code; $code = "sub $_ { _elem('$_', \@_); }\n"; push( @EXPORT, $_ ); ## no critic eval $code; ## use critic if ($@) { die $@; } } sub _elem { my $tag = shift; my $attributes; if ( @_ and defined $_[0] and ref( $_[0] ) eq "HASH" ) { $attributes = shift; } my $elem = HTML::Element->new( $tag, %$attributes ); $elem->push_content(@_); $elem; } 1; __END__ =pod =head1 NAME HTML::AsSubs - functions that construct a HTML syntax tree =head1 VERSION This document describes version 5.07 of HTML::AsSubs, released August 31, 2017 as part of L<HTML-Tree|HTML::Tree>. =head1 SYNOPSIS use HTML::AsSubs; $h = body( h1("This is the heading"), p("This is the first paragraph which contains a ", a({href=>'link.html'}, "link"), " and an ", img({src=>'img.gif', alt=>'image'}), "." ), ); print $h->as_HTML; =head1 DESCRIPTION This module exports functions that can be used to construct various HTML elements. The functions are named after the tags of the corresponding HTML element and are all written in lower case. If the first argument is a hash reference then it will be used to initialize the attributes of this element. The remaining arguments are regarded as content. For a similar idea (i.e., it's another case where the syntax tree of the Perl source mirrors the syntax tree of the HTML produced), see HTML::Element's C<new_from_lol> method. For what I now think is a cleaner implementation of this same idea, see the excellent module C<XML::Generator>, which is what I suggest for actual real-life use. (I suggest this over C<HTML::AsSubs> and over C<CGI.pm>'s HTML-making functions.) =head1 ACKNOWLEDGEMENT This module was inspired by the following message: Date: Tue, 4 Oct 1994 16:11:30 +0100 Subject: Wow! I have a large lightbulb above my head! Take a moment to consider these lines: %OVERLOAD=( '""' => sub { join("", @{$_[0]}) } ); sub html { my($type)=shift; bless ["<$type>", @_, "</$type>"]; } :-) I *love* Perl 5! Thankyou Larry and Ilya. Regards, Tim Bunce. p.s. If you didn't get it, think about recursive data types: html(html()) p.p.s. I'll turn this into a much more practical example in a day or two. p.p.p.s. It's a pity that overloads are not inherited. Is this a bug? =head1 BUGS The exported link() function overrides the builtin link() function. The exported tr() function must be called using &tr(...) syntax because it clashes with the builtin tr/../../ operator. =head1 SEE ALSO L<HTML::Element>, L<XML::Generator> =head2 html head title base link meta isindex nextid script style body h1 h2 h3 h4 h5 h6 p pre div blockquote a img br hr ol ul dir menu li dl dt dd dfn cite code em kbd samp strong var address span b i u tt center font big small strike sub sup table tr td th caption form input select option textarea object applet param map area frame frameset noframe A bunch of methods for creating tags. =head1 Private Functions =head2 _elem() The _elem() function is wrapped by all the html 'tag' functions. It takes a tag-name, optional hashref of attributes and a list of content as parameters. =head1 AUTHOR Current maintainers: =over =item * Christopher J. Madsen S<C<< <perl AT cjmweb.net> >>> =item * Jeff Fearn S<C<< <jfearn AT cpan.org> >>> =back Original HTML-Tree author: =over =item * Gisle Aas =back Former maintainers: =over =item * Sean M. Burke =item * Andy Lester =item * Pete Krawczyk S<C<< <petek AT cpan.org> >>> =back You can follow or contribute to HTML-Tree's development at L<< https://github.com/kentfredric/HTML-Tree >>. =head1 COPYRIGHT AND LICENSE Copyright 1995-1998 Gisle Aas, 1999-2004 Sean M. Burke, 2005 Andy Lester, 2006 Pete Krawczyk, 2010 Jeff Fearn, 2012 Christopher J. Madsen. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The programs in this library are distributed in the hope that they will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for a particular purpose. =cut Formatter.pm 0000644 00000063717 00000000000 0007024 0 ustar 00 package HTML::Formatter; # ABSTRACT: Base class for HTML formatters use 5.006_001; use strict; use warnings; use Carp; use HTML::Element 3.15 (); # We now use Smart::Comments in place of the old DEBUG framework. # this should be commented out in release versions.... ##use Smart::Comments; our $VERSION = '2.12'; # VERSION our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY # # A typical formatter will not use all of the features of this # class. But it will use some, as best fits the mapping # of HTML to the particular output format. # # ------------------------------------------------------------------------ sub new { my ( $class, %arg ) = @_; my $self = bless { $class->default_values }, $class; $self->configure( \%arg ) if keys %arg; return $self; } # ------------------------------------------------------------------------ sub default_values { (); } # ------------------------------------------------------------------------ sub configure { my ( $self, $arg ) = @_; for ( keys %$arg ) { warn "Unknown configure argument '$_'" if $^W; } return $self; } # ------------------------------------------------------------------------ sub massage_tree { my ( $self, $html ) = @_; return if $html->tag eq 'p'; # sanity ### Before massaging: $html->dump() $html->simplify_pres(); # Does anything else need doing? ### After massaging: $html->dump() return; } # ------------------------------------------------------------------------ sub format_from_file { return shift->format_file(@_); } sub format_file { my ( $self, $filename, @params ) = @_; $self = $self->new(@params) unless ref $self; croak "What filename to format from?" unless ( defined($filename) and length($filename) ); my $tree = $self->_default_tree(); $tree->parse_file($filename); my $out = $self->format($tree); $tree->delete; return $out; } # ------------------------------------------------------------------------ # ------------------------------------------------------------------------ sub format_from_string { shift->format_string(@_) } sub format_string { my ( $self, $content, @params ) = @_; $self = $self->new(@params) unless ref $self; croak "What string to format?" unless defined $content; my $tree = $self->_default_tree(); $tree->parse($content); $tree->eof(); undef $content; my $out = $self->format($tree); $tree->delete; return $out; } # ------------------------------------------------------------------------ sub _default_tree { require HTML::TreeBuilder; my $t = HTML::TreeBuilder->new; # If nothing else works, try using these parser options:s #$t->implicit_body_p_tag(1); #$t->p_strict(1); return $t; } # ------------------------------------------------------------------------ sub format { my ( $self, $html ) = @_; croak "Usage: \$formatter->format(\$tree)" unless ( defined($html) and ref($html) and $html->can('tag') ); #### Tree to format: $html->dump $self->set_version_tag($html); $self->massage_tree($html); $self->begin($html); $html->number_lists(); # Per-iteration scratch: my ( $node, $start, $depth, $tag, $func ); $html->traverse( sub { ( $node, $start, $depth ) = @_; if ( ref $node ) { $tag = $node->tag; $func = $tag . '_' . ( $start ? "start" : "end" ); # Use ->can so that we can recover if # a handler is not defined for the tag. if ( $self->can($func) ) { ### Calling : (' ' x $depth) . $func return $self->$func($node); } else { ### Skipping: (' ' x $depth) . $func return 1; } } else { $self->textflow($node); } 1; } ); $self->end($html); return join( '', @{ $self->{output} } ); } # ------------------------------------------------------------------------ sub begin { my $self = shift; # Flags $self->{anchor} = 0; $self->{underline} = 0; $self->{bold} = 0; $self->{italic} = 0; $self->{center} = 0; $self->{superscript} = 0; $self->{subscript} = 0; $self->{strikethrough} = 0; $self->{center_stack} = []; # push and pop 'center' states to it $self->{nobr} = 0; $self->{'font_size'} = [3]; # last element is current size $self->{basefont_size} = [3]; $self->{vspace} = undef; # vertical space (dimension) $self->{output} = []; } # ------------------------------------------------------------------------ sub end { } # ------------------------------------------------------------------------ sub set_version_tag { my ( $self, $html ) = @_; if ($html) { $self->{'version_tag'} = sprintf( "%s (v%s, using %s v%s%s)", ref($self), $self->VERSION || '?', ref($html), $html->VERSION || '?', $HTML::Parser::VERSION ? ", and HTML::Parser v$HTML::Parser::VERSION" : '' ); } elsif ($HTML::Parser::VERSION) { $self->{'version_tag'} = sprintf( "%s (v%s, using %s)", ref($self), $self->VERSION || "?", "HTML::Parser v$HTML::Parser::VERSION", ); } else { $self->{'version_tag'} = sprintf( "%s (v%s)", ref($self), $self->VERSION || '?', ); } } # ------------------------------------------------------------------------ sub version_tag { shift->{'version_tag'} } # ------------------------------------------------------------------------ sub html_start { 1; } sub html_end { } sub body_start { 1; } sub body_end { } sub head_start { 0; } sub script_start { 0; } sub style_start { 0; } sub frameset_start { 0; } # ------------------------------------------------------------------------ sub header_start { my ( $self, undef, $node ) = @_; my $align = $node->attr('align'); if ( defined($align) && lc($align) eq 'center' ) { $self->{center}++; } 1; } # ------------------------------------------------------------------------ sub header_end { my ( $self, undef, $node ) = @_; my $align = $node->attr('align'); if ( defined($align) && lc($align) eq 'center' ) { $self->{center}--; } } # ------------------------------------------------------------------------ sub h1_start { shift->header_start( 1, @_ ) } sub h2_start { shift->header_start( 2, @_ ) } sub h3_start { shift->header_start( 3, @_ ) } sub h4_start { shift->header_start( 4, @_ ) } sub h5_start { shift->header_start( 5, @_ ) } sub h6_start { shift->header_start( 6, @_ ) } # ------------------------------------------------------------------------ sub h1_end { shift->header_end( 1, @_ ) } sub h2_end { shift->header_end( 2, @_ ) } sub h3_end { shift->header_end( 3, @_ ) } sub h4_end { shift->header_end( 4, @_ ) } sub h5_end { shift->header_end( 5, @_ ) } sub h6_end { shift->header_end( 6, @_ ) } sub br_start { my $self = shift; $self->vspace( 0, 1 ); } sub hr_start { my $self = shift; $self->vspace(1); 1; } # ------------------------------------------------------------------------ sub img_start { my ( $self, $node ) = @_; my $alt = $node->attr('alt'); $self->out( defined($alt) ? $alt : "[IMAGE]" ); } # ------------------------------------------------------------------------ sub a_start { shift->{anchor}++; 1; } sub a_end { shift->{anchor}--; } sub u_start { shift->{underline}++; 1; } sub u_end { shift->{underline}--; } sub b_start { shift->{bold}++; 1; } sub b_end { shift->{bold}--; } sub tt_start { shift->{teletype}++; 1; } sub tt_end { shift->{teletype}--; } sub i_start { shift->{italic}++; 1; } sub i_end { shift->{italic}--; } sub center_start { shift->{center}++; 1; } sub center_end { shift->{center}--; } # ------------------------------------------------------------------------ sub div_start { # interesting only for its 'align' attribute my ( $self, $node ) = @_; my $align = $node->attr('align'); if ( defined($align) && lc($align) eq 'center' ) { return $self->center_start; } 1; } # ------------------------------------------------------------------------ sub div_end { my ( $self, $node ) = @_; my $align = $node->attr('align'); if ( defined($align) && lc($align) eq 'center' ) { return $self->center_end; } } # ------------------------------------------------------------------------ sub nobr_start { shift->{nobr}++; 1; } sub nobr_end { shift->{nobr}--; } sub wbr_start { 1; } # ------------------------------------------------------------------------ sub font_start { my ( $self, $elem ) = @_; my $size = $elem->attr('size'); return 1 unless ( defined($size) ); if ( $size =~ /^\s*[+\-]/ ) { my $base = $self->{basefont_size}[-1]; # yes, base it on the most recent one $size = $base + $size; } push @{ $self->{'font_size'} }, $size; $self->new_font_size($size); 1; } # ------------------------------------------------------------------------ sub font_end { my ( $self, $elem ) = @_; my $size = $elem->attr('size'); return unless defined $size; pop @{ $self->{'font_size'} }; $self->restore_font_size( $self->{'font_size'}[-1] ); } # ------------------------------------------------------------------------ sub big_start { my $self = $_[0]; push @{ $self->{'font_size'} }, $self->{basefont_size}[-1] + 1; # same as font size="+1" $self->new_font_size( $self->{'font_size'}[-1] ); 1; } # ------------------------------------------------------------------------ sub small_start { my $self = $_[0]; push @{ $self->{'font_size'} }, $self->{basefont_size}[-1] - 1, # same as font size="-1" ; $self->new_font_size( $self->{'font_size'}[-1] ); 1; } # ------------------------------------------------------------------------ sub big_end { my $self = $_[0]; pop @{ $self->{'font_size'} }; $self->restore_font_size( $self->{'font_size'}[-1] ); 1; } # ------------------------------------------------------------------------ sub small_end { my $self = $_[0]; pop @{ $self->{'font_size'} }; $self->restore_font_size( $self->{'font_size'}[-1] ); 1; } # ------------------------------------------------------------------------ sub basefont_start { my ( $self, $elem ) = @_; my $size = $elem->attr('size'); return unless defined $size; push( @{ $self->{basefont_size} }, $size ); 1; } # ------------------------------------------------------------------------ sub basefont_end { my ( $self, $elem ) = @_; my $size = $elem->attr('size'); return unless defined $size; pop( @{ $self->{basefont_size} } ); } # ------------------------------------------------------------------------ # # Override in subclasses, if you like. # sub new_font_size { } #my( $self, $font_size_number ) = @_; sub restore_font_size { } #my( $self, $font_size_number ) = @_; # ------------------------------------------------------------------------ sub q_start { shift->out(q<">); 1; } sub q_end { shift->out(q<">); 1; } sub sup_start { shift->{superscript}++; 1; } sub sup_end { shift->{superscript}--; 1; } sub sub_start { shift->{subscript}++; 1; } sub sub_end { shift->{subscript}--; 1; } sub strike_start { shift->{strikethrough}++; 1; } sub strike_end { shift->{strikethrough}--; 1; } sub s_start { shift->strike_start(@_); } sub s_end { shift->strike_end(@_); } sub dfn_start { 1; } sub dfn_end { 1; } sub abbr_start { 1; } sub abbr_end { 1; } sub acronym_start { 1; } sub acronym_end { 1; } sub span_start { 1; } sub span_end { 1; } sub ins_start { 1; } sub ins_end { 1; } sub del_start { 0; } # Don't render the del'd bits sub del_end { 0; } # ------------------------------------------------------------------------ my @Size_magic_numbers = ( 0.60, 0.75, 0.89, 1, 1.20, 1.50, 2.00, 3.00 # #0 #1 #2 #3 #4 #5 #6 #7 #________________ - | + _________________________ # -3 -2 -1 0 +1 +2 +3 +4 ); # ------------------------------------------------------------------------ sub scale_font_for { my ( $self, $reference_size ) = @_; # Mozilla's source, at # http://lxr.mozilla.org/seamonkey/source/content/html/style/src/nsStyleUtil.cpp#299 # says: # static PRInt32 sFontSizeFactors[8] = { 60,75,89,100,120,150,200,300 }; # # For comparison, Gisle's earlier HTML::FormatPS has: # | # size 0 1 2 3 4 5 6 7 # | @FontSizes = ( 5, 6, 8, 10, 12, 14, 18, 24, 32); # ...and gets different sizing via just a scaling factor. my $size_number = int( defined( $_[2] ) ? $_[2] : $self->{'font_size'}[-1] ); # force the size_number into range: $size_number = ( $size_number < 0 ) ? 0 : ( $size_number > $#Size_magic_numbers ) ? $#Size_magic_numbers : int($size_number); my $result = int( .5 + $reference_size * $Size_magic_numbers[$size_number] ); ### Scale Font: sprintf("reference %s, size %s => %s", $reference_size, $size_number, $result); return $result; } # ------------------------------------------------------------------------ # Aliases for logical markup: sub strong_start { shift->b_start(@_) } sub strong_end { shift->b_end(@_) } sub cite_start { shift->i_start(@_) } sub cite_end { shift->i_end(@_) } sub em_start { shift->i_start(@_) } sub em_end { shift->i_end(@_) } sub code_start { shift->tt_start(@_) } sub code_end { shift->tt_end(@_) } sub kbd_start { shift->tt_start(@_) } sub kbd_end { shift->tt_end(@_) } sub samp_start { shift->tt_start(@_) } sub samp_end { shift->tt_end(@_) } sub var_start { shift->tt_start(@_) } sub var_end { shift->tt_end(@_) } # ------------------------------------------------------------------------ sub p_start { my $self = shift; #$self->adjust_lm(0); # assert new paragraph $self->vspace(1); # assert one line's worth of vertical space at para-start $self->out(''); 1; } # ------------------------------------------------------------------------ sub p_end { shift->vspace(1); # assert one line's worth of vertical space at para-end } # ------------------------------------------------------------------------ sub pre_start { my $self = shift; $self->{pre}++; $self->vspace(1); # assert one line's worth of vertical space at pre-start 1; } # ------------------------------------------------------------------------ sub pre_end { my $self = shift; $self->{pre}--; # assert one line's worth of vertical space at pre-end $self->vspace(1); } # ------------------------------------------------------------------------ sub listing_start { shift->pre_start(@_) } sub listing_end { shift->pre_end(@_) } sub xmp_start { shift->pre_start(@_) } sub xmp_end { shift->pre_end(@_) } # ------------------------------------------------------------------------ sub blockquote_start { my $self = shift; $self->vspace(1); # assert one line's worth of vertical space at blockquote-start $self->adjust_lm(+2); $self->adjust_rm(-2); 1; } # ------------------------------------------------------------------------ sub blockquote_end { my $self = shift; $self->vspace(1); # assert one line's worth of vertical space at blockquote-end $self->adjust_lm(-2); $self->adjust_rm(+2); } # ------------------------------------------------------------------------ sub address_start { my $self = shift; $self->vspace(1); # assert one line's worth of vertical space at address-para-start $self->i_start(@_); 1; } # ------------------------------------------------------------------------ sub address_end { my $self = shift; $self->i_end(@_); # assert one line's worth of vertical space at address-para-end $self->vspace(1); } # ------------------------------------------------------------------------ # Handling of list elements sub ul_start { my $self = shift; $self->vspace(1); # assert one line's worth of vertical space at ul-start $self->adjust_lm(+2); 1; } # ------------------------------------------------------------------------ sub ul_end { my $self = shift; $self->adjust_lm(-2); # assert one line's worth of vertical space at ul-end $self->vspace(1); } # ------------------------------------------------------------------------ sub li_start { my $self = shift; $self->bullet( shift->attr('_bullet') || '' ); $self->adjust_lm(+2); 1; } # ------------------------------------------------------------------------ sub bullet { shift->out(@_); } # ------------------------------------------------------------------------ sub li_end { my $self = shift; $self->vspace(1); $self->adjust_lm(-2); } # ------------------------------------------------------------------------ sub menu_start { shift->ul_start(@_) } sub menu_end { shift->ul_end(@_) } sub dir_start { shift->ul_start(@_) } sub dir_end { shift->ul_end(@_) } # ------------------------------------------------------------------------ sub ol_start { my $self = shift; $self->vspace(1); $self->adjust_lm(+2); 1; } # ------------------------------------------------------------------------ sub ol_end { my $self = shift; $self->adjust_lm(-2); $self->vspace(1); } # ------------------------------------------------------------------------ sub dl_start { my $self = shift; # $self->adjust_lm(+2); $self->vspace(1); # assert one line's worth of vertical space at dl-start 1; } # ------------------------------------------------------------------------ sub dl_end { my $self = shift; # $self->adjust_lm(-2); $self->vspace(1); # assert one line's worth of vertical space at dl-end } # ------------------------------------------------------------------------ sub dt_start { my $self = shift; $self->vspace(1); # assert one line's worth of vertical space at dt-start 1; } # ------------------------------------------------------------------------ sub dt_end { } # ------------------------------------------------------------------------ sub dd_start { my $self = shift; $self->adjust_lm(+6); $self->vspace(0); # hm, what's that do? nothing? 1; } # ------------------------------------------------------------------------ sub dd_end { my $self = shift; $self->vspace(1); # assert one line's worth of vertical space at dd-end $self->adjust_lm(-6); } # ------------------------------------------------------------------------ # And now some things that are basically sane fall-throughs for classes # that don't really handle tables or forms specially... # Things not formatted at all sub input_start { 0; } sub textarea_start { 0; } sub select_start { 0; } sub option_start { 0; } # ------------------------------------------------------------------------ sub td_start { my $self = shift; push @{ $self->{'center_stack'} }, $self->{'center'}; $self->{center} = 0; $self->p_start(@_); } # ------------------------------------------------------------------------ sub td_end { my $self = shift; $self->{'center'} = pop @{ $self->{'center_stack'} }; $self->p_end(@_); } # ------------------------------------------------------------------------ sub th_start { my $self = shift; push @{ $self->{'center_stack'} }, $self->{'center'}; $self->{center} = 0; $self->p_start(@_); $self->b_start(@_); } # ------------------------------------------------------------------------ sub th_end { my $self = shift; $self->b_end(@_); $self->{'center'} = pop @{ $self->{'center_stack'} }; $self->p_end(@_); } # But if you wanted to just SKIP tables and forms, you'd do this: # sub table_start { shift->out('[TABLE NOT SHOWN]'); 0; } # sub form_start { shift->out('[FORM NOT SHOWN]'); 0; } # ------------------------------------------------------------------------ sub textflow { my $self = shift; if ( $self->{pre} ) { # Strip one leading and one trailing newline so that a <pre> # tag can be placed on a line of its own without causing extra # vertical space as part of the preformatted text. $_[0] =~ s/\n$//; $_[0] =~ s/^\n//; $self->pre_out( $_[0] ); } elsif ( $self->{blockquote} ) { $_[0] =~ s/\A\s//; $self->blockquote_out( $_[0] ); } else { for ( split( /(\s+)/, $_[0] ) ) { next unless length $_; $self->out($_); } } } # ------------------------------------------------------------------------ sub vspace { my ( $self, $min, $add ) = @_; # This method sets the vspace attribute. When vspace is # defined, then a new line should be started. If vspace # is a nonzero value, then that should be taken as the # number of lines to be skipped before following text # is written out. # # You may think it odd to conflate the two concepts of # ending this paragraph, and asserting how much space should # follow; but it happens to work out pretty well. my $old = $self->{vspace}; if ( defined $old ) { my $new = $old; $new += $add || 0; $new = $min if $new < $min; $self->{vspace} = $new; } else { $self->{vspace} = $min; } ### vspace: $self->{vspace} $old; } # ------------------------------------------------------------------------ sub collect { push( @{ shift->{output} }, @_ ); } # ------------------------------------------------------------------------ sub out { confess "Must be overridden by subclass"; } # Output a word sub pre_out { confess "Must be overridden by subclass"; } sub adjust_lm { confess "Must be overridden by subclass"; } sub adjust_rm { confess "Must be overridden by subclass"; } # ------------------------------------------------------------------------ 1; __END__ =pod =for test_synopsis 1; __END__ =for stopwords formatters CPAN homepage =head1 NAME HTML::Formatter - Base class for HTML formatters =head1 VERSION version 2.12 =head1 SYNOPSIS use HTML::FormatSomething; my $infile = "whatever.html"; my $outfile = "whatever.file"; open OUT, ">$outfile" or die "Can't write-open $outfile: $!\n"; print OUT HTML::FormatSomething->format_file( $infile, 'option1' => 'value1', 'option2' => 'value2', ... ); close(OUT); =head1 DESCRIPTION HTML::Formatter is a base class for classes that take HTML and format it to some output format. When you take an object of such a base class and call C<$formatter->format( $tree )> with an L<HTML::TreeBuilder> (or L<HTML::Element>) object, they return the appropriately formatted string for the input HTML. HTML formatters are able to format a HTML syntax tree into various printable formats. Different formatters produce output for different output media. Common for all formatters are that they will return the formatted output when the format() method is called. The format() method takes a HTML::Element object (usually the HTML::TreeBuilder root object) as parameter. =head1 METHODS =head2 new my $formatter = FormatterClass->new( option1 => value1, option2 => value2, ... ); This creates a new formatter object with the given options. =head2 format_file =head2 format_from_file $string = FormatterClass->format_file( $html_source, option1 => value1, option2 => value2, ... ); Return a string consisting of the result of using the given class to format the given HTML file according to the given (optional) options. Internally it calls C<< SomeClass->new( ... )->format( ... ) >> on a new HTML::TreeBuilder object based on the given HTML file. =head2 format_string =head2 format_from_string $string = FormatterClass->format_string( $html_source, option1 => value1, option2 => value2, ... ); Return a string consisting of the result of using the given class to format the given HTML source according to the given (optional) options. Internally it calls C<< SomeClass->new( ... )->format( ... ) >> on a new HTML::TreeBuilder object based on the given source. =head2 format my $render_string = $formatter->format( $html_tree_object ); This renders the given HTML object according to the options set for $formatter. After you've used a particular formatter object to format a particular HTML tree object, you probably should not use either again. =head1 SEE ALSO The three specific formatters:- =over =item L<HTML::FormatText> Format HTML into plain text =item L<HTML::FormatPS> Format HTML into postscript =item L<HTML::FormatRTF> Format HTML into Rich Text Format =back Also the HTML manipulation libraries used - L<HTML::TreeBuilder>, L<HTML::Element> and L<HTML::Tree> =head1 INSTALLATION See perlmodinstall for information and options on installing Perl modules. =head1 BUGS AND LIMITATIONS You can make new bug reports, and view existing ones, through the web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Format>. =head1 AVAILABILITY The project homepage is L<https://metacpan.org/release/HTML-Format>. The latest version of this module is available from the Comprehensive Perl Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN site near you, or see L<https://metacpan.org/module/HTML::Format/>. =head1 AUTHORS =over 4 =item * Nigel Metheringham <nigelm@cpan.org> =item * Sean M Burke <sburke@cpan.org> =item * Gisle Aas <gisle@ActiveState.com> =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2015 by Nigel Metheringham, 2002-2005 Sean M Burke, 1999-2002 Gisle Aas. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut Template.pm 0000644 00000406134 00000000000 0006626 0 ustar 00 package HTML::Template; $HTML::Template::VERSION = '2.97'; =head1 NAME HTML::Template - Perl module to use HTML-like templating language =head1 SYNOPSIS First you make a template - this is just a normal HTML file with a few extra tags, the simplest being C<< <TMPL_VAR> >> For example, test.tmpl: <html> <head><title>Test Template</title></head> <body> My Home Directory is <TMPL_VAR NAME=HOME> <p> My Path is set to <TMPL_VAR NAME=PATH> </body> </html> Now you can use it in a small CGI program: #!/usr/bin/perl -w use HTML::Template; # open the html template my $template = HTML::Template->new(filename => 'test.tmpl'); # fill in some parameters $template->param(HOME => $ENV{HOME}); $template->param(PATH => $ENV{PATH}); # send the obligatory Content-Type and print the template output print "Content-Type: text/html\n\n", $template->output; If all is well in the universe this should show something like this in your browser when visiting the CGI: My Home Directory is /home/some/directory My Path is set to /bin;/usr/bin =head1 DESCRIPTION This module attempts to make using HTML templates simple and natural. It extends standard HTML with a few new HTML-esque tags - C<< <TMPL_VAR> >> C<< <TMPL_LOOP> >>, C<< <TMPL_INCLUDE> >>, C<< <TMPL_IF> >>, C<< <TMPL_ELSE> >> and C<< <TMPL_UNLESS> >>. The file written with HTML and these new tags is called a template. It is usually saved separate from your script - possibly even created by someone else! Using this module you fill in the values for the variables, loops and branches declared in the template. This allows you to separate design - the HTML - from the data, which you generate in the Perl script. This module is licensed under the same terms as Perl. See the LICENSE section below for more details. =head1 TUTORIAL If you're new to HTML::Template, I suggest you start with the introductory article available on Perl Monks: http://www.perlmonks.org/?node_id=65642 =head1 FAQ Please see L<HTML::Template::FAQ> =head1 MOTIVATION It is true that there are a number of packages out there to do HTML templates. On the one hand you have things like L<HTML::Embperl> which allows you freely mix Perl with HTML. On the other hand lie home-grown variable substitution solutions. Hopefully the module can find a place between the two. One advantage of this module over a full L<HTML::Embperl>-esque solution is that it enforces an important divide - design and programming. By limiting the programmer to just using simple variables and loops in the HTML, the template remains accessible to designers and other non-perl people. The use of HTML-esque syntax goes further to make the format understandable to others. In the future this similarity could be used to extend existing HTML editors/analyzers to support HTML::Template. An advantage of this module over home-grown tag-replacement schemes is the support for loops. In my work I am often called on to produce tables of data in html. Producing them using simplistic HTML templates results in programs containing lots of HTML since the HTML itself cannot represent loops. The introduction of loop statements in the HTML simplifies this situation considerably. The designer can layout a single row and the programmer can fill it in as many times as necessary - all they must agree on is the parameter names. For all that, I think the best thing about this module is that it does just one thing and it does it quickly and carefully. It doesn't try to replace Perl and HTML, it just augments them to interact a little better. And it's pretty fast. =head1 THE TAGS =head2 TMPL_VAR <TMPL_VAR NAME="PARAMETER_NAME"> The C<< <TMPL_VAR> >> tag is very simple. For each C<< <TMPL_VAR> >> tag in the template you call: $template->param(PARAMETER_NAME => "VALUE") When the template is output the C<< <TMPL_VAR> >> is replaced with the VALUE text you specified. If you don't set a parameter it just gets skipped in the output. You can also specify the value of the parameter as a code reference in order to have "lazy" variables. These sub routines will only be referenced if the variables are used. See L<LAZY VALUES> for more information. =head3 Attributes The following "attributes" can also be specified in template var tags: =over =item * escape This allows you to escape the value before it's put into the output. This is useful when you want to use a TMPL_VAR in a context where those characters would cause trouble. For example: <input name=param type=text value="<TMPL_VAR PARAM>"> If you called C<param()> with a value like C<sam"my> you'll get in trouble with HTML's idea of a double-quote. On the other hand, if you use C<escape=html>, like this: <input name=param type=text value="<TMPL_VAR PARAM ESCAPE=HTML>"> You'll get what you wanted no matter what value happens to be passed in for param. The following escape values are supported: =over =item * html Replaces the following characters with their HTML entity equivalent: C<&>, C<">, C<'>, C<< < >>, C<< > >> =item * js Escapes (with a backslash) the following characters: C<\>, C<'>, C<">, C<\n>, C<\r> =item * url URL escapes any ASCII characters except for letters, numbers, C<_>, C<.> and C<->. =item * none Performs no escaping. This is the default, but it's useful to be able to explicitly turn off escaping if you are using the C<default_escape> option. =back =item * default With this attribute you can assign a default value to a variable. For example, this will output "the devil gave me a taco" if the C<who> variable is not set. <TMPL_VAR WHO DEFAULT="the devil"> gave me a taco. =back =head2 TMPL_LOOP <TMPL_LOOP NAME="LOOP_NAME"> ... </TMPL_LOOP> The C<< <TMPL_LOOP> >> tag is a bit more complicated than C<< <TMPL_VAR> >>. The C<< <TMPL_LOOP> >> tag allows you to delimit a section of text and give it a name. Inside this named loop you place C<< <TMPL_VAR> >>s. Now you pass to C<param()> a list (an array ref) of parameter assignments (hash refs) for this loop. The loop iterates over the list and produces output from the text block for each pass. Unset parameters are skipped. Here's an example: In the template: <TMPL_LOOP NAME=EMPLOYEE_INFO> Name: <TMPL_VAR NAME=NAME> <br> Job: <TMPL_VAR NAME=JOB> <p> </TMPL_LOOP> In your Perl code: $template->param( EMPLOYEE_INFO => [{name => 'Sam', job => 'programmer'}, {name => 'Steve', job => 'soda jerk'}] ); print $template->output(); The output is: Name: Sam Job: programmer Name: Steve Job: soda jerk As you can see above the C<< <TMPL_LOOP> >> takes a list of variable assignments and then iterates over the loop body producing output. Often you'll want to generate a C<< <TMPL_LOOP> >>'s contents programmatically. Here's an example of how this can be done (many other ways are possible!): # a couple of arrays of data to put in a loop: my @words = qw(I Am Cool); my @numbers = qw(1 2 3); my @loop_data = (); # initialize an array to hold your loop while (@words and @numbers) { my %row_data; # get a fresh hash for the row data # fill in this row $row_data{WORD} = shift @words; $row_data{NUMBER} = shift @numbers; # the crucial step - push a reference to this row into the loop! push(@loop_data, \%row_data); } # finally, assign the loop data to the loop param, again with a reference: $template->param(THIS_LOOP => \@loop_data); The above example would work with a template like: <TMPL_LOOP NAME="THIS_LOOP"> Word: <TMPL_VAR NAME="WORD"> Number: <TMPL_VAR NAME="NUMBER"> </TMPL_LOOP> It would produce output like: Word: I Number: 1 Word: Am Number: 2 Word: Cool Number: 3 C<< <TMPL_LOOP> >>s within C<< <TMPL_LOOP> >>s are fine and work as you would expect. If the syntax for the C<param()> call has you stumped, here's an example of a param call with one nested loop: $template->param( LOOP => [ { name => 'Bobby', nicknames => [{name => 'the big bad wolf'}, {name => 'He-Man'}], }, ], ); Basically, each C<< <TMPL_LOOP> >> gets an array reference. Inside the array are any number of hash references. These hashes contain the name=>value pairs for a single pass over the loop template. Inside a C<< <TMPL_LOOP> >>, the only variables that are usable are the ones from the C<< <TMPL_LOOP> >>. The variables in the outer blocks are not visible within a template loop. For the computer-science geeks among you, a C<< <TMPL_LOOP> >> introduces a new scope much like a perl subroutine call. If you want your variables to be global you can use C<global_vars> option to C<new()> described below. =head2 TMPL_INCLUDE <TMPL_INCLUDE NAME="filename.tmpl"> This tag includes a template directly into the current template at the point where the tag is found. The included template contents are used exactly as if its contents were physically included in the master template. The file specified can be an absolute path (beginning with a '/' under Unix, for example). If it isn't absolute, the path to the enclosing file is tried first. After that the path in the environment variable C<HTML_TEMPLATE_ROOT> is tried, if it exists. Next, the "path" option is consulted, first as-is and then with C<HTML_TEMPLATE_ROOT> prepended if available. As a final attempt, the filename is passed to C<open()> directly. See below for more information on C<HTML_TEMPLATE_ROOT> and the C<path> option to C<new()>. As a protection against infinitely recursive includes, an arbitrary limit of 10 levels deep is imposed. You can alter this limit with the C<max_includes> option. See the entry for the C<max_includes> option below for more details. =head2 TMPL_IF <TMPL_IF NAME="PARAMETER_NAME"> ... </TMPL_IF> The C<< <TMPL_IF> >> tag allows you to include or not include a block of the template based on the value of a given parameter name. If the parameter is given a value that is true for Perl - like '1' - then the block is included in the output. If it is not defined, or given a false value - like '0' - then it is skipped. The parameters are specified the same way as with C<< <TMPL_VAR> >>. Example Template: <TMPL_IF NAME="BOOL"> Some text that only gets displayed if BOOL is true! </TMPL_IF> Now if you call C<< $template->param(BOOL => 1) >> then the above block will be included by output. C<< <TMPL_IF> </TMPL_IF> >> blocks can include any valid HTML::Template construct - C<VAR>s and C<LOOP>s and other C<IF>/C<ELSE> blocks. Note, however, that intersecting a C<< <TMPL_IF> >> and a C<< <TMPL_LOOP> >> is invalid. Not going to work: <TMPL_IF BOOL> <TMPL_LOOP SOME_LOOP> </TMPL_IF> </TMPL_LOOP> If the name of a C<< <TMPL_LOOP> >> is used in a C<< <TMPL_IF> >>, the C<IF> block will output if the loop has at least one row. Example: <TMPL_IF LOOP_ONE> This will output if the loop is not empty. </TMPL_IF> <TMPL_LOOP LOOP_ONE> .... </TMPL_LOOP> WARNING: Much of the benefit of HTML::Template is in decoupling your Perl and HTML. If you introduce numerous cases where you have C<TMPL_IF>s and matching Perl C<if>s, you will create a maintenance problem in keeping the two synchronized. I suggest you adopt the practice of only using C<TMPL_IF> if you can do so without requiring a matching C<if> in your Perl code. =head2 TMPL_ELSE <TMPL_IF NAME="PARAMETER_NAME"> ... <TMPL_ELSE> ... </TMPL_IF> You can include an alternate block in your C<< <TMPL_IF> >> block by using C<< <TMPL_ELSE> >>. NOTE: You still end the block with C<< </TMPL_IF> >>, not C<< </TMPL_ELSE> >>! Example: <TMPL_IF BOOL> Some text that is included only if BOOL is true <TMPL_ELSE> Some text that is included only if BOOL is false </TMPL_IF> =head2 TMPL_UNLESS <TMPL_UNLESS NAME="PARAMETER_NAME"> ... </TMPL_UNLESS> This tag is the opposite of C<< <TMPL_IF> >>. The block is output if the C<PARAMETER_NAME> is set false or not defined. You can use C<< <TMPL_ELSE> >> with C<< <TMPL_UNLESS> >> just as you can with C<< <TMPL_IF> >>. Example: <TMPL_UNLESS BOOL> Some text that is output only if BOOL is FALSE. <TMPL_ELSE> Some text that is output only if BOOL is TRUE. </TMPL_UNLESS> If the name of a C<< <TMPL_LOOP> >> is used in a C<< <TMPL_UNLESS> >>, the C<< <UNLESS> >> block output if the loop has zero rows. <TMPL_UNLESS LOOP_ONE> This will output if the loop is empty. </TMPL_UNLESS> <TMPL_LOOP LOOP_ONE> .... </TMPL_LOOP> =cut =head2 NOTES HTML::Template's tags are meant to mimic normal HTML tags. However, they are allowed to "break the rules". Something like: <img src="<TMPL_VAR IMAGE_SRC>"> is not really valid HTML, but it is a perfectly valid use and will work as planned. The C<NAME=> in the tag is optional, although for extensibility's sake I recommend using it. Example - C<< <TMPL_LOOP LOOP_NAME> >> is acceptable. If you're a fanatic about valid HTML and would like your templates to conform to valid HTML syntax, you may optionally type template tags in the form of HTML comments. This may be of use to HTML authors who would like to validate their templates' HTML syntax prior to HTML::Template processing, or who use DTD-savvy editing tools. <!-- TMPL_VAR NAME=PARAM1 --> In order to realize a dramatic savings in bandwidth, the standard (non-comment) tags will be used throughout this documentation. =head1 METHODS =head2 new Call C<new()> to create a new Template object: my $template = HTML::Template->new( filename => 'file.tmpl', option => 'value', ); You must call C<new()> with at least one C<name => value> pair specifying how to access the template text. You can use C<< filename => 'file.tmpl' >> to specify a filename to be opened as the template. Alternately you can use: my $t = HTML::Template->new( scalarref => $ref_to_template_text, option => 'value', ); and my $t = HTML::Template->new( arrayref => $ref_to_array_of_lines, option => 'value', ); These initialize the template from in-memory resources. In almost every case you'll want to use the filename parameter. If you're worried about all the disk access from reading a template file just use mod_perl and the cache option detailed below. You can also read the template from an already opened filehandle, either traditionally as a glob or as a L<FileHandle>: my $t = HTML::Template->new(filehandle => *FH, option => 'value'); The four C<new()> calling methods can also be accessed as below, if you prefer. my $t = HTML::Template->new_file('file.tmpl', option => 'value'); my $t = HTML::Template->new_scalar_ref($ref_to_template_text, option => 'value'); my $t = HTML::Template->new_array_ref($ref_to_array_of_lines, option => 'value'); my $t = HTML::Template->new_filehandle($fh, option => 'value'); And as a final option, for those that might prefer it, you can call new as: my $t = HTML::Template->new( type => 'filename', source => 'file.tmpl', ); Which works for all three of the source types. If the environment variable C<HTML_TEMPLATE_ROOT> is set and your filename doesn't begin with "/", then the path will be relative to the value of c<HTML_TEMPLATE_ROOT>. B<Example> - if the environment variable C<HTML_TEMPLATE_ROOT> is set to F</home/sam> and I call C<< HTML::Template->new() >> with filename set to "sam.tmpl", HTML::Template will try to open F</home/sam/sam.tmpl> to access the template file. You can also affect the search path for files with the C<path> option to C<new()> - see below for more information. You can modify the Template object's behavior with C<new()>. The options are available: =head3 Error Detection Options =over =item * die_on_bad_params If set to 0 the module will let you call: $template->param(param_name => 'value') even if 'param_name' doesn't exist in the template body. Defaults to 1. =item * force_untaint If set to 1 the module will not allow you to set unescaped parameters with tainted values. If set to 2 you will have to untaint all parameters, including ones with the escape attribute. This option makes sure you untaint everything so you don't accidentally introduce e.g. cross-site-scripting (XSS) vulnerabilities. Requires taint mode. Defaults to 0. =item * strict - if set to 0 the module will allow things that look like they might be TMPL_* tags to get by without dieing. Example: <TMPL_HUH NAME=ZUH> Would normally cause an error, but if you call new with C<< strict => 0 >> HTML::Template will ignore it. Defaults to 1. =item * vanguard_compatibility_mode If set to 1 the module will expect to see C<< <TMPL_VAR> >>s that look like C<%NAME%> in addition to the standard syntax. Also sets C<die_on_bad_params => 0>. If you're not at Vanguard Media trying to use an old format template don't worry about this one. Defaults to 0. =back =head3 Caching Options =over =item * cache If set to 1 the module will cache in memory the parsed templates based on the filename parameter, the modification date of the file and the options passed to C<new()>. This only applies to templates opened with the filename parameter specified, not scalarref or arrayref templates. Caching also looks at the modification times of any files included using C<< <TMPL_INCLUDE> >> tags, but again, only if the template is opened with filename parameter. This is mainly of use in a persistent environment like Apache/mod_perl. It has absolutely no benefit in a normal CGI environment since the script is unloaded from memory after every request. For a cache that does work for a non-persistent environment see the C<shared_cache> option below. My simplistic testing shows that using cache yields a 90% performance increase under mod_perl. Cache defaults to 0. =item * shared_cache If set to 1 the module will store its cache in shared memory using the L<IPC::SharedCache> module (available from CPAN). The effect of this will be to maintain a single shared copy of each parsed template for all instances of HTML::Template on the same machine to use. This can be a significant reduction in memory usage in an environment with a single machine but multiple servers. As an example, on one of our systems we use 4MB of template cache and maintain 25 httpd processes - shared_cache results in saving almost 100MB! Of course, some reduction in speed versus normal caching is to be expected. Another difference between normal caching and shared_cache is that shared_cache will work in a non-persistent environment (like normal CGI) - normal caching is only useful in a persistent environment like Apache/mod_perl. By default HTML::Template uses the IPC key 'TMPL' as a shared root segment (0x4c504d54 in hex), but this can be changed by setting the C<ipc_key> C<new()> parameter to another 4-character or integer key. Other options can be used to affect the shared memory cache correspond to L<IPC::SharedCache> options - C<ipc_mode>, C<ipc_segment_size> and C<ipc_max_size>. See L<IPC::SharedCache> for a description of how these work - in most cases you shouldn't need to change them from the defaults. For more information about the shared memory cache system used by HTML::Template see L<IPC::SharedCache>. =item * double_cache If set to 1 the module will use a combination of C<shared_cache> and normal cache mode for the best possible caching. Of course, it also uses the most memory of all the cache modes. All the same ipc_* options that work with C<shared_cache> apply to C<double_cache> as well. Defaults to 0. =item * blind_cache If set to 1 the module behaves exactly as with normal caching but does not check to see if the file has changed on each request. This option should be used with caution, but could be of use on high-load servers. My tests show C<blind_cache> performing only 1 to 2 percent faster than cache under mod_perl. B<NOTE>: Combining this option with shared_cache can result in stale templates stuck permanently in shared memory! =item * file_cache If set to 1 the module will store its cache in a file using the L<Storable> module. It uses no additional memory, and my simplistic testing shows that it yields a 50% performance advantage. Like C<shared_cache>, it will work in a non-persistent environments (like CGI). Default is 0. If you set this option you must set the C<file_cache_dir> option. See below for details. B<NOTE>: L<Storable> uses C<flock()> to ensure safe access to cache files. Using C<file_cache> on a system or filesystem (like NFS) without C<flock()> support is dangerous. =item * file_cache_dir Sets the directory where the module will store the cache files if C<file_cache> is enabled. Your script will need write permissions to this directory. You'll also need to make sure the sufficient space is available to store the cache files. =item * file_cache_dir_mode Sets the file mode for newly created C<file_cache> directories and subdirectories. Defaults to "0700" for security but this may be inconvenient if you do not have access to the account running the webserver. =item * double_file_cache If set to 1 the module will use a combination of C<file_cache> and normal C<cache> mode for the best possible caching. The file_cache_* options that work with file_cache apply to C<double_file_cache> as well. Defaults to 0. =item * cache_lazy_vars The option tells HTML::Template to cache the values returned from code references used for C<TMPL_VAR>s. See L<LAZY VALUES> for details. =item * cache_lazy_loops The option tells HTML::Template to cache the values returned from code references used for C<TMPL_LOOP>s. See L<LAZY VALUES> for details. =back =head3 Filesystem Options =over =item * path You can set this variable with a list of paths to search for files specified with the C<filename> option to C<new()> and for files included with the C<< <TMPL_INCLUDE> >> tag. This list is only consulted when the filename is relative. The C<HTML_TEMPLATE_ROOT> environment variable is always tried first if it exists. Also, if C<HTML_TEMPLATE_ROOT> is set then an attempt will be made to prepend C<HTML_TEMPLATE_ROOT> onto paths in the path array. In the case of a C<< <TMPL_INCLUDE> >> file, the path to the including file is also tried before path is consulted. Example: my $template = HTML::Template->new( filename => 'file.tmpl', path => ['/path/to/templates', '/alternate/path'], ); B<NOTE>: the paths in the path list must be expressed as UNIX paths, separated by the forward-slash character ('/'). =item * search_path_on_include If set to a true value the module will search from the top of the array of paths specified by the path option on every C<< <TMPL_INCLUDE> >> and use the first matching template found. The normal behavior is to look only in the current directory for a template to include. Defaults to 0. =item * utf8 Setting this to true tells HTML::Template to treat your template files as UTF-8 encoded. This will apply to any file's passed to C<new()> or any included files. It won't do anything special to scalars templates passed to C<new()> since you should be doing the encoding on those yourself. my $template = HTML::Template->new( filename => 'umlauts_are_awesome.tmpl', utf8 => 1, ); Most templates are either ASCII (the default) or UTF-8 encoded Unicode. But if you need some other encoding other than these 2, look at the C<open_mode> option. B<NOTE>: The C<utf8> and C<open_mode> options cannot be used at the same time. =item * open_mode You can set this option to an opening mode with which all template files will be opened. For example, if you want to use a template that is UTF-16 encoded unicode: my $template = HTML::Template->new( filename => 'file.tmpl', open_mode => '<:encoding(UTF-16)', ); That way you can force a different encoding (than the default ASCII or UTF-8), CR/LF properties etc. on the template files. See L<PerlIO> for details. B<NOTE>: this only works in perl 5.7.1 and above. B<NOTE>: you have to supply an opening mode that actually permits reading from the file handle. B<NOTE>: The C<utf8> and C<open_mode> options cannot be used at the same time. =back =head3 Debugging Options =over =item * debug If set to 1 the module will write random debugging information to STDERR. Defaults to 0. =item * stack_debug If set to 1 the module will use Data::Dumper to print out the contents of the parse_stack to STDERR. Defaults to 0. =item * cache_debug If set to 1 the module will send information on cache loads, hits and misses to STDERR. Defaults to 0. =item * shared_cache_debug If set to 1 the module will turn on the debug option in L<IPC::SharedCache>. Defaults to 0. =item * memory_debug If set to 1 the module will send information on cache memory usage to STDERR. Requires the L<GTop> module. Defaults to 0. =back =head3 Miscellaneous Options =over =item * associate This option allows you to inherit the parameter values from other objects. The only requirement for the other object is that it have a C<param()> method that works like HTML::Template's C<param()>. A good candidate would be a L<CGI> query object. Example: my $query = CGI->new; my $template = HTML::Template->new( filename => 'template.tmpl', associate => $query, ); Now, C<< $template->output() >> will act as though $template->param(form_field => $cgi->param('form_field')); had been specified for each key/value pair that would be provided by the C<< $cgi->param() >> method. Parameters you set directly take precedence over associated parameters. You can specify multiple objects to associate by passing an anonymous array to the associate option. They are searched for parameters in the order they appear: my $template = HTML::Template->new( filename => 'template.tmpl', associate => [$query, $other_obj], ); B<NOTE>: The parameter names are matched in a case-insensitive manner. If you have two parameters in a CGI object like 'NAME' and 'Name' one will be chosen randomly by associate. This behavior can be changed by the C<case_sensitive> option. =item * case_sensitive Setting this option to true causes HTML::Template to treat template variable names case-sensitively. The following example would only set one parameter without the C<case_sensitive> option: my $template = HTML::Template->new( filename => 'template.tmpl', case_sensitive => 1 ); $template->param( FieldA => 'foo', fIELDa => 'bar', ); This option defaults to off. B<NOTE>: with C<case_sensitive> and C<loop_context_vars> the special loop variables are available in lower-case only. =item * loop_context_vars When this parameter is set to true (it is false by default) extra variables that depend on the loop's context are made available inside a loop. These are: =over =item * __first__ Value that is true for the first iteration of the loop and false every other time. =item * __last__ Value that is true for the last iteration of the loop and false every other time. =item * __inner__ Value that is true for the every iteration of the loop except for the first and last. =item * __outer__ Value that is true for the first and last iterations of the loop. =item * __odd__ Value that is true for the every odd iteration of the loop. =item * __even__ Value that is true for the every even iteration of the loop. =item * __counter__ An integer (starting from 1) whose value increments for each iteration of the loop. =item * __index__ An integer (starting from 0) whose value increments for each iteration of the loop. =back Just like any other C<TMPL_VAR>s these variables can be used in C<< <TMPL_IF> >>, C<< <TMPL_UNLESS> >> and C<< <TMPL_ELSE> >> to control how a loop is output. Example: <TMPL_LOOP NAME="FOO"> <TMPL_IF NAME="__first__"> This only outputs on the first pass. </TMPL_IF> <TMPL_IF NAME="__odd__"> This outputs every other pass, on the odd passes. </TMPL_IF> <TMPL_UNLESS NAME="__odd__"> This outputs every other pass, on the even passes. </TMPL_UNLESS> <TMPL_IF NAME="__inner__"> This outputs on passes that are neither first nor last. </TMPL_IF> This is pass number <TMPL_VAR NAME="__counter__">. <TMPL_IF NAME="__last__"> This only outputs on the last pass. </TMPL_IF> </TMPL_LOOP> One use of this feature is to provide a "separator" similar in effect to the perl function C<join()>. Example: <TMPL_LOOP FRUIT> <TMPL_IF __last__> and </TMPL_IF> <TMPL_VAR KIND><TMPL_UNLESS __last__>, <TMPL_ELSE>.</TMPL_UNLESS> </TMPL_LOOP> Would output something like: Apples, Oranges, Brains, Toes, and Kiwi. Given an appropriate C<param()> call, of course. B<NOTE>: A loop with only a single pass will get both C<__first__> and C<__last__> set to true, but not C<__inner__>. =item * no_includes Set this option to 1 to disallow the C<< <TMPL_INCLUDE> >> tag in the template file. This can be used to make opening untrusted templates B<slightly> less dangerous. Defaults to 0. =item * max_includes Set this variable to determine the maximum depth that includes can reach. Set to 10 by default. Including files to a depth greater than this value causes an error message to be displayed. Set to 0 to disable this protection. =item * die_on_missing_include If true, then HTML::Template will die if it can't find a file for a C<< <TMPL_INCLUDE> >>. This defaults to true. =item * global_vars Normally variables declared outside a loop are not available inside a loop. This option makes C<< <TMPL_VAR> >>s like global variables in Perl - they have unlimited scope. This option also affects C<< <TMPL_IF> >> and C<< <TMPL_UNLESS> >>. Example: This is a normal variable: <TMPL_VAR NORMAL>.<P> <TMPL_LOOP NAME=FROOT_LOOP> Here it is inside the loop: <TMPL_VAR NORMAL><P> </TMPL_LOOP> Normally this wouldn't work as expected, since C<< <TMPL_VAR NORMAL> >>'s value outside the loop is not available inside the loop. The global_vars option also allows you to access the values of an enclosing loop within an inner loop. For example, in this loop the inner loop will have access to the value of C<OUTER_VAR> in the correct iteration: <TMPL_LOOP OUTER_LOOP> OUTER: <TMPL_VAR OUTER_VAR> <TMPL_LOOP INNER_LOOP> INNER: <TMPL_VAR INNER_VAR> INSIDE OUT: <TMPL_VAR OUTER_VAR> </TMPL_LOOP> </TMPL_LOOP> One side-effect of C<global_vars> is that variables you set with C<param()> that might otherwise be ignored when C<die_on_bad_params> is off will stick around. This is necessary to allow inner loops to access values set for outer loops that don't directly use the value. B<NOTE>: C<global_vars> is not C<global_loops> (which does not exist). That means that loops you declare at one scope are not available inside other loops even when C<global_vars> is on. =item * filter This option allows you to specify a filter for your template files. A filter is a subroutine that will be called after HTML::Template reads your template file but before it starts parsing template tags. In the most simple usage, you simply assign a code reference to the filter parameter. This subroutine will receive a single argument - a reference to a string containing the template file text. Here is an example that accepts templates with tags that look like C<!!!ZAP_VAR FOO!!!> and transforms them into HTML::Template tags: my $filter = sub { my $text_ref = shift; $$text_ref =~ s/!!!ZAP_(.*?)!!!/<TMPL_$1>/g; }; # open zap.tmpl using the above filter my $template = HTML::Template->new( filename => 'zap.tmpl', filter => $filter, ); More complicated usages are possible. You can request that your filter receives the template text as an array of lines rather than as a single scalar. To do that you need to specify your filter using a hash-ref. In this form you specify the filter using the C<sub> key and the desired argument format using the C<format> key. The available formats are C<scalar> and C<array>. Using the C<array> format will incur a performance penalty but may be more convenient in some situations. my $template = HTML::Template->new( filename => 'zap.tmpl', filter => { sub => $filter, format => 'array', } ); You may also have multiple filters. This allows simple filters to be combined for more elaborate functionality. To do this you specify an array of filters. The filters are applied in the order they are specified. my $template = HTML::Template->new( filename => 'zap.tmpl', filter => [ { sub => \&decompress, format => 'scalar', }, { sub => \&remove_spaces, format => 'array', }, ] ); The specified filters will be called for any C<TMPL_INCLUDE>ed files just as they are for the main template file. =item * default_escape Set this parameter to a valid escape type (see the C<escape> option) and HTML::Template will apply the specified escaping to all variables unless they declare a different escape in the template. =back =cut use integer; # no floating point math so far! use strict; # and no funny business, either. use Carp; # generate better errors with more context use File::Spec; # generate paths that work on all platforms use Digest::MD5 qw(md5_hex); # generate cache keys use Scalar::Util qw(tainted); # define accessor constants used to improve readability of array # accesses into "objects". I used to use 'use constant' but that # seems to cause occasional irritating warnings in older Perls. package HTML::Template::LOOP; sub TEMPLATE_HASH () { 0 } sub PARAM_SET () { 1 } package HTML::Template::COND; sub VARIABLE () { 0 } sub VARIABLE_TYPE () { 1 } sub VARIABLE_TYPE_VAR () { 0 } sub VARIABLE_TYPE_LOOP () { 1 } sub JUMP_IF_TRUE () { 2 } sub JUMP_ADDRESS () { 3 } sub WHICH () { 4 } sub UNCONDITIONAL_JUMP () { 5 } sub IS_ELSE () { 6 } sub WHICH_IF () { 0 } sub WHICH_UNLESS () { 1 } # back to the main package scope. package HTML::Template; my %OPTIONS; # set the default options BEGIN { %OPTIONS = ( debug => 0, stack_debug => 0, timing => 0, search_path_on_include => 0, cache => 0, blind_cache => 0, file_cache => 0, file_cache_dir => '', file_cache_dir_mode => 0700, force_untaint => 0, cache_debug => 0, shared_cache_debug => 0, memory_debug => 0, die_on_bad_params => 1, vanguard_compatibility_mode => 0, associate => [], path => [], strict => 1, loop_context_vars => 0, max_includes => 10, shared_cache => 0, double_cache => 0, double_file_cache => 0, ipc_key => 'TMPL', ipc_mode => 0666, ipc_segment_size => 65536, ipc_max_size => 0, global_vars => 0, no_includes => 0, case_sensitive => 0, filter => [], open_mode => '', utf8 => 0, cache_lazy_vars => 0, cache_lazy_loops => 0, die_on_missing_include => 1, ); } # open a new template and return an object handle sub new { my $pkg = shift; my $self; { my %hash; $self = bless(\%hash, $pkg); } # the options hash my $options = {}; $self->{options} = $options; # set default parameters in options hash %$options = %OPTIONS; # load in options supplied to new() $options = _load_supplied_options([@_], $options); # blind_cache = 1 implies cache = 1 $options->{blind_cache} and $options->{cache} = 1; # shared_cache = 1 implies cache = 1 $options->{shared_cache} and $options->{cache} = 1; # file_cache = 1 implies cache = 1 $options->{file_cache} and $options->{cache} = 1; # double_cache is a combination of shared_cache and cache. $options->{double_cache} and $options->{cache} = 1; $options->{double_cache} and $options->{shared_cache} = 1; # double_file_cache is a combination of file_cache and cache. $options->{double_file_cache} and $options->{cache} = 1; $options->{double_file_cache} and $options->{file_cache} = 1; # vanguard_compatibility_mode implies die_on_bad_params = 0 $options->{vanguard_compatibility_mode} and $options->{die_on_bad_params} = 0; # handle the "type", "source" parameter format (does anyone use it?) if (exists($options->{type})) { exists($options->{source}) or croak("HTML::Template->new() called with 'type' parameter set, but no 'source'!"); ( $options->{type} eq 'filename' or $options->{type} eq 'scalarref' or $options->{type} eq 'arrayref' or $options->{type} eq 'filehandle' ) or croak("HTML::Template->new() : type parameter must be set to 'filename', 'arrayref', 'scalarref' or 'filehandle'!"); $options->{$options->{type}} = $options->{source}; delete $options->{type}; delete $options->{source}; } # make sure taint mode is on if force_untaint flag is set if ($options->{force_untaint}) { if ($] < 5.008000) { warn("HTML::Template->new() : 'force_untaint' option needs at least Perl 5.8.0!"); } elsif (!${^TAINT}) { croak("HTML::Template->new() : 'force_untaint' option set but perl does not run in taint mode!"); } } # associate should be an array of one element if it's not # already an array. if (ref($options->{associate}) ne 'ARRAY') { $options->{associate} = [$options->{associate}]; } # path should be an array if it's not already if (ref($options->{path}) ne 'ARRAY') { $options->{path} = [$options->{path}]; } # filter should be an array if it's not already if (ref($options->{filter}) ne 'ARRAY') { $options->{filter} = [$options->{filter}]; } # make sure objects in associate area support param() foreach my $object (@{$options->{associate}}) { defined($object->can('param')) or croak("HTML::Template->new called with associate option, containing object of type " . ref($object) . " which lacks a param() method!"); } # check for syntax errors: my $source_count = 0; exists($options->{filename}) and $source_count++; exists($options->{filehandle}) and $source_count++; exists($options->{arrayref}) and $source_count++; exists($options->{scalarref}) and $source_count++; if ($source_count != 1) { croak( "HTML::Template->new called with multiple (or no) template sources specified! A valid call to new() has exactly one filename => 'file' OR exactly one scalarref => \\\$scalar OR exactly one arrayref => \\\@array OR exactly one filehandle => \*FH" ); } # check that cache options are not used with non-cacheable templates croak "Cannot have caching when template source is not file" if grep { exists($options->{$_}) } qw( filehandle arrayref scalarref) and grep { $options->{$_} } qw( cache blind_cache file_cache shared_cache double_cache double_file_cache ); # check that filenames aren't empty if (exists($options->{filename})) { croak("HTML::Template->new called with empty filename parameter!") unless length $options->{filename}; } # do some memory debugging - this is best started as early as possible if ($options->{memory_debug}) { # memory_debug needs GTop eval { require GTop; }; croak("Could not load GTop. You must have GTop installed to use HTML::Template in memory_debug mode. The error was: $@") if ($@); $self->{gtop} = GTop->new(); $self->{proc_mem} = $self->{gtop}->proc_mem($$); print STDERR "\n### HTML::Template Memory Debug ### START ", $self->{proc_mem}->size(), "\n"; } if ($options->{file_cache}) { # make sure we have a file_cache_dir option croak("You must specify the file_cache_dir option if you want to use file_cache.") unless length $options->{file_cache_dir}; # file_cache needs some extra modules loaded eval { require Storable; }; croak( "Could not load Storable. You must have Storable installed to use HTML::Template in file_cache mode. The error was: $@" ) if ($@); } if ($options->{shared_cache}) { # shared_cache needs some extra modules loaded eval { require IPC::SharedCache; }; croak( "Could not load IPC::SharedCache. You must have IPC::SharedCache installed to use HTML::Template in shared_cache mode. The error was: $@" ) if ($@); # initialize the shared cache my %cache; tie %cache, 'IPC::SharedCache', ipc_key => $options->{ipc_key}, load_callback => [\&_load_shared_cache, $self], validate_callback => [\&_validate_shared_cache, $self], debug => $options->{shared_cache_debug}, ipc_mode => $options->{ipc_mode}, max_size => $options->{ipc_max_size}, ipc_segment_size => $options->{ipc_segment_size}; $self->{cache} = \%cache; } if ($options->{default_escape}) { $options->{default_escape} = uc $options->{default_escape}; unless ($options->{default_escape} =~ /^(NONE|HTML|URL|JS)$/i) { croak( "HTML::Template->new(): Invalid setting for default_escape - '$options->{default_escape}'. Valid values are 'none', 'html', 'url', or 'js'." ); } } # no 3 args form of open before perl 5.7.1 if ($options->{open_mode} && $] < 5.007001) { croak("HTML::Template->new(): open_mode cannot be used in Perl < 5.7.1"); } if($options->{utf8}) { croak("HTML::Template->new(): utf8 cannot be used in Perl < 5.7.1") if $] < 5.007001; croak("HTML::Template->new(): utf8 and open_mode cannot be used at the same time") if $options->{open_mode}; # utf8 is just a short-cut for a common open_mode $options->{open_mode} = '<:encoding(utf8)'; } print STDERR "### HTML::Template Memory Debug ### POST CACHE INIT ", $self->{proc_mem}->size(), "\n" if $options->{memory_debug}; # initialize data structures $self->_init; print STDERR "### HTML::Template Memory Debug ### POST _INIT CALL ", $self->{proc_mem}->size(), "\n" if $options->{memory_debug}; # drop the shared cache - leaving out this step results in the # template object evading garbage collection since the callbacks in # the shared cache tie hold references to $self! This was not easy # to find, by the way. delete $self->{cache} if $options->{shared_cache}; return $self; } sub _load_supplied_options { my $argsref = shift; my $options = shift; for (my $x = 0 ; $x < @{$argsref} ; $x += 2) { defined(${$argsref}[($x + 1)]) or croak("HTML::Template->new() called with odd number of option parameters - should be of the form option => value"); $options->{lc(${$argsref}[$x])} = ${$argsref}[($x + 1)]; } return $options; } # an internally used new that receives its parse_stack and param_map as input sub _new_from_loop { my $pkg = shift; my $self; { my %hash; $self = bless(\%hash, $pkg); } # the options hash my $options = { debug => $OPTIONS{debug}, stack_debug => $OPTIONS{stack_debug}, die_on_bad_params => $OPTIONS{die_on_bad_params}, associate => [@{$OPTIONS{associate}}], loop_context_vars => $OPTIONS{loop_context_vars}, }; $self->{options} = $options; $options = _load_supplied_options([@_], $options); $self->{param_map} = $options->{param_map}; $self->{parse_stack} = $options->{parse_stack}; delete($options->{param_map}); delete($options->{parse_stack}); return $self; } # a few shortcuts to new(), of possible use... sub new_file { my $pkg = shift; return $pkg->new('filename', @_); } sub new_filehandle { my $pkg = shift; return $pkg->new('filehandle', @_); } sub new_array_ref { my $pkg = shift; return $pkg->new('arrayref', @_); } sub new_scalar_ref { my $pkg = shift; return $pkg->new('scalarref', @_); } # initializes all the object data structures, either from cache or by # calling the appropriate routines. sub _init { my $self = shift; my $options = $self->{options}; if ($options->{double_cache}) { # try the normal cache, return if we have it. $self->_fetch_from_cache(); return if (defined $self->{param_map} and defined $self->{parse_stack}); # try the shared cache $self->_fetch_from_shared_cache(); # put it in the local cache if we got it. $self->_commit_to_cache() if (defined $self->{param_map} and defined $self->{parse_stack}); } elsif ($options->{double_file_cache}) { # try the normal cache, return if we have it. $self->_fetch_from_cache(); return if (defined $self->{param_map}); # try the file cache $self->_fetch_from_file_cache(); # put it in the local cache if we got it. $self->_commit_to_cache() if (defined $self->{param_map}); } elsif ($options->{shared_cache}) { # try the shared cache $self->_fetch_from_shared_cache(); } elsif ($options->{file_cache}) { # try the file cache $self->_fetch_from_file_cache(); } elsif ($options->{cache}) { # try the normal cache $self->_fetch_from_cache(); } # if we got a cache hit, return return if (defined $self->{param_map}); # if we're here, then we didn't get a cached copy, so do a full # init. $self->_init_template(); $self->_parse(); # now that we have a full init, cache the structures if caching is # on. shared cache is already cool. if ($options->{file_cache}) { $self->_commit_to_file_cache(); } $self->_commit_to_cache() if ( ($options->{cache} and not $options->{shared_cache} and not $options->{file_cache}) or ($options->{double_cache}) or ($options->{double_file_cache})); } # Caching subroutines - they handle getting and validating cache # records from either the in-memory or shared caches. # handles the normal in memory cache use vars qw( %CACHE ); sub _fetch_from_cache { my $self = shift; my $options = $self->{options}; # return if there's no file here my $filepath = $self->_find_file($options->{filename}); return unless (defined($filepath)); $options->{filepath} = $filepath; # return if there's no cache entry for this key my $key = $self->_cache_key(); return unless exists($CACHE{$key}); # validate the cache my $mtime = $self->_mtime($filepath); if (defined $mtime) { # return if the mtime doesn't match the cache if (defined($CACHE{$key}{mtime}) and ($mtime != $CACHE{$key}{mtime})) { $options->{cache_debug} and print STDERR "CACHE MISS : $filepath : $mtime\n"; return; } # if the template has includes, check each included file's mtime # and return if different if (exists($CACHE{$key}{included_mtimes})) { foreach my $filename (keys %{$CACHE{$key}{included_mtimes}}) { next unless defined($CACHE{$key}{included_mtimes}{$filename}); my $included_mtime = (stat($filename))[9]; if ($included_mtime != $CACHE{$key}{included_mtimes}{$filename}) { $options->{cache_debug} and print STDERR "### HTML::Template Cache Debug ### CACHE MISS : $filepath : INCLUDE $filename : $included_mtime\n"; return; } } } } # got a cache hit! $options->{cache_debug} and print STDERR "### HTML::Template Cache Debug ### CACHE HIT : $filepath => $key\n"; $self->{param_map} = $CACHE{$key}{param_map}; $self->{parse_stack} = $CACHE{$key}{parse_stack}; exists($CACHE{$key}{included_mtimes}) and $self->{included_mtimes} = $CACHE{$key}{included_mtimes}; # clear out values from param_map from last run $self->_normalize_options(); $self->clear_params(); } sub _commit_to_cache { my $self = shift; my $options = $self->{options}; my $key = $self->_cache_key(); my $filepath = $options->{filepath}; $options->{cache_debug} and print STDERR "### HTML::Template Cache Debug ### CACHE LOAD : $filepath => $key\n"; $options->{blind_cache} or $CACHE{$key}{mtime} = $self->_mtime($filepath); $CACHE{$key}{param_map} = $self->{param_map}; $CACHE{$key}{parse_stack} = $self->{parse_stack}; exists($self->{included_mtimes}) and $CACHE{$key}{included_mtimes} = $self->{included_mtimes}; } # create a cache key from a template object. The cache key includes # the full path to the template and options which affect template # loading. sub _cache_key { my $self = shift; my $options = $self->{options}; # assemble pieces of the key my @key = ($options->{filepath}); push(@key, @{$options->{path}}); push(@key, $options->{search_path_on_include} || 0); push(@key, $options->{loop_context_vars} || 0); push(@key, $options->{global_vars} || 0); push(@key, $options->{open_mode} || 0); # compute the md5 and return it return md5_hex(@key); } # generates MD5 from filepath to determine filename for cache file sub _get_cache_filename { my ($self, $filepath) = @_; # get a cache key $self->{options}{filepath} = $filepath; my $hash = $self->_cache_key(); # ... and build a path out of it. Using the first two characters # gives us 255 buckets. This means you can have 255,000 templates # in the cache before any one directory gets over a few thousand # files in it. That's probably pretty good for this planet. If not # then it should be configurable. if (wantarray) { return (substr($hash, 0, 2), substr($hash, 2)); } else { return File::Spec->join($self->{options}{file_cache_dir}, substr($hash, 0, 2), substr($hash, 2)); } } # handles the file cache sub _fetch_from_file_cache { my $self = shift; my $options = $self->{options}; # return if there's no cache entry for this filename my $filepath = $self->_find_file($options->{filename}); return unless defined $filepath; my $cache_filename = $self->_get_cache_filename($filepath); return unless -e $cache_filename; eval { $self->{record} = Storable::lock_retrieve($cache_filename); }; croak("HTML::Template::new() - Problem reading cache file $cache_filename (file_cache => 1) : $@") if $@; croak("HTML::Template::new() - Problem reading cache file $cache_filename (file_cache => 1) : $!") unless defined $self->{record}; ($self->{mtime}, $self->{included_mtimes}, $self->{param_map}, $self->{parse_stack}) = @{$self->{record}}; $options->{filepath} = $filepath; # validate the cache my $mtime = $self->_mtime($filepath); if (defined $mtime) { # return if the mtime doesn't match the cache if (defined($self->{mtime}) and ($mtime != $self->{mtime})) { $options->{cache_debug} and print STDERR "### HTML::Template Cache Debug ### FILE CACHE MISS : $filepath : $mtime\n"; ($self->{mtime}, $self->{included_mtimes}, $self->{param_map}, $self->{parse_stack}) = (undef, undef, undef, undef); return; } # if the template has includes, check each included file's mtime # and return if different if (exists($self->{included_mtimes})) { foreach my $filename (keys %{$self->{included_mtimes}}) { next unless defined($self->{included_mtimes}{$filename}); my $included_mtime = (stat($filename))[9]; if ($included_mtime != $self->{included_mtimes}{$filename}) { $options->{cache_debug} and print STDERR "### HTML::Template Cache Debug ### FILE CACHE MISS : $filepath : INCLUDE $filename : $included_mtime\n"; ($self->{mtime}, $self->{included_mtimes}, $self->{param_map}, $self->{parse_stack}) = (undef, undef, undef, undef); return; } } } } # got a cache hit! $options->{cache_debug} and print STDERR "### HTML::Template Cache Debug ### FILE CACHE HIT : $filepath\n"; # clear out values from param_map from last run $self->_normalize_options(); $self->clear_params(); } sub _commit_to_file_cache { my $self = shift; my $options = $self->{options}; my $filepath = $options->{filepath}; if (not defined $filepath) { $filepath = $self->_find_file($options->{filename}); confess("HTML::Template->new() : Cannot open included file $options->{filename} : file not found.") unless defined($filepath); $options->{filepath} = $filepath; } my ($cache_dir, $cache_file) = $self->_get_cache_filename($filepath); $cache_dir = File::Spec->join($options->{file_cache_dir}, $cache_dir); if (not -d $cache_dir) { if (not -d $options->{file_cache_dir}) { mkdir($options->{file_cache_dir}, $options->{file_cache_dir_mode}) or croak("HTML::Template->new() : can't mkdir $options->{file_cache_dir} (file_cache => 1): $!"); } mkdir($cache_dir, $options->{file_cache_dir_mode}) or croak("HTML::Template->new() : can't mkdir $cache_dir (file_cache => 1): $!"); } $options->{cache_debug} and print STDERR "### HTML::Template Cache Debug ### FILE CACHE LOAD : $options->{filepath}\n"; my $result; eval { $result = Storable::lock_store([$self->{mtime}, $self->{included_mtimes}, $self->{param_map}, $self->{parse_stack}], scalar File::Spec->join($cache_dir, $cache_file)); }; croak("HTML::Template::new() - Problem writing cache file $cache_dir/$cache_file (file_cache => 1) : $@") if $@; croak("HTML::Template::new() - Problem writing cache file $cache_dir/$cache_file (file_cache => 1) : $!") unless defined $result; } # Shared cache routines. sub _fetch_from_shared_cache { my $self = shift; my $options = $self->{options}; my $filepath = $self->_find_file($options->{filename}); return unless defined $filepath; # fetch from the shared cache. $self->{record} = $self->{cache}{$filepath}; ($self->{mtime}, $self->{included_mtimes}, $self->{param_map}, $self->{parse_stack}) = @{$self->{record}} if defined($self->{record}); $options->{cache_debug} and defined($self->{record}) and print STDERR "### HTML::Template Cache Debug ### CACHE HIT : $filepath\n"; # clear out values from param_map from last run $self->_normalize_options(), $self->clear_params() if (defined($self->{record})); delete($self->{record}); return $self; } sub _validate_shared_cache { my ($self, $filename, $record) = @_; my $options = $self->{options}; $options->{shared_cache_debug} and print STDERR "### HTML::Template Cache Debug ### SHARED CACHE VALIDATE : $filename\n"; return 1 if $options->{blind_cache}; my ($c_mtime, $included_mtimes, $param_map, $parse_stack) = @$record; # if the modification time has changed return false my $mtime = $self->_mtime($filename); if ( defined $mtime and defined $c_mtime and $mtime != $c_mtime) { $options->{cache_debug} and print STDERR "### HTML::Template Cache Debug ### SHARED CACHE MISS : $filename : $mtime\n"; return 0; } # if the template has includes, check each included file's mtime # and return false if different if (defined $mtime and defined $included_mtimes) { foreach my $fname (keys %$included_mtimes) { next unless defined($included_mtimes->{$fname}); if ($included_mtimes->{$fname} != (stat($fname))[9]) { $options->{cache_debug} and print STDERR "### HTML::Template Cache Debug ### SHARED CACHE MISS : $filename : INCLUDE $fname\n"; return 0; } } } # all done - return true return 1; } sub _load_shared_cache { my ($self, $filename) = @_; my $options = $self->{options}; my $cache = $self->{cache}; $self->_init_template(); $self->_parse(); $options->{cache_debug} and print STDERR "### HTML::Template Cache Debug ### SHARED CACHE LOAD : $options->{filepath}\n"; print STDERR "### HTML::Template Memory Debug ### END CACHE LOAD ", $self->{proc_mem}->size(), "\n" if $options->{memory_debug}; return [$self->{mtime}, $self->{included_mtimes}, $self->{param_map}, $self->{parse_stack}]; } # utility function - given a filename performs documented search and # returns a full path or undef if the file cannot be found. sub _find_file { my ($self, $filename, $extra_path) = @_; my $options = $self->{options}; my $filepath; # first check for a full path return File::Spec->canonpath($filename) if (File::Spec->file_name_is_absolute($filename) and (-e $filename)); # try the extra_path if one was specified if (defined($extra_path)) { $extra_path->[$#{$extra_path}] = $filename; $filepath = File::Spec->canonpath(File::Spec->catfile(@$extra_path)); return File::Spec->canonpath($filepath) if -e $filepath; } # try pre-prending HTML_Template_Root if (defined($ENV{HTML_TEMPLATE_ROOT})) { $filepath = File::Spec->catfile($ENV{HTML_TEMPLATE_ROOT}, $filename); return File::Spec->canonpath($filepath) if -e $filepath; } # try "path" option list.. foreach my $path (@{$options->{path}}) { $filepath = File::Spec->catfile($path, $filename); return File::Spec->canonpath($filepath) if -e $filepath; } # try even a relative path from the current directory... return File::Spec->canonpath($filename) if -e $filename; # try "path" option list with HTML_TEMPLATE_ROOT prepended... if (defined($ENV{HTML_TEMPLATE_ROOT})) { foreach my $path (@{$options->{path}}) { $filepath = File::Spec->catfile($ENV{HTML_TEMPLATE_ROOT}, $path, $filename); return File::Spec->canonpath($filepath) if -e $filepath; } } return undef; } # utility function - computes the mtime for $filename sub _mtime { my ($self, $filepath) = @_; my $options = $self->{options}; return (undef) if ($options->{blind_cache}); # make sure it still exists in the filesystem (-r $filepath) or Carp::confess("HTML::Template : template file $filepath does not exist or is unreadable."); # get the modification time return (stat(_))[9]; } # utility function - enforces new() options across LOOPs that have # come from a cache. Otherwise they would have stale options hashes. sub _normalize_options { my $self = shift; my $options = $self->{options}; my @pstacks = ($self->{parse_stack}); while (@pstacks) { my $pstack = pop(@pstacks); foreach my $item (@$pstack) { next unless (ref($item) eq 'HTML::Template::LOOP'); foreach my $template (values %{$item->[HTML::Template::LOOP::TEMPLATE_HASH]}) { # must be the same list as the call to _new_from_loop... $template->{options}{debug} = $options->{debug}; $template->{options}{stack_debug} = $options->{stack_debug}; $template->{options}{die_on_bad_params} = $options->{die_on_bad_params}; $template->{options}{case_sensitive} = $options->{case_sensitive}; $template->{options}{parent_global_vars} = $options->{parent_global_vars}; push(@pstacks, $template->{parse_stack}); } } } } # initialize the template buffer sub _init_template { my $self = shift; my $options = $self->{options}; print STDERR "### HTML::Template Memory Debug ### START INIT_TEMPLATE ", $self->{proc_mem}->size(), "\n" if $options->{memory_debug}; if (exists($options->{filename})) { my $filepath = $options->{filepath}; if (not defined $filepath) { $filepath = $self->_find_file($options->{filename}); confess("HTML::Template->new() : Cannot open included file $options->{filename} : file not found.") unless defined($filepath); # we'll need this for future reference - to call stat() for example. $options->{filepath} = $filepath; } # use the open_mode if we have one if (my $mode = $options->{open_mode}) { open(TEMPLATE, $mode, $filepath) || confess("HTML::Template->new() : Cannot open included file $filepath with mode $mode: $!"); } else { open(TEMPLATE, $filepath) or confess("HTML::Template->new() : Cannot open included file $filepath : $!"); } $self->{mtime} = $self->_mtime($filepath); # read into scalar, note the mtime for the record $self->{template} = ""; while (read(TEMPLATE, $self->{template}, 10240, length($self->{template}))) { } close(TEMPLATE); } elsif (exists($options->{scalarref})) { # copy in the template text $self->{template} = ${$options->{scalarref}}; delete($options->{scalarref}); } elsif (exists($options->{arrayref})) { # if we have an array ref, join and store the template text $self->{template} = join("", @{$options->{arrayref}}); delete($options->{arrayref}); } elsif (exists($options->{filehandle})) { # just read everything in in one go local $/ = undef; $self->{template} = readline($options->{filehandle}); delete($options->{filehandle}); } else { confess("HTML::Template : Need to call new with filename, filehandle, scalarref or arrayref parameter specified."); } print STDERR "### HTML::Template Memory Debug ### END INIT_TEMPLATE ", $self->{proc_mem}->size(), "\n" if $options->{memory_debug}; # handle filters if necessary $self->_call_filters(\$self->{template}) if @{$options->{filter}}; return $self; } # handle calling user defined filters sub _call_filters { my $self = shift; my $template_ref = shift; my $options = $self->{options}; my ($format, $sub); foreach my $filter (@{$options->{filter}}) { croak("HTML::Template->new() : bad value set for filter parameter - must be a code ref or a hash ref.") unless ref $filter; # translate into CODE->HASH $filter = {'format' => 'scalar', 'sub' => $filter} if (ref $filter eq 'CODE'); if (ref $filter eq 'HASH') { $format = $filter->{'format'}; $sub = $filter->{'sub'}; # check types and values croak( "HTML::Template->new() : bad value set for filter parameter - hash must contain \"format\" key and \"sub\" key.") unless defined $format and defined $sub; croak("HTML::Template->new() : bad value set for filter parameter - \"format\" must be either 'array' or 'scalar'") unless $format eq 'array' or $format eq 'scalar'; croak("HTML::Template->new() : bad value set for filter parameter - \"sub\" must be a code ref") unless ref $sub and ref $sub eq 'CODE'; # catch errors eval { if ($format eq 'scalar') { # call $sub->($template_ref); } else { # modulate my @array = map { $_ . "\n" } split("\n", $$template_ref); # call $sub->(\@array); # demodulate $$template_ref = join("", @array); } }; croak("HTML::Template->new() : fatal error occurred during filter call: $@") if $@; } else { croak("HTML::Template->new() : bad value set for filter parameter - must be code ref or hash ref"); } } # all done return $template_ref; } # _parse sifts through a template building up the param_map and # parse_stack structures. # # The end result is a Template object that is fully ready for # output(). sub _parse { my $self = shift; my $options = $self->{options}; $options->{debug} and print STDERR "### HTML::Template Debug ### In _parse:\n"; # setup the stacks and maps - they're accessed by typeglobs that # reference the top of the stack. They are masked so that a loop # can transparently have its own versions. use vars qw(@pstack %pmap @ifstack @ucstack %top_pmap); local (*pstack, *ifstack, *pmap, *ucstack, *top_pmap); # the pstack is the array of scalar refs (plain text from the # template file), VARs, LOOPs, IFs and ELSEs that output() works on # to produce output. Looking at output() should make it clear what # _parse is trying to accomplish. my @pstacks = ([]); *pstack = $pstacks[0]; $self->{parse_stack} = $pstacks[0]; # the pmap binds names to VARs, LOOPs and IFs. It allows param() to # access the right variable. NOTE: output() does not look at the # pmap at all! my @pmaps = ({}); *pmap = $pmaps[0]; *top_pmap = $pmaps[0]; $self->{param_map} = $pmaps[0]; # the ifstack is a temporary stack containing pending ifs and elses # waiting for a /if. my @ifstacks = ([]); *ifstack = $ifstacks[0]; # the ucstack is a temporary stack containing conditions that need # to be bound to param_map entries when their block is finished. # This happens when a conditional is encountered before any other # reference to its NAME. Since a conditional can reference VARs and # LOOPs it isn't possible to make the link right away. my @ucstacks = ([]); *ucstack = $ucstacks[0]; # the loopstack is another temp stack for closing loops. unlike # those above it doesn't get scoped inside loops, therefore it # doesn't need the typeglob magic. my @loopstack = (); # the fstack is a stack of filenames and counters that keeps track # of which file we're in and where we are in it. This allows # accurate error messages even inside included files! # fcounter, fmax and fname are aliases for the current file's info use vars qw($fcounter $fname $fmax); local (*fcounter, *fname, *fmax); my @fstack = ([$options->{filepath} || "/fake/path/for/non/file/template", 1, scalar @{[$self->{template} =~ m/(\n)/g]} + 1]); (*fname, *fcounter, *fmax) = \(@{$fstack[0]}); my $NOOP = HTML::Template::NOOP->new(); my $ESCAPE = HTML::Template::ESCAPE->new(); my $JSESCAPE = HTML::Template::JSESCAPE->new(); my $URLESCAPE = HTML::Template::URLESCAPE->new(); # all the tags that need NAMEs: my %need_names = map { $_ => 1 } qw(TMPL_VAR TMPL_LOOP TMPL_IF TMPL_UNLESS TMPL_INCLUDE); # variables used below that don't need to be my'd in the loop my ($name, $which, $escape, $default); # handle the old vanguard format $options->{vanguard_compatibility_mode} and $self->{template} =~ s/%([-\w\/\.+]+)%/<TMPL_VAR NAME=$1>/g; # now split up template on '<', leaving them in my @chunks = split(m/(?=<)/, $self->{template}); # all done with template delete $self->{template}; # loop through chunks, filling up pstack my $last_chunk = $#chunks; CHUNK: for (my $chunk_number = 0 ; $chunk_number <= $last_chunk ; $chunk_number++) { next unless defined $chunks[$chunk_number]; my $chunk = $chunks[$chunk_number]; # a general regex to match any and all TMPL_* tags if ( $chunk =~ /^< (?:!--\s*)? ( \/?tmpl_ (?: (?:var) | (?:loop) | (?:if) | (?:else) | (?:unless) | (?:include) ) ) # $1 => $which - start of the tag \s* # DEFAULT attribute (?: default \s*=\s* (?: "([^">]*)" # $2 => double-quoted DEFAULT value " | '([^'>]*)' # $3 => single-quoted DEFAULT value | ([^\s=>]*) # $4 => unquoted DEFAULT value ) )? \s* # ESCAPE attribute (?: escape \s*=\s* (?: ( (?:["']?0["']?)| (?:["']?1["']?)| (?:["']?html["']?) | (?:["']?url["']?) | (?:["']?js["']?) | (?:["']?none["']?) ) # $5 => ESCAPE on ) )* # allow multiple ESCAPEs \s* # DEFAULT attribute (?: default \s*=\s* (?: "([^">]*)" # $6 => double-quoted DEFAULT value " | '([^'>]*)' # $7 => single-quoted DEFAULT value | ([^\s=>]*) # $8 => unquoted DEFAULT value ) )? \s* # NAME attribute (?: (?: name \s*=\s*)? (?: "([^">]*)" # $9 => double-quoted NAME value " | '([^'>]*)' # $10 => single-quoted NAME value | ([^\s=>]*) # $11 => unquoted NAME value ) )? \s* # DEFAULT attribute (?: default \s*=\s* (?: "([^">]*)" # $12 => double-quoted DEFAULT value " | '([^'>]*)' # $13 => single-quoted DEFAULT value | ([^\s=>]*) # $14 => unquoted DEFAULT value ) )? \s* # ESCAPE attribute (?: escape \s*=\s* (?: ( (?:["']?0["']?)| (?:["']?1["']?)| (?:["']?html["']?) | (?:["']?url["']?) | (?:["']?js["']?) | (?:["']?none["']?) ) # $15 => ESCAPE on ) )* # allow multiple ESCAPEs \s* # DEFAULT attribute (?: default \s*=\s* (?: "([^">]*)" # $16 => double-quoted DEFAULT value " | '([^'>]*)' # $17 => single-quoted DEFAULT value | ([^\s=>]*) # $18 => unquoted DEFAULT value ) )? \s* (?:--)?\/?> (.*) # $19 => $post - text that comes after the tag $/isx ) { $which = uc($1); # which tag is it $escape = defined $5 ? $5 : defined $15 ? $15 : (defined $options->{default_escape} && $which eq 'TMPL_VAR') ? $options->{default_escape} : 0; # escape set? # what name for the tag? undef for a /tag at most, one of the # following three will be defined $name = defined $9 ? $9 : defined $10 ? $10 : defined $11 ? $11 : undef; # is there a default? $default = defined $2 ? $2 : defined $3 ? $3 : defined $4 ? $4 : defined $6 ? $6 : defined $7 ? $7 : defined $8 ? $8 : defined $12 ? $12 : defined $13 ? $13 : defined $14 ? $14 : defined $16 ? $16 : defined $17 ? $17 : defined $18 ? $18 : undef; my $post = $19; # what comes after on the line # allow mixed case in filenames, otherwise flatten $name = lc($name) unless (not defined $name or $which eq 'TMPL_INCLUDE' or $options->{case_sensitive}); # die if we need a name and didn't get one die "HTML::Template->new() : No NAME given to a $which tag at $fname : line $fcounter." if ($need_names{$which} and (not defined $name or not length $name)); # die if we got an escape but can't use one die "HTML::Template->new() : ESCAPE option invalid in a $which tag at $fname : line $fcounter." if ($escape and ($which ne 'TMPL_VAR')); # die if we got a default but can't use one die "HTML::Template->new() : DEFAULT option invalid in a $which tag at $fname : line $fcounter." if (defined $default and ($which ne 'TMPL_VAR')); # take actions depending on which tag found if ($which eq 'TMPL_VAR') { print STDERR "### HTML::Template Debug ### $fname : line $fcounter : parsed VAR $name\n" if $options->{debug}; # if we already have this var, then simply link to the existing # HTML::Template::VAR, else create a new one. my $var; if (exists $pmap{$name}) { $var = $pmap{$name}; if( $options->{die_on_bad_params} && ref($var) ne 'HTML::Template::VAR') { die "HTML::Template->new() : Already used param name $name as a TMPL_LOOP, found in a TMPL_VAR at $fname : line $fcounter."; } } else { $var = HTML::Template::VAR->new(); $pmap{$name} = $var; $top_pmap{$name} = HTML::Template::VAR->new() if $options->{global_vars} and not exists $top_pmap{$name}; } # if a DEFAULT was provided, push a DEFAULT object on the # stack before the variable. if (defined $default) { push(@pstack, HTML::Template::DEF->new($default)); } # if ESCAPE was set, push an ESCAPE op on the stack before # the variable. output will handle the actual work. # unless of course, they have set escape=0 or escape=none if ($escape) { if ($escape =~ /^["']?url["']?$/i) { push(@pstack, $URLESCAPE); } elsif ($escape =~ /^["']?js["']?$/i) { push(@pstack, $JSESCAPE); } elsif ($escape =~ /^["']?0["']?$/) { # do nothing if escape=0 } elsif ($escape =~ /^["']?none["']?$/i) { # do nothing if escape=none } else { push(@pstack, $ESCAPE); } } push(@pstack, $var); } elsif ($which eq 'TMPL_LOOP') { # we've got a loop start print STDERR "### HTML::Template Debug ### $fname : line $fcounter : LOOP $name start\n" if $options->{debug}; # if we already have this loop, then simply link to the existing # HTML::Template::LOOP, else create a new one. my $loop; if (exists $pmap{$name}) { $loop = $pmap{$name}; if( $options->{die_on_bad_params} && ref($loop) ne 'HTML::Template::LOOP') { die "HTML::Template->new() : Already used param name $name as a TMPL_VAR, TMPL_IF or TMPL_UNLESS, found in a TMPL_LOOP at $fname : line $fcounter!"; } } else { # store the results in a LOOP object - actually just a # thin wrapper around another HTML::Template object. $loop = HTML::Template::LOOP->new(); $pmap{$name} = $loop; } # get it on the loopstack, pstack of the enclosing block push(@pstack, $loop); push(@loopstack, [$loop, $#pstack]); # magic time - push on a fresh pmap and pstack, adjust the typeglobs. # this gives the loop a separate namespace (i.e. pmap and pstack). push(@pstacks, []); *pstack = $pstacks[$#pstacks]; push(@pmaps, {}); *pmap = $pmaps[$#pmaps]; push(@ifstacks, []); *ifstack = $ifstacks[$#ifstacks]; push(@ucstacks, []); *ucstack = $ucstacks[$#ucstacks]; # auto-vivify __FIRST__, __LAST__ and __INNER__ if # loop_context_vars is set. Otherwise, with # die_on_bad_params set output() will might cause errors # when it tries to set them. if ($options->{loop_context_vars}) { $pmap{__first__} = HTML::Template::VAR->new(); $pmap{__inner__} = HTML::Template::VAR->new(); $pmap{__outer__} = HTML::Template::VAR->new(); $pmap{__last__} = HTML::Template::VAR->new(); $pmap{__odd__} = HTML::Template::VAR->new(); $pmap{__even__} = HTML::Template::VAR->new(); $pmap{__counter__} = HTML::Template::VAR->new(); $pmap{__index__} = HTML::Template::VAR->new(); } } elsif ($which eq '/TMPL_LOOP') { $options->{debug} and print STDERR "### HTML::Template Debug ### $fname : line $fcounter : LOOP end\n"; my $loopdata = pop(@loopstack); die "HTML::Template->new() : found </TMPL_LOOP> with no matching <TMPL_LOOP> at $fname : line $fcounter!" unless defined $loopdata; my ($loop, $starts_at) = @$loopdata; # resolve pending conditionals foreach my $uc (@ucstack) { my $var = $uc->[HTML::Template::COND::VARIABLE]; if (exists($pmap{$var})) { $uc->[HTML::Template::COND::VARIABLE] = $pmap{$var}; } else { $pmap{$var} = HTML::Template::VAR->new(); $top_pmap{$var} = HTML::Template::VAR->new() if $options->{global_vars} and not exists $top_pmap{$var}; $uc->[HTML::Template::COND::VARIABLE] = $pmap{$var}; } if (ref($pmap{$var}) eq 'HTML::Template::VAR') { $uc->[HTML::Template::COND::VARIABLE_TYPE] = HTML::Template::COND::VARIABLE_TYPE_VAR; } else { $uc->[HTML::Template::COND::VARIABLE_TYPE] = HTML::Template::COND::VARIABLE_TYPE_LOOP; } } # get pmap and pstack for the loop, adjust the typeglobs to # the enclosing block. my $param_map = pop(@pmaps); *pmap = $pmaps[$#pmaps]; my $parse_stack = pop(@pstacks); *pstack = $pstacks[$#pstacks]; scalar(@ifstack) and die "HTML::Template->new() : Dangling <TMPL_IF> or <TMPL_UNLESS> in loop ending at $fname : line $fcounter."; pop(@ifstacks); *ifstack = $ifstacks[$#ifstacks]; pop(@ucstacks); *ucstack = $ucstacks[$#ucstacks]; # instantiate the sub-Template, feeding it parse_stack and # param_map. This means that only the enclosing template # does _parse() - sub-templates get their parse_stack and # param_map fed to them already filled in. $loop->[HTML::Template::LOOP::TEMPLATE_HASH]{$starts_at} = ref($self)->_new_from_loop( parse_stack => $parse_stack, param_map => $param_map, debug => $options->{debug}, die_on_bad_params => $options->{die_on_bad_params}, loop_context_vars => $options->{loop_context_vars}, case_sensitive => $options->{case_sensitive}, force_untaint => $options->{force_untaint}, parent_global_vars => ($options->{global_vars} || $options->{parent_global_vars} || 0) ); # if this loop has been used multiple times we need to merge the "param_map" between them # all so that die_on_bad_params doesn't complain if we try to use different vars in # each instance of the same loop if ($options->{die_on_bad_params}) { my $loops = $loop->[HTML::Template::LOOP::TEMPLATE_HASH]; my @loop_keys = sort { $a <=> $b } keys %$loops; if (@loop_keys > 1) { my $last_loop = pop(@loop_keys); foreach my $loop (@loop_keys) { # make sure all the params in the last loop are also in this loop foreach my $param (keys %{$loops->{$last_loop}->{param_map}}) { next if $loops->{$loop}->{param_map}->{$param}; $loops->{$loop}->{param_map}->{$param} = $loops->{$last_loop}->{param_map}->{$param}; } # make sure all the params in this loop are also in the last loop foreach my $param (keys %{$loops->{$loop}->{param_map}}) { next if $loops->{$last_loop}->{param_map}->{$param}; $loops->{$last_loop}->{param_map}->{$param} = $loops->{$loop}->{param_map}->{$param}; } } } } } elsif ($which eq 'TMPL_IF' or $which eq 'TMPL_UNLESS') { $options->{debug} and print STDERR "### HTML::Template Debug ### $fname : line $fcounter : $which $name start\n"; # if we already have this var, then simply link to the existing # HTML::Template::VAR/LOOP, else defer the mapping my $var; if (exists $pmap{$name}) { $var = $pmap{$name}; } else { $var = $name; } # connect the var to a conditional my $cond = HTML::Template::COND->new($var); if ($which eq 'TMPL_IF') { $cond->[HTML::Template::COND::WHICH] = HTML::Template::COND::WHICH_IF; $cond->[HTML::Template::COND::JUMP_IF_TRUE] = 0; } else { $cond->[HTML::Template::COND::WHICH] = HTML::Template::COND::WHICH_UNLESS; $cond->[HTML::Template::COND::JUMP_IF_TRUE] = 1; } # push unconnected conditionals onto the ucstack for # resolution later. Otherwise, save type information now. if ($var eq $name) { push(@ucstack, $cond); } else { if (ref($var) eq 'HTML::Template::VAR') { $cond->[HTML::Template::COND::VARIABLE_TYPE] = HTML::Template::COND::VARIABLE_TYPE_VAR; } else { $cond->[HTML::Template::COND::VARIABLE_TYPE] = HTML::Template::COND::VARIABLE_TYPE_LOOP; } } # push what we've got onto the stacks push(@pstack, $cond); push(@ifstack, $cond); } elsif ($which eq '/TMPL_IF' or $which eq '/TMPL_UNLESS') { $options->{debug} and print STDERR "### HTML::Template Debug ### $fname : line $fcounter : $which end\n"; my $cond = pop(@ifstack); die "HTML::Template->new() : found </${which}> with no matching <TMPL_IF> at $fname : line $fcounter." unless defined $cond; if ($which eq '/TMPL_IF') { die "HTML::Template->new() : found </TMPL_IF> incorrectly terminating a <TMPL_UNLESS> (use </TMPL_UNLESS>) at $fname : line $fcounter.\n" if ($cond->[HTML::Template::COND::WHICH] == HTML::Template::COND::WHICH_UNLESS); } else { die "HTML::Template->new() : found </TMPL_UNLESS> incorrectly terminating a <TMPL_IF> (use </TMPL_IF>) at $fname : line $fcounter.\n" if ($cond->[HTML::Template::COND::WHICH] == HTML::Template::COND::WHICH_IF); } # connect the matching to this "address" - place a NOOP to # hold the spot. This allows output() to treat an IF in the # assembler-esque "Conditional Jump" mode. push(@pstack, $NOOP); $cond->[HTML::Template::COND::JUMP_ADDRESS] = $#pstack; } elsif ($which eq 'TMPL_ELSE') { $options->{debug} and print STDERR "### HTML::Template Debug ### $fname : line $fcounter : ELSE\n"; my $cond = pop(@ifstack); die "HTML::Template->new() : found <TMPL_ELSE> with no matching <TMPL_IF> or <TMPL_UNLESS> at $fname : line $fcounter." unless defined $cond; die "HTML::Template->new() : found second <TMPL_ELSE> tag for <TMPL_IF> or <TMPL_UNLESS> at $fname : line $fcounter." if $cond->[HTML::Template::COND::IS_ELSE]; my $else = HTML::Template::COND->new($cond->[HTML::Template::COND::VARIABLE]); $else->[HTML::Template::COND::WHICH] = $cond->[HTML::Template::COND::WHICH]; $else->[HTML::Template::COND::UNCONDITIONAL_JUMP] = 1; $else->[HTML::Template::COND::IS_ELSE] = 1; # need end-block resolution? if (defined($cond->[HTML::Template::COND::VARIABLE_TYPE])) { $else->[HTML::Template::COND::VARIABLE_TYPE] = $cond->[HTML::Template::COND::VARIABLE_TYPE]; } else { push(@ucstack, $else); } push(@pstack, $else); push(@ifstack, $else); # connect the matching to this "address" - thus the if, # failing jumps to the ELSE address. The else then gets # elaborated, and of course succeeds. On the other hand, if # the IF fails and falls though, output will reach the else # and jump to the /if address. $cond->[HTML::Template::COND::JUMP_ADDRESS] = $#pstack; } elsif ($which eq 'TMPL_INCLUDE') { # handle TMPL_INCLUDEs $options->{debug} and print STDERR "### HTML::Template Debug ### $fname : line $fcounter : INCLUDE $name \n"; # no includes here, bub $options->{no_includes} and croak("HTML::Template : Illegal attempt to use TMPL_INCLUDE in template file : (no_includes => 1)"); my $filename = $name; # look for the included file... my $filepath; if ($options->{search_path_on_include}) { $filepath = $self->_find_file($filename); } else { $filepath = $self->_find_file($filename, [File::Spec->splitdir($fstack[-1][0])]); } die "HTML::Template->new() : Cannot open included file $filename : file not found." if !defined $filepath && $options->{die_on_missing_include}; my $included_template = ""; if( $filepath ) { # use the open_mode if we have one if (my $mode = $options->{open_mode}) { open(TEMPLATE, $mode, $filepath) || confess("HTML::Template->new() : Cannot open included file $filepath with mode $mode: $!"); } else { open(TEMPLATE, $filepath) or confess("HTML::Template->new() : Cannot open included file $filepath : $!"); } # read into the array while (read(TEMPLATE, $included_template, 10240, length($included_template))) { } close(TEMPLATE); } # call filters if necessary $self->_call_filters(\$included_template) if @{$options->{filter}}; if ($included_template) { # not empty # handle the old vanguard format - this needs to happen here # since we're not about to do a next CHUNKS. $options->{vanguard_compatibility_mode} and $included_template =~ s/%([-\w\/\.+]+)%/<TMPL_VAR NAME=$1>/g; # collect mtimes for included files if ($options->{cache} and !$options->{blind_cache}) { $self->{included_mtimes}{$filepath} = (stat($filepath))[9]; } # adjust the fstack to point to the included file info push(@fstack, [$filepath, 1, scalar @{[$included_template =~ m/(\n)/g]} + 1]); (*fname, *fcounter, *fmax) = \(@{$fstack[$#fstack]}); # make sure we aren't infinitely recursing die "HTML::Template->new() : likely recursive includes - parsed $options->{max_includes} files deep and giving up (set max_includes higher to allow deeper recursion)." if ($options->{max_includes} and (scalar(@fstack) > $options->{max_includes})); # stick the remains of this chunk onto the bottom of the # included text. $included_template .= $post; $post = undef; # move the new chunks into place. splice(@chunks, $chunk_number, 1, split(m/(?=<)/, $included_template)); # recalculate stopping point $last_chunk = $#chunks; # start in on the first line of the included text - nothing # else to do on this line. $chunk = $chunks[$chunk_number]; redo CHUNK; } } else { # zuh!? die "HTML::Template->new() : Unknown or unmatched TMPL construct at $fname : line $fcounter."; } # push the rest after the tag if (defined($post)) { if (ref($pstack[$#pstack]) eq 'SCALAR') { ${$pstack[$#pstack]} .= $post; } else { push(@pstack, \$post); } } } else { # just your ordinary markup # make sure we didn't reject something TMPL_* but badly formed if ($options->{strict}) { die "HTML::Template->new() : Syntax error in <TMPL_*> tag at $fname : $fcounter." if ($chunk =~ /<(?:!--\s*)?\/?tmpl_/i); } # push the rest and get next chunk if (defined($chunk)) { if (ref($pstack[$#pstack]) eq 'SCALAR') { ${$pstack[$#pstack]} .= $chunk; } else { push(@pstack, \$chunk); } } } # count newlines in chunk and advance line count $fcounter += scalar(@{[$chunk =~ m/(\n)/g]}); # if we just crossed the end of an included file # pop off the record and re-alias to the enclosing file's info pop(@fstack), (*fname, *fcounter, *fmax) = \(@{$fstack[$#fstack]}) if ($fcounter > $fmax); } # next CHUNK # make sure we don't have dangling IF or LOOP blocks scalar(@ifstack) and die "HTML::Template->new() : At least one <TMPL_IF> or <TMPL_UNLESS> not terminated at end of file!"; scalar(@loopstack) and die "HTML::Template->new() : At least one <TMPL_LOOP> not terminated at end of file!"; # resolve pending conditionals foreach my $uc (@ucstack) { my $var = $uc->[HTML::Template::COND::VARIABLE]; if (exists($pmap{$var})) { $uc->[HTML::Template::COND::VARIABLE] = $pmap{$var}; } else { $pmap{$var} = HTML::Template::VAR->new(); $top_pmap{$var} = HTML::Template::VAR->new() if $options->{global_vars} and not exists $top_pmap{$var}; $uc->[HTML::Template::COND::VARIABLE] = $pmap{$var}; } if (ref($pmap{$var}) eq 'HTML::Template::VAR') { $uc->[HTML::Template::COND::VARIABLE_TYPE] = HTML::Template::COND::VARIABLE_TYPE_VAR; } else { $uc->[HTML::Template::COND::VARIABLE_TYPE] = HTML::Template::COND::VARIABLE_TYPE_LOOP; } } # want a stack dump? if ($options->{stack_debug}) { require 'Data/Dumper.pm'; print STDERR "### HTML::Template _param Stack Dump ###\n\n", Data::Dumper::Dumper($self->{parse_stack}), "\n"; } # get rid of filters - they cause runtime errors if Storable tries # to store them. This can happen under global_vars. delete $options->{filter}; } # a recursive sub that associates each loop with the loops above # (treating the top-level as a loop) sub _globalize_vars { my $self = shift; # associate with the loop (and top-level templates) above in the tree. push(@{$self->{options}{associate}}, @_); # recurse down into the template tree, adding ourself to the end of # list. push(@_, $self); map { $_->_globalize_vars(@_) } map { values %{$_->[HTML::Template::LOOP::TEMPLATE_HASH]} } grep { ref($_) eq 'HTML::Template::LOOP' } @{$self->{parse_stack}}; } # method used to recursively un-hook associate sub _unglobalize_vars { my $self = shift; # disassociate $self->{options}{associate} = undef; # recurse down into the template tree disassociating map { $_->_unglobalize_vars() } map { values %{$_->[HTML::Template::LOOP::TEMPLATE_HASH]} } grep { ref($_) eq 'HTML::Template::LOOP' } @{$self->{parse_stack}}; } =head2 config A package method that is used to set/get the global default configuration options. For instance, if you want to set the C<utf8> flag to always be on for every template loaded by this process you would do: HTML::Template->config(utf8 => 1); Or if you wanted to check if the C<utf8> flag was on or not, you could do: my %config = HTML::Template->config; if( $config{utf8} ) { ... } Any configuration options that are valid for C<new()> are acceptable to be passed to this method. =cut sub config { my ($pkg, %options) = @_; foreach my $opt (keys %options) { if( $opt eq 'associate' || $opt eq 'filter' || $opt eq 'path' ) { push(@{$OPTIONS{$opt}}, $options{$opt}); } else { $OPTIONS{$opt} = $options{$opt}; } } return %OPTIONS; } =head2 param C<param()> can be called in a number of ways =over =item 1 - To return a list of parameters in the template : my @parameter_names = $self->param(); =item 2 - To return the value set to a param : my $value = $self->param('PARAM'); =item 3 - To set the value of a parameter : # For simple TMPL_VARs: $self->param(PARAM => 'value'); # with a subroutine reference that gets called to get the value # of the scalar. The sub will receive the template object as a # parameter. $self->param(PARAM => sub { return 'value' }); # And TMPL_LOOPs: $self->param(LOOP_PARAM => [{PARAM => VALUE_FOR_FIRST_PASS}, {PARAM => VALUE_FOR_SECOND_PASS}]); =item 4 - To set the value of a number of parameters : # For simple TMPL_VARs: $self->param( PARAM => 'value', PARAM2 => 'value' ); # And with some TMPL_LOOPs: $self->param( PARAM => 'value', PARAM2 => 'value', LOOP_PARAM => [{PARAM => VALUE_FOR_FIRST_PASS}, {PARAM => VALUE_FOR_SECOND_PASS}], ANOTHER_LOOP_PARAM => [{PARAM => VALUE_FOR_FIRST_PASS}, {PARAM => VALUE_FOR_SECOND_PASS}], ); =item 5 - To set the value of a number of parameters using a hash-ref : $self->param( { PARAM => 'value', PARAM2 => 'value', LOOP_PARAM => [{PARAM => VALUE_FOR_FIRST_PASS}, {PARAM => VALUE_FOR_SECOND_PASS}], ANOTHER_LOOP_PARAM => [{PARAM => VALUE_FOR_FIRST_PASS}, {PARAM => VALUE_FOR_SECOND_PASS}], } ); An error occurs if you try to set a value that is tainted if the C<force_untaint> option is set. =back =cut sub param { my $self = shift; my $options = $self->{options}; my $param_map = $self->{param_map}; # the no-parameter case - return list of parameters in the template. return keys(%$param_map) unless scalar(@_); my $first = shift; my $type = ref $first; # the one-parameter case - could be a parameter value request or a # hash-ref. if (!scalar(@_) and !length($type)) { my $param = $options->{case_sensitive} ? $first : lc $first; # check for parameter existence $options->{die_on_bad_params} and !exists($param_map->{$param}) and croak( "HTML::Template : Attempt to get nonexistent parameter '$param' - this parameter name doesn't match any declarations in the template file : (die_on_bad_params set => 1)" ); return undef unless (exists($param_map->{$param}) and defined($param_map->{$param})); return ${$param_map->{$param}} if (ref($param_map->{$param}) eq 'HTML::Template::VAR'); return $param_map->{$param}[HTML::Template::LOOP::PARAM_SET]; } if (!scalar(@_)) { croak("HTML::Template->param() : Single reference arg to param() must be a hash-ref! You gave me a $type.") unless $type eq 'HASH' or UNIVERSAL::isa($first, 'HASH'); push(@_, %$first); } else { unshift(@_, $first); } croak("HTML::Template->param() : You gave me an odd number of parameters to param()!") unless ((@_ % 2) == 0); # strangely, changing this to a "while(@_) { shift, shift }" type # loop causes perl 5.004_04 to die with some nonsense about a # read-only value. for (my $x = 0 ; $x <= $#_ ; $x += 2) { my $param = $options->{case_sensitive} ? $_[$x] : lc $_[$x]; my $value = $_[($x + 1)]; # check that this param exists in the template $options->{die_on_bad_params} and !exists($param_map->{$param}) and croak( "HTML::Template : Attempt to set nonexistent parameter '$param' - this parameter name doesn't match any declarations in the template file : (die_on_bad_params => 1)" ); # if we're not going to die from bad param names, we need to ignore # them... unless (exists($param_map->{$param})) { next if not $options->{parent_global_vars}; # ... unless global vars is on - in which case we can't be # sure we won't need it in a lower loop. if (ref($value) eq 'ARRAY') { $param_map->{$param} = HTML::Template::LOOP->new(); } else { $param_map->{$param} = HTML::Template::VAR->new(); } } # figure out what we've got, taking special care to allow for # objects that are compatible underneath. my $type = ref $value || ''; if ($type eq 'REF') { croak("HTML::Template::param() : attempt to set parameter '$param' with a reference to a reference!"); } elsif ($type && ($type eq 'ARRAY' || ($type !~ /^(CODE)|(HASH)|(SCALAR)$/ && $value->isa('ARRAY')))) { ref($param_map->{$param}) eq 'HTML::Template::LOOP' || croak( "HTML::Template::param() : attempt to set parameter '$param' with an array ref - parameter is not a TMPL_LOOP!"); $param_map->{$param}[HTML::Template::LOOP::PARAM_SET] = [@{$value}]; } elsif( $type eq 'CODE' ) { # code can be used for a var or a loop if( ref($param_map->{$param}) eq 'HTML::Template::LOOP' ) { $param_map->{$param}[HTML::Template::LOOP::PARAM_SET] = $value; } else { ${$param_map->{$param}} = $value; } } else { ref($param_map->{$param}) eq 'HTML::Template::VAR' || croak( "HTML::Template::param() : attempt to set parameter '$param' with a scalar - parameter is not a TMPL_VAR!"); ${$param_map->{$param}} = $value; } } } =head2 clear_params Sets all the parameters to undef. Useful internally, if nowhere else! =cut sub clear_params { my $self = shift; my $type; foreach my $name (keys %{$self->{param_map}}) { $type = ref($self->{param_map}{$name}); undef(${$self->{param_map}{$name}}) if ($type eq 'HTML::Template::VAR'); undef($self->{param_map}{$name}[HTML::Template::LOOP::PARAM_SET]) if ($type eq 'HTML::Template::LOOP'); } } # obsolete implementation of associate sub associateCGI { my $self = shift; my $cgi = shift; (ref($cgi) eq 'CGI') or croak("Warning! non-CGI object was passed to HTML::Template::associateCGI()!\n"); push(@{$self->{options}{associate}}, $cgi); return 1; } =head2 output C<output()> returns the final result of the template. In most situations you'll want to print this, like: print $template->output(); When output is called each occurrence of C<< <TMPL_VAR NAME=name> >> is replaced with the value assigned to "name" via C<param()>. If a named parameter is unset it is simply replaced with ''. C<< <TMPL_LOOP> >>s are evaluated once per parameter set, accumulating output on each pass. Calling C<output()> is guaranteed not to change the state of the HTML::Template object, in case you were wondering. This property is mostly important for the internal implementation of loops. You may optionally supply a filehandle to print to automatically as the template is generated. This may improve performance and lower memory consumption. Example: $template->output(print_to => *STDOUT); The return value is undefined when using the C<print_to> option. =cut use vars qw(%URLESCAPE_MAP); sub output { my $self = shift; my $options = $self->{options}; local $_; croak("HTML::Template->output() : You gave me an odd number of parameters to output()!") unless ((@_ % 2) == 0); my %args = @_; print STDERR "### HTML::Template Memory Debug ### START OUTPUT ", $self->{proc_mem}->size(), "\n" if $options->{memory_debug}; $options->{debug} and print STDERR "### HTML::Template Debug ### In output\n"; # want a stack dump? if ($options->{stack_debug}) { require 'Data/Dumper.pm'; print STDERR "### HTML::Template output Stack Dump ###\n\n", Data::Dumper::Dumper($self->{parse_stack}), "\n"; } # globalize vars - this happens here to localize the circular # references created by global_vars. $self->_globalize_vars() if ($options->{global_vars}); # support the associate magic, searching for undefined params and # attempting to fill them from the associated objects. if (scalar(@{$options->{associate}})) { # prepare case-mapping hashes to do case-insensitive matching # against associated objects. This allows CGI.pm to be # case-sensitive and still work with associate. my (%case_map, $lparam); foreach my $associated_object (@{$options->{associate}}) { # what a hack! This should really be optimized out for case_sensitive. if ($options->{case_sensitive}) { map { $case_map{$associated_object}{$_} = $_ } $associated_object->param(); } else { map { $case_map{$associated_object}{lc($_)} = $_ } $associated_object->param(); } } foreach my $param (keys %{$self->{param_map}}) { unless (defined($self->param($param))) { OBJ: foreach my $associated_object (reverse @{$options->{associate}}) { $self->param($param, scalar $associated_object->param($case_map{$associated_object}{$param})), last OBJ if (exists($case_map{$associated_object}{$param})); } } } } use vars qw($line @parse_stack); local (*line, *parse_stack); # walk the parse stack, accumulating output in $result *parse_stack = $self->{parse_stack}; my $result = ''; tie $result, 'HTML::Template::PRINTSCALAR', $args{print_to} if defined $args{print_to} && !eval { tied *{$args{print_to}} }; my $type; my $parse_stack_length = $#parse_stack; for (my $x = 0 ; $x <= $parse_stack_length ; $x++) { *line = \$parse_stack[$x]; $type = ref($line); if ($type eq 'SCALAR') { $result .= $$line; } elsif ($type eq 'HTML::Template::VAR' and ref($$line) eq 'CODE') { if (defined($$line)) { my $tmp_val = $$line->($self); croak("HTML::Template->output() : 'force_untaint' option but coderef returns tainted value") if $options->{force_untaint} && tainted($tmp_val); $result .= $tmp_val; # change the reference to point to the value now not the code reference $$line = $tmp_val if $options->{cache_lazy_vars} } } elsif ($type eq 'HTML::Template::VAR') { if (defined $$line) { if ($options->{force_untaint} && tainted($$line)) { croak("HTML::Template->output() : tainted value with 'force_untaint' option"); } $result .= $$line; } } elsif ($type eq 'HTML::Template::LOOP') { if (defined($line->[HTML::Template::LOOP::PARAM_SET])) { eval { $result .= $line->output($x, $options->{loop_context_vars}); }; croak("HTML::Template->output() : fatal error in loop output : $@") if $@; } } elsif ($type eq 'HTML::Template::COND') { if ($line->[HTML::Template::COND::UNCONDITIONAL_JUMP]) { $x = $line->[HTML::Template::COND::JUMP_ADDRESS]; } else { if ($line->[HTML::Template::COND::JUMP_IF_TRUE]) { if ($line->[HTML::Template::COND::VARIABLE_TYPE] == HTML::Template::COND::VARIABLE_TYPE_VAR) { if (defined ${$line->[HTML::Template::COND::VARIABLE]}) { if (ref(${$line->[HTML::Template::COND::VARIABLE]}) eq 'CODE') { my $tmp_val = ${$line->[HTML::Template::COND::VARIABLE]}->($self); $x = $line->[HTML::Template::COND::JUMP_ADDRESS] if $tmp_val; ${$line->[HTML::Template::COND::VARIABLE]} = $tmp_val if $options->{cache_lazy_vars}; } else { $x = $line->[HTML::Template::COND::JUMP_ADDRESS] if ${$line->[HTML::Template::COND::VARIABLE]}; } } } else { # if it's a code reference, execute it to get the values my $loop_values = $line->[HTML::Template::COND::VARIABLE][HTML::Template::LOOP::PARAM_SET]; if (defined $loop_values && ref $loop_values eq 'CODE') { $loop_values = $loop_values->($self); $line->[HTML::Template::COND::VARIABLE][HTML::Template::LOOP::PARAM_SET] = $loop_values if $options->{cache_lazy_loops}; } # if we have anything for the loop, jump to the next part if (defined $loop_values && @$loop_values) { $x = $line->[HTML::Template::COND::JUMP_ADDRESS]; } } } else { if ($line->[HTML::Template::COND::VARIABLE_TYPE] == HTML::Template::COND::VARIABLE_TYPE_VAR) { if (defined ${$line->[HTML::Template::COND::VARIABLE]}) { if (ref(${$line->[HTML::Template::COND::VARIABLE]}) eq 'CODE') { my $tmp_val = ${$line->[HTML::Template::COND::VARIABLE]}->($self); $x = $line->[HTML::Template::COND::JUMP_ADDRESS] unless $tmp_val; ${$line->[HTML::Template::COND::VARIABLE]} = $tmp_val if $options->{cache_lazy_vars}; } else { $x = $line->[HTML::Template::COND::JUMP_ADDRESS] unless ${$line->[HTML::Template::COND::VARIABLE]}; } } else { $x = $line->[HTML::Template::COND::JUMP_ADDRESS]; } } else { # if we don't have anything for the loop, jump to the next part my $loop_values = $line->[HTML::Template::COND::VARIABLE][HTML::Template::LOOP::PARAM_SET]; if(!defined $loop_values) { $x = $line->[HTML::Template::COND::JUMP_ADDRESS]; } else { # check to see if the loop is a code ref and if it is execute it to get the values if( ref $loop_values eq 'CODE' ) { $loop_values = $line->[HTML::Template::COND::VARIABLE][HTML::Template::LOOP::PARAM_SET]->($self); $line->[HTML::Template::COND::VARIABLE][HTML::Template::LOOP::PARAM_SET] = $loop_values if $options->{cache_lazy_loops}; } # if we don't have anything in the loop, jump to the next part if(!@$loop_values) { $x = $line->[HTML::Template::COND::JUMP_ADDRESS]; } } } } } } elsif ($type eq 'HTML::Template::NOOP') { next; } elsif ($type eq 'HTML::Template::DEF') { $_ = $x; # remember default place in stack # find next VAR, there might be an ESCAPE in the way *line = \$parse_stack[++$x]; *line = \$parse_stack[++$x] if ref $line eq 'HTML::Template::ESCAPE' or ref $line eq 'HTML::Template::JSESCAPE' or ref $line eq 'HTML::Template::URLESCAPE'; # either output the default or go back if (defined $$line) { $x = $_; } else { $result .= ${$parse_stack[$_]}; } next; } elsif ($type eq 'HTML::Template::ESCAPE') { *line = \$parse_stack[++$x]; if (defined($$line)) { my $tmp_val; if (ref($$line) eq 'CODE') { $tmp_val = $$line->($self); if ($options->{force_untaint} > 1 && tainted($_)) { croak("HTML::Template->output() : 'force_untaint' option but coderef returns tainted value"); } $$line = $tmp_val if $options->{cache_lazy_vars}; } else { $tmp_val = $$line; if ($options->{force_untaint} > 1 && tainted($_)) { croak("HTML::Template->output() : tainted value with 'force_untaint' option"); } } # straight from the CGI.pm bible. $tmp_val =~ s/&/&/g; $tmp_val =~ s/\"/"/g; $tmp_val =~ s/>/>/g; $tmp_val =~ s/</</g; $tmp_val =~ s/'/'/g; $result .= $tmp_val; } next; } elsif ($type eq 'HTML::Template::JSESCAPE') { $x++; *line = \$parse_stack[$x]; if (defined($$line)) { my $tmp_val; if (ref($$line) eq 'CODE') { $tmp_val = $$line->($self); if ($options->{force_untaint} > 1 && tainted($_)) { croak("HTML::Template->output() : 'force_untaint' option but coderef returns tainted value"); } $$line = $tmp_val if $options->{cache_lazy_vars}; } else { $tmp_val = $$line; if ($options->{force_untaint} > 1 && tainted($_)) { croak("HTML::Template->output() : tainted value with 'force_untaint' option"); } } $tmp_val =~ s/\\/\\\\/g; $tmp_val =~ s/'/\\'/g; $tmp_val =~ s/"/\\"/g; $tmp_val =~ s/[\n\x{2028}]/\\n/g; $tmp_val =~ s/\x{2029}/\\n\\n/g; $tmp_val =~ s/\r/\\r/g; $result .= $tmp_val; } } elsif ($type eq 'HTML::Template::URLESCAPE') { $x++; *line = \$parse_stack[$x]; if (defined($$line)) { my $tmp_val; if (ref($$line) eq 'CODE') { $tmp_val = $$line->($self); if ($options->{force_untaint} > 1 && tainted($_)) { croak("HTML::Template->output() : 'force_untaint' option but coderef returns tainted value"); } $$line = $tmp_val if $options->{cache_lazy_vars}; } else { $tmp_val = $$line; if ($options->{force_untaint} > 1 && tainted($_)) { croak("HTML::Template->output() : tainted value with 'force_untaint' option"); } } # Build a char->hex map if one isn't already available unless (exists($URLESCAPE_MAP{chr(1)})) { for (0 .. 255) { $URLESCAPE_MAP{chr($_)} = sprintf('%%%02X', $_); } } # do the translation (RFC 2396 ^uric) $tmp_val =~ s!([^a-zA-Z0-9_.\-])!$URLESCAPE_MAP{$1}!g; $result .= $tmp_val; } } else { confess("HTML::Template::output() : Unknown item in parse_stack : " . $type); } } # undo the globalization circular refs $self->_unglobalize_vars() if ($options->{global_vars}); print STDERR "### HTML::Template Memory Debug ### END OUTPUT ", $self->{proc_mem}->size(), "\n" if $options->{memory_debug}; return undef if defined $args{print_to}; return $result; } =head2 query This method allow you to get information about the template structure. It can be called in a number of ways. The simplest usage of query is simply to check whether a parameter name exists in the template, using the C<name> option: if ($template->query(name => 'foo')) { # do something if a variable of any type named FOO is in the template } This same usage returns the type of the parameter. The type is the same as the tag minus the leading 'TMPL_'. So, for example, a C<TMPL_VAR> parameter returns 'VAR' from C<query()>. if ($template->query(name => 'foo') eq 'VAR') { # do something if FOO exists and is a TMPL_VAR } Note that the variables associated with C<TMPL_IF>s and C<TMPL_UNLESS>s will be identified as 'VAR' unless they are also used in a C<TMPL_LOOP>, in which case they will return 'LOOP'. C<query()> also allows you to get a list of parameters inside a loop (and inside loops inside loops). Example loop: <TMPL_LOOP NAME="EXAMPLE_LOOP"> <TMPL_VAR NAME="BEE"> <TMPL_VAR NAME="BOP"> <TMPL_LOOP NAME="EXAMPLE_INNER_LOOP"> <TMPL_VAR NAME="INNER_BEE"> <TMPL_VAR NAME="INNER_BOP"> </TMPL_LOOP> </TMPL_LOOP> And some query calls: # returns 'LOOP' $type = $template->query(name => 'EXAMPLE_LOOP'); # returns ('bop', 'bee', 'example_inner_loop') @param_names = $template->query(loop => 'EXAMPLE_LOOP'); # both return 'VAR' $type = $template->query(name => ['EXAMPLE_LOOP', 'BEE']); $type = $template->query(name => ['EXAMPLE_LOOP', 'BOP']); # and this one returns 'LOOP' $type = $template->query(name => ['EXAMPLE_LOOP', 'EXAMPLE_INNER_LOOP']); # and finally, this returns ('inner_bee', 'inner_bop') @inner_param_names = $template->query(loop => ['EXAMPLE_LOOP', 'EXAMPLE_INNER_LOOP']); # for non existent parameter names you get undef this returns undef. $type = $template->query(name => 'DWEAZLE_ZAPPA'); # calling loop on a non-loop parameter name will cause an error. This dies: $type = $template->query(loop => 'DWEAZLE_ZAPPA'); As you can see above the C<loop> option returns a list of parameter names and both C<name> and C<loop> take array refs in order to refer to parameters inside loops. It is an error to use C<loop> with a parameter that is not a loop. Note that all the names are returned in lowercase and the types are uppercase. Just like C<param()>, C<query()> with no arguments returns all the parameter names in the template at the top level. =cut sub query { my $self = shift; $self->{options}{debug} and print STDERR "### HTML::Template Debug ### query(", join(', ', @_), ")\n"; # the no-parameter case - return $self->param() return $self->param() unless scalar(@_); croak("HTML::Template::query() : Odd number of parameters passed to query!") if (scalar(@_) % 2); croak("HTML::Template::query() : Wrong number of parameters passed to query - should be 2.") if (scalar(@_) != 2); my ($opt, $path) = (lc shift, shift); croak("HTML::Template::query() : invalid parameter ($opt)") unless ($opt eq 'name' or $opt eq 'loop'); # make path an array unless it already is $path = [$path] unless (ref $path); # find the param in question. my @objs = $self->_find_param(@$path); return undef unless scalar(@objs); my ($obj, $type); # do what the user asked with the object if ($opt eq 'name') { # we only look at the first one. new() should make sure they're # all the same. ($obj, $type) = (shift(@objs), shift(@objs)); return undef unless defined $obj; return 'VAR' if $type eq 'HTML::Template::VAR'; return 'LOOP' if $type eq 'HTML::Template::LOOP'; croak("HTML::Template::query() : unknown object ($type) in param_map!"); } elsif ($opt eq 'loop') { my %results; while (@objs) { ($obj, $type) = (shift(@objs), shift(@objs)); croak( "HTML::Template::query() : Search path [", join(', ', @$path), "] doesn't end in a TMPL_LOOP - it is an error to use the 'loop' option on a non-loop parameter. To avoid this problem you can use the 'name' option to query() to check the type first." ) unless ((defined $obj) and ($type eq 'HTML::Template::LOOP')); # SHAZAM! This bit extracts all the parameter names from all the # loop objects for this name. map { $results{$_} = 1 } map { keys(%{$_->{'param_map'}}) } values(%{$obj->[HTML::Template::LOOP::TEMPLATE_HASH]}); } # this is our loop list, return it. return keys(%results); } } # a function that returns the object(s) corresponding to a given path and # its (their) ref()(s). Used by query() in the obvious way. sub _find_param { my $self = shift; my $spot = $self->{options}{case_sensitive} ? shift : lc shift; # get the obj and type for this spot my $obj = $self->{'param_map'}{$spot}; return unless defined $obj; my $type = ref $obj; # return if we're here or if we're not but this isn't a loop return ($obj, $type) unless @_; return unless ($type eq 'HTML::Template::LOOP'); # recurse. this is a depth first search on the template tree, for # the algorithm geeks in the audience. return map { $_->_find_param(@_) } values(%{$obj->[HTML::Template::LOOP::TEMPLATE_HASH]}); } # HTML::Template::VAR, LOOP, etc are *light* objects - their internal # spec is used above. No encapsulation or information hiding is to be # assumed. package HTML::Template::VAR; sub new { my $value; return bless(\$value, $_[0]); } package HTML::Template::DEF; sub new { my $value = $_[1]; return bless(\$value, $_[0]); } package HTML::Template::LOOP; sub new { return bless([], $_[0]); } sub output { my $self = shift; my $index = shift; my $loop_context_vars = shift; my $template = $self->[TEMPLATE_HASH]{$index}; my $value_sets_array = $self->[PARAM_SET]; return unless defined($value_sets_array); my $result = ''; my $count = 0; my $odd = 0; # execute the code to get the values if it's a code reference if( ref $value_sets_array eq 'CODE' ) { $value_sets_array = $value_sets_array->($template); croak("HTML::Template->output: TMPL_LOOP code reference did not return an ARRAY reference!") unless ref $value_sets_array && ref $value_sets_array eq 'ARRAY'; $self->[PARAM_SET] = $value_sets_array if $template->{options}->{cache_lazy_loops}; } foreach my $value_set (@$value_sets_array) { if ($loop_context_vars) { if ($count == 0) { @{$value_set}{qw(__first__ __inner__ __outer__ __last__)} = (1, 0, 1, $#{$value_sets_array} == 0); } elsif ($count == $#{$value_sets_array}) { @{$value_set}{qw(__first__ __inner__ __outer__ __last__)} = (0, 0, 1, 1); } else { @{$value_set}{qw(__first__ __inner__ __outer__ __last__)} = (0, 1, 0, 0); } $odd = $value_set->{__odd__} = !$odd; $value_set->{__even__} = !$odd; $value_set->{__counter__} = $count + 1; $value_set->{__index__} = $count; } $template->param($value_set); $result .= $template->output; $template->clear_params; @{$value_set}{qw(__first__ __last__ __inner__ __outer__ __odd__ __even__ __counter__ __index__)} = (0, 0, 0, 0, 0, 0, 0) if ($loop_context_vars); $count++; } return $result; } package HTML::Template::COND; sub new { my $pkg = shift; my $var = shift; my $self = []; $self->[VARIABLE] = $var; bless($self, $pkg); return $self; } package HTML::Template::NOOP; sub new { my $unused; my $self = \$unused; bless($self, $_[0]); return $self; } package HTML::Template::ESCAPE; sub new { my $unused; my $self = \$unused; bless($self, $_[0]); return $self; } package HTML::Template::JSESCAPE; sub new { my $unused; my $self = \$unused; bless($self, $_[0]); return $self; } package HTML::Template::URLESCAPE; sub new { my $unused; my $self = \$unused; bless($self, $_[0]); return $self; } # scalar-tying package for output(print_to => *HANDLE) implementation package HTML::Template::PRINTSCALAR; use strict; sub TIESCALAR { bless \$_[1], $_[0]; } sub FETCH { } sub STORE { my $self = shift; local *FH = $$self; print FH @_; } 1; __END__ =head1 LAZY VALUES As mentioned above, both C<TMPL_VAR> and C<TMPL_LOOP> values can be code references. These code references are only executed if the variable or loop is used in the template. This is extremely useful if you want to make a variable available to template designers but it can be expensive to calculate, so you only want to do so if you have to. Maybe an example will help to illustrate. Let's say you have a template like this: <tmpl_if we_care> <tmpl_if life_universe_and_everything> </tmpl_if> If C<life_universe_and_everything> is expensive to calculate we can wrap it's calculation in a code reference and HTML::Template will only execute that code if C<we_care> is also true. $tmpl->param(life_universe_and_everything => sub { calculate_42() }); Your code reference will be given a single argument, the HTML::Template object in use. In the above example, if we wanted C<calculate_42()> to have this object we'd do something like this: $tmpl->param(life_universe_and_everything => sub { calculate_42(shift) }); This same approach can be used for C<TMPL_LOOP>s too: <tmpl_if we_care> <tmpl_loop needles_in_haystack> Found <tmpl_var __counter>! </tmpl_loop> </tmpl_if> And in your Perl code: $tmpl->param(needles_in_haystack => sub { find_needles() }); The only difference in the C<TMPL_LOOP> case is that the subroutine needs to return a reference to an ARRAY, not just a scalar value. =head2 Multiple Calls It's important to recognize that while this feature is designed to save processing time when things aren't needed, if you're not careful it can actually increase the number of times you perform your calculation. HTML::Template calls your code reference each time it seems your loop in the template, this includes the times that you might use the loop in a conditional (C<TMPL_IF> or C<TMPL_UNLESS>). For instance: <tmpl_if we care> <tmpl_if needles_in_haystack> <tmpl_loop needles_in_haystack> Found <tmpl_var __counter>! </tmpl_loop> <tmpl_else> No needles found! </tmpl_if> </tmpl_if> This will actually call C<find_needles()> twice which will be even worse than you had before. One way to work around this is to cache the return value yourself: my $needles; $tmpl->param(needles_in_haystack => sub { defined $needles ? $needles : $needles = find_needles() }); =head1 BUGS I am aware of no bugs - if you find one, join the mailing list and tell us about it. You can join the HTML::Template mailing-list by visiting: http://lists.sourceforge.net/lists/listinfo/html-template-users Of course, you can still email me directly (C<sam@tregar.com>) with bugs, but I reserve the right to forward bug reports to the mailing list. When submitting bug reports, be sure to include full details, including the VERSION of the module, a test script and a test template demonstrating the problem! If you're feeling really adventurous, HTML::Template has a publically available Git repository. See below for more information in the PUBLIC GIT REPOSITORY section. =head1 CREDITS This module was the brain child of my boss, Jesse Erlbaum (C<jesse@vm.com>) at Vanguard Media (http://vm.com) . The most original idea in this module - the C<< <TMPL_LOOP> >> - was entirely his. Fixes, Bug Reports, Optimizations and Ideas have been generously provided by: =over =item * Richard Chen =item * Mike Blazer =item * Adriano Nagelschmidt Rodrigues =item * Andrej Mikus =item * Ilya Obshadko =item * Kevin Puetz =item * Steve Reppucci =item * Richard Dice =item * Tom Hukins =item * Eric Zylberstejn =item * David Glasser =item * Peter Marelas =item * James William Carlson =item * Frank D. Cringle =item * Winfried Koenig =item * Matthew Wickline =item * Doug Steinwand =item * Drew Taylor =item * Tobias Brox =item * Michael Lloyd =item * Simran Gambhir =item * Chris Houser <chouser@bluweb.com> =item * Larry Moore =item * Todd Larason =item * Jody Biggs =item * T.J. Mather =item * Martin Schroth =item * Dave Wolfe =item * uchum =item * Kawai Takanori =item * Peter Guelich =item * Chris Nokleberg =item * Ralph Corderoy =item * William Ward =item * Ade Olonoh =item * Mark Stosberg =item * Lance Thomas =item * Roland Giersig =item * Jere Julian =item * Peter Leonard =item * Kenny Smith =item * Sean P. Scanlon =item * Martin Pfeffer =item * David Ferrance =item * Gyepi Sam =item * Darren Chamberlain =item * Paul Baker =item * Gabor Szabo =item * Craig Manley =item * Richard Fein =item * The Phalanx Project =item * Sven Neuhaus =item * Michael Peters =item * Jan Dubois =item * Moritz Lenz =back Thanks! =head1 WEBSITE You can find information about HTML::Template and other related modules at: http://html-template.sourceforge.net =head1 PUBLIC GIT REPOSITORY HTML::Template now has a publicly accessible Git repository provided by GitHub (github.com). You can access it by going to https://github.com/mpeters/html-template. Give it a try! =head1 AUTHOR Sam Tregar, C<sam@tregar.com> =head1 CO-MAINTAINER Michael Peters, C<mpeters@plusthree.com> =head1 LICENSE HTML::Template : A module for using HTML Templates with Perl Copyright (C) 2000-2011 Sam Tregar (sam@tregar.com) This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself, which means using either: a) the GNU General Public License as published by the Free Software Foundation; either version 1, or (at your option) any later version, or b) the "Artistic License" which comes with this module. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the GNU General Public License or the Artistic License for more details. You should have received a copy of the Artistic License with this module. If not, I'll be glad to provide one. You should have received a copy of the GNU General Public License along with this program. If not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA =cut FormatMarkdown.pm 0000644 00000015340 00000000000 0010001 0 ustar 00 package HTML::FormatMarkdown; # ABSTRACT: Format HTML as Markdown use 5.006_001; use strict; use warnings; use parent 'HTML::Formatter'; our $VERSION = '2.12'; # VERSION our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY sub default_values { ( shift->SUPER::default_values(), lm => 0, rm => 70, ); } sub configure { my ( $self, $hash ) = @_; my $lm = $self->{lm}; my $rm = $self->{rm}; $lm = delete $hash->{lm} if exists $hash->{lm}; $lm = delete $hash->{leftmargin} if exists $hash->{leftmargin}; $rm = delete $hash->{rm} if exists $hash->{rm}; $rm = delete $hash->{rightmargin} if exists $hash->{rightmargin}; my $width = $rm - $lm; if ( $width < 1 ) { warn "Bad margins, ignored" if $^W; return; } if ( $width < 20 ) { warn "Page probably too narrow" if $^W; } for ( keys %$hash ) { warn "Unknown configure option '$_'" if $^W; } $self->{lm} = $lm; $self->{rm} = $rm; $self; } sub begin { my $self = shift; $self->SUPER::begin(); $self->{maxpos} = 0; $self->{curpos} = 0; # current output position. } sub end { shift->collect("\n"); } sub header_start { my ( $self, $level ) = @_; $self->vspace(1); $self->out( '#' x $level . ' ' ); 1; } sub header_end { my ( $self, $level ) = @_; $self->out( ' ' . '#' x $level ); $self->vspace(1); } sub bullet { my $self = shift; $self->SUPER::bullet( $_[0] . ' ' ); } sub hr_start { my $self = shift; $self->vspace(1); $self->out('- - -'); $self->vspace(1); } sub img_start { my ( $self, $node ) = @_; my $alt = $node->attr('alt'); my $src = $node->attr('src'); $self->out(""); } sub a_start { my ( $self, $node ) = @_; # ignore named anchors if ( $node->attr('name') ) { 1; } elsif ( $node->attr('href') =~ /^#/ ) { 1; } else { $self->out("["); } } sub a_end { my ( $self, $node ) = @_; if ( $node->attr('name') ) { return; } elsif ( my $href = $node->attr('href') ) { if ( $href =~ /^#/ ) { return; } $self->out("]($href)"); } } sub b_start { shift->out("**") } sub b_end { shift->out("**") } sub i_start { shift->out("*") } sub i_end { shift->out("*") } sub tt_start { my $self = shift; if ( $self->{pre} ) { return 1; } else { $self->out("`"); } } sub tt_end { my $self = shift; if ( $self->{pre} ) { return; } else { $self->out("`"); } } sub blockquote_start { my $self = shift; $self->{blockquote}++; $self->vspace(1); $self->adjust_rm(-4); 1; } sub blockquote_end { my $self = shift; $self->{blockquote}--; $self->vspace(1); $self->adjust_rm(+4); } sub blockquote_out { my ( $self, $text ) = @_; $self->nl; $self->goto_lm; my $line = "> "; $self->{curpos} += 2; foreach my $word ( split /\s/, $text ) { $line .= "$word "; if ( ( $self->{curpos} + length($line) ) > $self->{rm} ) { $self->collect($line); $self->nl; $self->goto_lm; $line = "> "; $self->{curpos} += 2; } } $self->collect($line); $self->nl; } # Quoted from HTML::FormatText sub pre_out { my $self = shift; if ( defined $self->{vspace} ) { if ( $self->{out} ) { $self->nl() while $self->{vspace}-- >= 0; $self->{vspace} = undef; } } my $indent = ' ' x $self->{lm}; $indent .= ' ' x 4; my $pre = shift; $pre =~ s/^/$indent/mg; $self->collect($pre); $self->{out}++; } sub out { my $self = shift; my $text = shift; $text =~ tr/\xA0\xAD/ /d; if ( $text =~ /^\s*$/ ) { $self->{hspace} = 1; return; } if ( defined $self->{vspace} ) { if ( $self->{out} ) { $self->nl while $self->{vspace}-- >= 0; } $self->goto_lm; $self->{vspace} = undef; $self->{hspace} = 0; } if ( $self->{hspace} ) { if ( $self->{curpos} + length($text) > $self->{rm} ) { # word will not fit on line; do a line break $self->nl; $self->goto_lm; } else { # word fits on line; use a space $self->collect(' '); ++$self->{curpos}; } $self->{hspace} = 0; } $self->collect($text); my $pos = $self->{curpos} += length $text; $self->{maxpos} = $pos if $self->{maxpos} < $pos; $self->{'out'}++; } sub goto_lm { my $self = shift; my $pos = $self->{curpos}; my $lm = $self->{lm}; if ( $pos < $lm ) { $self->{curpos} = $lm; $self->collect( " " x ( $lm - $pos ) ); } } sub nl { my $self = shift; $self->{'out'}++; $self->{curpos} = 0; $self->collect("\n"); } sub adjust_lm { my $self = shift; $self->{lm} += $_[0]; $self->goto_lm; } sub adjust_rm { shift->{rm} += $_[0]; } 1; __END__ =pod =for stopwords CPAN Markdown homepage =for test_synopsis 1; __END__ =head1 NAME HTML::FormatMarkdown - Format HTML as Markdown =head1 VERSION version 2.12 =head1 SYNOPSIS use HTML::FormatMarkdown; my $string = HTML::FormatMarkdown->format_file( 'test.html' ); open my $fh, ">", "test.md" or die "$!\n"; print $fh $string; close $fh; =head1 DESCRIPTION HTML::FormatMarkdown is a formatter that outputs Markdown. HTML::FormatMarkdown is built on L<HTML::Formatter> and documentation for that module applies to this - especially L<HTML::Formatter/new>, L<HTML::Formatter/format_file> and L<HTML::Formatter/format_string>. =head1 INSTALLATION See perlmodinstall for information and options on installing Perl modules. =head1 BUGS AND LIMITATIONS You can make new bug reports, and view existing ones, through the web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Format>. =head1 AVAILABILITY The project homepage is L<https://metacpan.org/release/HTML-Format>. The latest version of this module is available from the Comprehensive Perl Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN site near you, or see L<https://metacpan.org/module/HTML::Format/>. =head1 AUTHORS =over 4 =item * Nigel Metheringham <nigelm@cpan.org> =item * Sean M Burke <sburke@cpan.org> =item * Gisle Aas <gisle@ActiveState.com> =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2015 by Nigel Metheringham, 2002-2005 Sean M Burke, 1999-2002 Gisle Aas. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut FormatPS.pm 0000644 00000074631 00000000000 0006551 0 ustar 00 package HTML::FormatPS; # ABSTRACT: Format HTML as PostScript use 5.008; use strict; use warnings; use Carp; use Encode; use IO::File; use utf8; # for the is_utf8 function use base 'HTML::Formatter'; our $VERSION = '2.12'; # VERSION our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY # We now use Smart::Comments in place of the old DEBUG framework. # this should be commented out in release versions.... ##use Smart::Comments; # ------------------------------------------------------------------------ # A few routines that convert lengths into points sub mm { $_[0] * 72 / 25.4; } sub in { $_[0] * 72; } # ------------------------------------------------------------------------ my %PaperSizes = ( A3 => [ mm(297), mm(420) ], A4 => [ mm(210), mm(297) ], A5 => [ mm(148), mm(210) ], B4 => [ 729, 1032 ], B5 => [ 516, 729 ], Letter => [ in(8.5), in(11) ], Legal => [ in(8.5), in(14) ], Executive => [ in(7.5), in(10) ], Tabloid => [ in(11), in(17) ], Statement => [ in(5.5), in(8.5) ], Folio => [ in(8.5), in(13) ], "10x14" => [ in(10), in(14) ], Quarto => [ 610, 780 ], ); # ------------------------------------------------------------------------ my %FontFamilies = ( Courier => [ qw(Courier Courier-Bold Courier-Oblique Courier-BoldOblique) ], Helvetica => [ qw(Helvetica Helvetica-Bold Helvetica-Oblique Helvetica-BoldOblique) ], Times => [ qw(Times-Roman Times-Bold Times-Italic Times-BoldItalic) ], ); # ------------------------------------------------------------------------ # size 0 1 2 3 4 5 6 7 8 my @FontSizes = ( 5, 6, 8, 10, 12, 14, 18, 24, 32 ); sub BOLD { 0x01; } sub ITALIC { 0x02; } my %param = ( papersize => 'papersize', paperwidth => 'paperwidth', paperheight => 'paperheigth', leftmargin => 'lmW', rightmargin => 'rmW', horizontalmargin => 'mW', topmargin => 'tmH', bottommargin => 'bmH', verticalmargin => 'mH', no_prolog => 'no_prolog', no_trailer => 'no_trailer', pageno => 'printpageno', startpage => 'startpage', fontfamily => 'family', fontscale => 'fontscale', leading => 'leading', ); # ------------------------------------------------------------------------ sub new { my $class = shift; my $self = $class->SUPER::new(@_); # Obtained from the <title> element $self->{title} = ""; # The font ID last sent to the PostScript output (this may be # temporarily different from the "current font" as read from # the HTML input). Initially none. $self->{psfontid} = ""; # Pending horizontal space. A list [ " ", $fontid, $width ], # or undef if no space is pending. $self->{hspace} = undef; # add an encoder object for perl native to Latin1 output $self->{encoder} = find_encoding('iso-8859-1'); $self; } # ------------------------------------------------------------------------ sub default_values { ( shift->SUPER::default_values(), family => "Times", mH => mm(40), mW => mm(20), printpageno => 1, startpage => 1, # yes, you can start numbering at 10, or whatever. fontscale => 1, leading => 0.1, papersize => 'A4', paperwidth => mm(210), paperheight => mm(297), ); } # ------------------------------------------------------------------------ sub configure { my ( $self, $hash ) = @_; my ( $key, $val ); while ( ( $key, $val ) = each %$hash ) { $key = lc $key; croak "Illegal parameter ($key => $val)" unless exists $param{$key}; $key = $param{$key}; { $key eq "family" && do { $val = "\u\L$val"; croak "Unknown font family ($val)" unless exists $FontFamilies{$val}; $self->{family} = $val; last; }; $key eq "papersize" && do { $self->papersize($val) || croak sprintf "Unknown papersize '%s'.\nThe knowns are: %s.\nAborting", $val, join( ', ', sort keys %PaperSizes ); last; }; $self->{$key} = lc $val; } } } # ------------------------------------------------------------------------ sub papersize { my ( $self, $val ) = @_; $val = "\u\L$val"; my ( $width, $height ) = @{ $PaperSizes{$val} || return 0 }; return 0 unless defined $width; $self->{papersize} = $val; $self->{paperwidth} = $width; $self->{paperheight} = $height; 1; } # ------------------------------------------------------------------------ sub fontsize { my $self = shift; my $size = $self->{font_size}[-1]; $size = 8 if $size > 8; $size = 3 if $size < 0; $FontSizes[$size] * $self->{fontscale}; } # Determine the current font and set font-related members. # If $plain_with_size is given (a number), use a plain font # of that size. Otherwise, use the font specified by the # HTML context. Returns the "font ID" of the current font. # ------------------------------------------------------------------------ sub setfont { my ( $self, $plain_with_size ) = @_; my $index = 0; my $family = $self->{family} || 'Times'; my $size = $plain_with_size; unless ($plain_with_size) { $index |= BOLD if $self->{bold}; $index |= ITALIC if $self->{italic} || $self->{underline}; $family = 'Courier' if $self->{teletype}; $size = $self->fontsize; } my $font = $FontFamilies{$family}[$index]; my $font_with_size = "$font-$size"; if ( $self->{currentfont} eq $font_with_size ) { return $self->{currentfontid}; } $self->{currentfont} = $font_with_size; $self->{pointsize} = $size; my $fontmod = "Font::Metrics::$font"; $fontmod =~ s/-//g; my $fontfile = $fontmod . ".pm"; $fontfile =~ s,::,/,g; require $fontfile; { ## no critic no strict 'refs'; $self->{wx} = \@{"${fontmod}::wx"}; ## use critic } $font = $self->{fonts}{$font_with_size} || do { my $fontID = "F" . ++$self->{fno}; $self->{fonts}{$font_with_size} = $fontID; $fontID; }; $self->{currentfontid} = $font; return $font; } # ------------------------------------------------------------------------ # Construct PostScript code for setting the current font according # to $fontid, or an empty string if no font change is needed. # Assumes the return string will always be output as PostScript if # nonempty, so that our notion of the current PostScript font # stays in sync with that of the PostScript interpreter. # sub switchfont { my ( $self, $fontid ) = @_; if ( $self->{psfontid} eq $fontid ) { return ""; } else { $self->{psfontid} = $fontid; return "$fontid SF"; } } # ------------------------------------------------------------------------ # Like setfont + switchfont. sub findfont { my ( $self, $plain_with_size ) = @_; return $self->switchfont( $self->setfont($plain_with_size) ); } # ------------------------------------------------------------------------ sub width { my $self = shift; my $str = shift; my $w = 0; my $wx = $self->{wx}; my $sz = $self->{pointsize}; # need to encode to same encoding as font before getting width for ( unpack( "C*", $self->encode_string($str) ) ) { # if the character is outside the table, assume its m sized $w += ( ( $_ > $#{$wx} ) ? $wx->[ ord('m') ] : $wx->[$_] ) * $sz # unless $_ eq 0xAD; # optional hyphen } $w; } # ------------------------------------------------------------------------ sub begin { my $self = shift; $self->SUPER::begin; # Margins are in points $self->{lm} = $self->{lmW} || $self->{mW}; $self->{rm} = $self->{paperwidth} - ( $self->{rmW} || $self->{mW} ); $self->{tm} = $self->{paperheight} - ( $self->{tmH} || $self->{mH} ); $self->{bm} = $self->{bmH} || $self->{mH}; $self->{'orig_margins'} = # used only by the debug-mode print-area marker [ map { sprintf "%.1f", $_ } @{$self}{qw(lm bm rm tm)} ]; # Font setup $self->{fno} = 0; $self->{fonts} = {}; $self->{en} = 0.55 * $self->fontsize(3); # Initial position $self->{xpos} = $self->{lm}; # top of the current line $self->{ypos} = $self->{tm}; $self->{pageno} = 1; $self->{visible_page_number} = $self->{startpage}; $self->{line} = ""; $self->{showstring} = ""; $self->{currentfont} = ""; $self->{prev_currentfont} = ""; $self->{largest_pointsize} = 0; $self->newpage; } # ------------------------------------------------------------------------ sub end { my $self = shift; $self->showline; $self->endpage if $self->{'out'}; my $pages = $self->{pageno} - 1; my @prolog = (); push( @prolog, "%!PS-Adobe-3.0\n" ); #push(@prolog,"%%Title: No title\n"); # should look for the <title> element push( @prolog, "%%Creator: " . $self->version_tag . "\n" ); push( @prolog, "%%CreationDate: " . localtime() . "\n" ); push( @prolog, "%%Pages: $pages\n" ); push( @prolog, "%%PageOrder: Ascend\n" ); push( @prolog, "%%Orientation: Portrait\n" ); my ( $pw, $ph ) = map { int($_); } @{$self}{qw(paperwidth paperheight)}; push( @prolog, "%%DocumentMedia: Plain $pw $ph 0 white ()\n" ); push( @prolog, "%%DocumentNeededResources: \n" ); my %seenfont; for my $full ( sort keys %{ $self->{fonts} } ) { $full =~ s/-\d+$//; next if $seenfont{$full}++; push( @prolog, "%%+ font $full\n" ); } push( @prolog, "%%DocumentSuppliedResources: procset newencode 1.0 0\n" ); push( @prolog, "%%+ encoding ISOLatin1Encoding\n" ); push( @prolog, "%%EndComments\n" ); push( @prolog, <<'EOT'); %%BeginProlog /S/show load def /M/moveto load def /SF/setfont load def %%BeginResource: encoding ISOLatin1Encoding systemdict /ISOLatin1Encoding known not { /ISOLatin1Encoding [ /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /exclam /quotedbl /numbersign /dollar /percent /ampersand /quoteright /parenleft /parenright /asterisk /plus /comma /minus /period /slash /zero /one /two /three /four /five /six /seven /eight /nine /colon /semicolon /less /equal /greater /question /at /A /B /C /D /E /F /G /H /I /J /K /L /M /N /O /P /Q /R /S /T /U /V /W /X /Y /Z /bracketleft /backslash /bracketright /asciicircum /underscore /quoteleft /a /b /c /d /e /f /g /h /i /j /k /l /m /n /o /p /q /r /s /t /u /v /w /x /y /z /braceleft /bar /braceright /asciitilde /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /space /dotlessi /grave /acute /circumflex /tilde /macron /breve /dotaccent /dieresis /space /ring /cedilla /space /hungarumlaut /ogonek /caron /space /exclamdown /cent /sterling /currency /yen /brokenbar /section /dieresis /copyright /ordfeminine /guillemotleft /logicalnot /hyphen /registered /macron /degree /plusminus /twosuperior /threesuperior /acute /mu /paragraph /periodcentered /cedillar /onesuperior /ordmasculine /guillemotright /onequarter /onehalf /threequarters /questiondown /Agrave /Aacute /Acircumflex /Atilde /Adieresis /Aring /AE /Ccedilla /Egrave /Eacute /Ecircumflex /Edieresis /Igrave /Iacute /Icircumflex /Idieresis /Eth /Ntilde /Ograve /Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash /Ugrave /Uacute /Ucircumflex /Udieresis /Yacute /Thorn /germandbls /agrave /aacute /acircumflex /atilde /adieresis /aring /ae /ccedilla /egrave /eacute /ecircumflex /edieresis /igrave /iacute /icircumflex /idieresis /eth /ntilde /ograve /oacute /ocircumflex /otilde /odieresis /divide /oslash /ugrave /uacute /ucircumflex /udieresis /yacute /thorn /ydieresis ] def } if %%EndResource %%BeginResource: procset newencode 1.0 0 /NE { %def findfont begin currentdict dup length dict begin { %forall 1 index/FID ne {def} {pop pop} ifelse } forall /FontName exch def /Encoding exch def currentdict dup end end /FontName get exch definefont pop } bind def %%EndResource %%EndProlog EOT push( @prolog, "\n%%BeginSetup\n" ); for my $full ( sort keys %{ $self->{fonts} } ) { my $short = $self->{fonts}{$full}; $full =~ s/-(\d+)$//; my $size = $1; push( @prolog, "ISOLatin1Encoding/$full-ISO/$full NE\n" ); push( @prolog, "/$short/$full-ISO findfont $size scalefont def\n" ); } push( @prolog, "%%EndSetup\n" ); $self->collect("\n%%Trailer\n%%EOF\n") unless $self->{'no_trailer'}; unshift( @{ $self->{output} }, @prolog ) unless $self->{'no_prolog'}; } # ------------------------------------------------------------------------ sub header_start { my ( $self, $level ) = @_; # If we are close enough to be bottom of the page, start a new page # instead of this: ### Heading of level: $level $self->vspace( 1 + ( 6 - $level ) * 0.4 ); $self->{bold}++; push( @{ $self->{font_size} }, 8 - $level ); 1; } # ------------------------------------------------------------------------ sub header_end { my ($self) = @_; $self->vspace(1); $self->{bold}--; pop( @{ $self->{font_size} } ); 1; } # ------------------------------------------------------------------------ sub hr_start { my $self = shift; ### Making an HR... $self->showline; $self->vspace(0.5); $self->skip_vspace; my $lm = $self->{lm}; my $rm = $self->{rm}; my $y = $self->{ypos}; $self->collect( sprintf "newpath %.1f %.1f M %.1f %.1f lineto stroke\n", $lm, $y, $rm, $y ); $self->vspace(0.5); } # ------------------------------------------------------------------------ sub skip_vspace { my $self = shift; ### Skipping some amount of vspace... if ( defined $self->{vspace} ) { $self->showline; if ( $self->{'out'} ) { $self->{ypos} -= $self->{vspace} * 10 * $self->{fontscale}; if ( $self->{ypos} < $self->{bm} ) { ### vspace skip forced new page... $self->newpage; } else { ### Skipped vspace making y: $self->{'ypos'}, } } else { ### Not skipping vspace as out is false: $self->{ypos} } $self->{xpos} = $self->{lm}; $self->{vspace} = undef; $self->{hspace} = undef; } else { ### No vspace to skip... } return; } # ------------------------------------------------------------------------ sub show { my $self = shift; my $str = $self->{showstring}; $str =~ tr/\x01//d; return unless length $str; # must escape parentheses and backslash $str =~ s/([\(\)\\])/\\$1/g; # encode output to latin1 when pushing it out $self->{line} .= "(" . $self->encode_string($str) . ")S\n"; $self->{showstring} = ""; } # ------------------------------------------------------------------------ sub showline { my $self = shift; $self->show; my $line = $self->{line}; unless ( length $line ) { ### Showline is a no-op because line buffer is empty... return; } ### Showline emitting: $line $self->{ypos} -= $self->{largest_pointsize} || $self->{pointsize}; if ( $self->{ypos} < $self->{bm} ) { ### Showline forcing new page... $self->newpage; # newpage might alter currentfont! ### Showline sets vspace: $self->{vspace} || 0 $self->{ypos} -= $self->{pointsize}; #### Showline/Newpage x: $self->{xpos} #### Showline/Newpage y: $self->{ypos} # must set current font again my $font = $self->{prev_currentfont}; if ($font) { $self->collect("$self->{fonts}{$font} SF\n\n"); } ### End of doing newpage... } my $lm = $self->{lm}; my $x = $lm; if ( $self->{center} ) { # Unfortunately, the center attribute is gone when we get here, # so this code is never activated my $linewidth = $self->{xpos} - $lm; $x += ( $self->{rm} - $lm - $linewidth ) / 2; } $self->collect( sprintf "%.1f %.1f M\n", $x, $self->{ypos} ); # moveto $line =~ s/\s\)S$/)S/; # many lines will end uselessly with space $self->collect($line); $self->{'out'}++; if ( $self->{bullet} ) { # Putting this behind the first line of the list item # makes it more likely that we get the right font. We should # really set the font that we want to use. my $bullet = $self->{bullet}; if ( $bullet eq '*' ) { # There is no character that is really suitable. Let's make # a medium-sized filled circle ourself. my $radius = $self->{pointsize} / 8; ### Adding code for a '*' bullet for that line... $self->collect( sprintf "newpath %.1f %.1f %.1f 0 360 arc fill\n", $self->{bullet_pos} + $radius, $self->{ypos} + $radius * 2, $radius, ); } else { ### Adding code for other bullet for that line... $self->collect( sprintf "%.1f (%s) stringwidth pop sub %.1f add %.1f M\n", # moveto $self->{bullet_pos}, $bullet, $self->{pointsize} * 0.62, $self->{ypos}, ); $self->collect("($bullet)S\n"); } $self->{bullet} = ''; } $self->{prev_currentfont} = $self->{currentfont}; $self->{largest_pointsize} = 0; $self->{line} = ""; $self->{xpos} = $lm; # Additional linespacing $self->{ypos} -= $self->{leading} * $self->{pointsize}; #### Showline/end x: $self->{xpos} #### Showline/end y: $self->{ypos} return; } # ------------------------------------------------------------------------ sub endpage { my $self = shift; ### End page: $self->{pageno} # End previous page $self->collect("showpage\n"); $self->{visible_page_number}++; $self->{pageno}++; } # ------------------------------------------------------------------------ sub newpage { my $self = shift; local $self->{'pointsize'} = $self->{'pointsize'}; # That's needed for protecting against one bit of the # potential side-effects from page-numbering code if ( $self->{'out'} ) { # whether we've sent anything to the current page so far. ### Newpage calls endpage... $self->endpage; $self->collect( sprintf "%% %s has sent %s write-events to the above page.\n", ref($self), $self->{'out'}, ); } $self->{'out'} = 0; my $pageno = $self->{pageno}; my $visible_page_number = $self->{visible_page_number}; $self->collect("\n%%Page: $pageno $pageno\n"); ### Starting page: $pageno # Print page number if ( $self->{printpageno} ) { ### Printing page number: $visible_page_number $self->collect("%% Title and pageno\n"); my $f = $self->findfont(8); $self->collect("$f\n") if $f; my $x = $self->{paperwidth}; if ($x) { $x -= 30; } else { $x = 30; } $self->collect( sprintf "%.1f 30.0 M($visible_page_number)S\n", $x ); $x = $self->{lm}; $self->{title} =~ tr/\x01//d; $self->collect( sprintf "%.1f 30.0 M($self->{title})S\n", $x ); } else { ### Pointedly not printing page number... } $self->collect("\n"); $self->{xpos} = $self->{lm}; $self->{ypos} = $self->{tm}; #### Newpage/end x: $self->{xpos} #### Newpage/end y: $self->{ypos} } # ------------------------------------------------------------------------ sub encode_string { # converts string into latin1 charset my ( $self, $str ) = @_; # the string from the parser is normally unicode, and may contain # some punctuation characters in the 'General Punctuation' block # which can be expressed in latin1, but Encode module fails on them # so we will manually hack these... # Theres no usable latin1 for the double quote chars so map to " if ( utf8::is_utf8($str) ) { $str =~ tr/\x{2018}\x{2019}\x{201A}\x{201C}\x{201D}\x{201F}\x{2033}\x{2036}/`',"""""/; } return $self->{encoder}->encode($str); } # ------------------------------------------------------------------------ sub out { # Output a word my ( $self, $text ) = @_; $text =~ tr/\xA0\xAD/ /d; ### Trapping new word: $text if ( $self->{collectingTheTitle} ) { # Both collect and print the title $text =~ s/([\(\)\\])/\\$1/g; # Escape parens and the backslash $self->{title} .= $text; return; } my $fontid = $self->setfont(); my $w = $self->width($text); if ( $text =~ /^\s*$/ ) { $self->{hspace} = [ " ", $fontid, $w ]; return; } $self->skip_vspace; # determine spacing / line breaks needed before text if ( $self->{hspace} ) { my ( $stext, $sfont, $swidth ) = @{ $self->{hspace} }; if ( $self->{xpos} + $swidth + $w > $self->{rm} ) { # line break $self->showline; } else { # no line break; output a space $self->show_with_font( $stext, $sfont, $swidth ); } $self->{hspace} = undef; } # output the text $self->show_with_font( $text, $fontid, $w ); } # ------------------------------------------------------------------------ sub show_with_font { my ( $self, $text, $fontid, $w ) = @_; my $fontps = $self->switchfont($fontid); if ( length $fontps ) { $self->show; $self->{line} .= "$fontps\n"; } $self->{xpos} += $w; $self->{showstring} .= $text; #### Append to string buffer: $text #### with font: $fontid #### with xpos: $self->{xpos} $self->{largest_pointsize} = $self->{pointsize} if $self->{largest_pointsize} < $self->{pointsize}; $self->{'out'}++; } # ------------------------------------------------------------------------ sub pre_out { my ( $self, $text ) = @_; $self->skip_vspace; $self->tt_start; my $font = $self->findfont(); if ( length $font ) { $self->show; $self->{line} .= "$font\n"; } while ( $text =~ s/(.*)\n// ) { $self->{'out'}++; $self->{showstring} .= $1; $self->showline; } $self->{showstring} .= $text; $self->tt_end; 1; } # ------------------------------------------------------------------------ sub bullet { my ( $self, $bullet ) = @_; $self->{bullet} = $bullet; $self->{bullet_pos} = $self->{lm}; } # ------------------------------------------------------------------------ sub adjust_lm { my $self = shift; $self->showline; $self->{lm} += $_[0] * $self->{en}; 1; } # ------------------------------------------------------------------------ sub adjust_rm { my $self = shift; $self->showline; $self->{rm} += $_[0] * $self->{en}; } # ------------------------------------------------------------------------ sub head_start { 1; } sub head_end { 1; } sub title_start { my ($self) = @_; $self->{collectingTheTitle} = 1; 1; } sub title_end { my ($self) = @_; $self->{collectingTheTitle} = 0; 1; } # ------------------------------------------------------------------------ my ( $counter, $last_state_filename ); # For use in circumstances of total desperation: sub dump_state { my $self = shift; require Data::Dumper; ++$counter; my $filename = sprintf( "state%04d.txt", $counter ); my $state = IO::File->new( $filename, 'w' ) or die "Can't write-open $filename: $!"; $state->printf( "%s line %s\n", ( caller(1) )[ 3, 2 ] ); { local ( $self->{'wx'} ) = '<SUPPRESSED>'; local ( $self->{'output'} ) = '<SUPPRESSED>'; $state->print( Data::Dumper::Dumper($self) ); } $state->close; sleep 0; if ($last_state_filename) { system("perl -S diff.bat $last_state_filename $filename > $filename.diff"); } $last_state_filename = $filename; return 1; } # ------------------------------------------------------------------------ 1; __END__ =pod =for test_synopsis 1; __END__ =for stopwords bottommargin fontfamily fontscale helvetica horizontalmargin leftmargin noprolog notrailer pageno paperheight papersize paperwidth postscript rightmargin startpage topmargin verticalmargin ledding leeding prolog CPAN Quarto Tabloid Unicode homepage =head1 NAME HTML::FormatPS - Format HTML as PostScript =head1 VERSION version 2.12 =head1 SYNOPSIS use HTML::TreeBuilder; $tree = HTML::TreeBuilder->new->parse_file("test.html"); use HTML::FormatPS; $formatter = HTML::FormatPS->new( FontFamily => 'Helvetica', PaperSize => 'Letter', ); print $formatter->format($tree); Or, for short: use HTML::FormatPS; print HTML::FormatPS->format_file( "test.html", 'FontFamily' => 'Helvetica', 'PaperSize' => 'Letter', ); =head1 DESCRIPTION The HTML::FormatPS is a formatter that outputs PostScript code. Formatting of HTML tables and forms is not implemented. HTML::FormatPS is built on top of L<HTML::Formatter> and so further detail may be found in the documentation for L<HTML::Formatter>. You might specify the following parameters when constructing the formatter object (or when calling format_file or format_string): =over 4 =item PaperSize What kind of paper should we format for. The value can be one of these: A3, A4, A5, B4, B5, Letter, Legal, Executive, Tabloid, Statement, Folio, 10x14, Quarto. The default is "A4". =item PaperWidth The width of the paper, in points. Setting PaperSize also defines this value. =item PaperHeight The height of the paper, in points. Setting PaperSize also defines this value. =item LeftMargin The left margin, in points. =item RightMargin The right margin, in points. =item HorizontalMargin Both left and right margin at the same time. The default value is 4 cm. =item TopMargin The top margin, in points. =item BottomMargin The bottom margin, in points. =item VerticalMargin Both top and bottom margin at the same time. The default value is 2 cm, =item PageNo This parameter determines if we should put page numbers on the pages. The default value is true; so you have to set this value to 0 in order to suppress page numbers. (The "No" in "PageNo" means number/numero!) =item FontFamily This parameter specifies which family of fonts to use for the formatting. Legal values are "Courier", "Helvetica" and "Times". The default is "Times". =item FontScale This is a scaling factor for all the font sizes. The default value is 1. For example, if you want everything to be almost three times as large, you could set this to 2.7. If you wanted things just a bit smaller than normal, you could set it to .92. =item Leading This option (pronounced "ledding", not "leeding") controls how much is space between lines. This is a factor of the font size used for that line. Default is 0.1 -- so between two 12-point lines, there will be 1.2 points of space. =item StartPage Assuming you have PageNo on, StartPage controls what the page number of the first page will be. By default, it is 1. So if you set this to 87, the first page would say "87" on it, the next "88", and so on. =item NoProlog If this option is set to a true value, HTML::FormatPS will make a point of I<not> emitting the PostScript prolog before the document. By default, this is off, meaning that HTML::FormatPS I<will> emit the prolog. This option is of interest only to advanced users. =item NoTrailer If this option is set to a true value, HTML::FormatPS will make a point of I<not> emitting the PostScript trailer at the end of the document. By default, this is off, meaning that HTML::FormatPS I<will> emit the bit of PostScript that ends the document. This option is of interest only to advanced users. =back =head1 METHODS =head2 new my $formatter = FormatterClass->new( option1 => value1, option2 => value2, ... ); This creates a new formatter object with the given options. =head1 SEE ALSO L<HTML::Formatter> =head1 ISSUES =over =item * Output is in ISO Latin1 format. The underlying HTML parsers tend to now work in Unicode (perl native) code points. There is an impedance mismatch between these, which may give issues with complex characters within HTML. =back =head1 TO DO =over =item * Support for some more character styles, notably including: strike-through, underlining, superscript, and subscript. =item * Support for Unicode. =item * Support for Win-1252 encoding, since that's what most people mean when they use characters in the range 0x80-0x9F in HTML. =item * And, if it's ever even reasonably possible, support for tables. =back I would welcome email from people who can help me out or advise me on the above. =head1 INSTALLATION See perlmodinstall for information and options on installing Perl modules. =head1 BUGS AND LIMITATIONS You can make new bug reports, and view existing ones, through the web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Format>. =head1 AVAILABILITY The project homepage is L<https://metacpan.org/release/HTML-Format>. The latest version of this module is available from the Comprehensive Perl Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN site near you, or see L<https://metacpan.org/module/HTML::Format/>. =head1 AUTHORS =over 4 =item * Nigel Metheringham <nigelm@cpan.org> =item * Sean M Burke <sburke@cpan.org> =item * Gisle Aas <gisle@ActiveState.com> =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2015 by Nigel Metheringham, 2002-2005 Sean M Burke, 1999-2002 Gisle Aas. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut Tree/Scanning.pod 0000644 00000062247 00000000000 0007663 0 ustar 00 #Time-stamp: "2001-03-10 23:19:11 MST" -*-Text-*- # This document contains text in Perl "POD" format. # Use a POD viewer like perldoc or perlman to render it. =head1 NAME HTML::Tree::Scanning -- article: "Scanning HTML" =head1 SYNOPSIS # This an article, not a module. =head1 DESCRIPTION The following article by Sean M. Burke first appeared in I<The Perl Journal> #19 and is copyright 2000 The Perl Journal. It appears courtesy of Jon Orwant and The Perl Journal. This document may be distributed under the same terms as Perl itself. (Note that this is discussed in chapters 6 through 10 of the book I<Perl and LWP> L<http://lwp.interglacial.com/> which was written after the following documentation, and which is available free online.) =head1 Scanning HTML -- Sean M. Burke In I<The Perl Journal> issue 17, Ken MacFarlane's article "Parsing HTML with HTML::Parser" describes how the HTML::Parser module scans HTML source as a stream of start-tags, end-tags, text, comments, etc. In TPJ #18, my "Trees" article kicked around the idea of tree-shaped data structures. Now I'll try to tie it together, in a discussion of HTML trees. The CPAN module HTML::TreeBuilder takes the tags that HTML::Parser picks out, and builds a parse tree -- a tree-shaped network of objects... =over Footnote: And if you need a quick explanation of objects, see my TPJ17 article "A User's View of Object-Oriented Modules"; or go whole hog and get Damian Conway's excellent book I<Object-Oriented Perl>, from Manning Publications. =back ...representing the structured content of the HTML document. And once the document is parsed as a tree, you'll find the common tasks of extracting data from that HTML document/tree to be quite straightforward. =head2 HTML::Parser, HTML::TreeBuilder, and HTML::Element You use HTML::TreeBuilder to make a parse tree out of an HTML source file, by simply saying: use HTML::TreeBuilder; my $tree = HTML::TreeBuilder->new(); $tree->parse_file('foo.html'); and then C<$tree> contains a parse tree built from the HTML source from the file "foo.html". The way this parse tree is represented is with a network of objects -- C<$tree> is the root, an element with tag-name "html", and its children typically include a "head" and "body" element, and so on. Elements in the tree are objects of the class HTML::Element. So, if you take this source: <html><head><title>Doc 1</title></head> <body> Stuff <hr> 2000-08-17 </body></html> and feed it to HTML::TreeBuilder, it'll return a tree of objects that looks like this: html / \ head body / / | \ title "Stuff" hr "2000-08-17" | "Doc 1" This is a pretty simple document, but if it were any more complex, it'd be a bit hard to draw in that style, since it's sprawl left and right. The same tree can be represented a bit more easily sideways, with indenting: . html . head . title . "Doc 1" . body . "Stuff" . hr . "2000-08-17" Either way expresses the same structure. In that structure, the root node is an object of the class HTML::Element =over Footnote: Well actually, the root is of the class HTML::TreeBuilder, but that's just a subclass of HTML::Element, plus the few extra methods like C<parse_file> that elaborate the tree =back , with the tag name "html", and with two children: an HTML::Element object whose tag names are "head" and "body". And each of those elements have children, and so on down. Not all elements (as we'll call the objects of class HTML::Element) have children -- the "hr" element doesn't. And note all nodes in the tree are elements -- the text nodes ("Doc 1", "Stuff", and "2000-08-17") are just strings. Objects of the class HTML::Element each have three noteworthy attributes: =over =item C<_tag> -- (best accessed as C<$e-E<gt>tag>) this element's tag-name, lowercased (e.g., "em" for an "em" element). =over Footnote: Yes, this is misnamed. In proper SGML terminology, this is instead called a "GI", short for "generic identifier"; and the term "tag" is used for a token of SGML source that represents either the start of an element (a start-tag like "<em lang='fr'>") or the end of an element (an end-tag like "</em>". However, since more people claim to have been abducted by aliens than to have ever seen the SGML standard, and since both encounters typically involve a feeling of "missing time", it's not surprising that the terminology of the SGML standard is not closely followed. =back =item C<_parent> -- (best accessed as C<$e-E<gt>parent>) the element that is C<$obj>'s parent, or undef if this element is the root of its tree. =item C<_content> -- (best accessed as C<$e-E<gt>content_list>) the list of nodes (i.e., elements or text segments) that are C<$e>'s children. =back Moreover, if an element object has any attributes in the SGML sense of the word, then those are readable as C<$e-E<gt>attr('name')> -- for example, with the object built from having parsed "E<lt>a B<id='foo'>E<gt>barE<lt>/aE<gt>", C<$e-E<gt>attr('id')> will return the string "foo". Moreover, C<$e-E<gt>tag> on that object returns the string "a", C<$e-E<gt>content_list> returns a list consisting of just the single scalar "bar", and C<$e-E<gt>parent> returns the object that's this node's parent -- which may be, for example, a "p" element. And that's all that there is to it -- you throw HTML source at TreeBuilder, and it returns a tree built of HTML::Element objects and some text strings. However, what do you I<do> with a tree of objects? People code information into HTML trees not for the fun of arranging elements, but to represent the structure of specific text and images -- some text is in this "li" element, some other text is in that heading, some images are in that other table cell that has those attributes, and so on. Now, it may happen that you're rendering that whole HTML tree into some layout format. Or you could be trying to make some systematic change to the HTML tree before dumping it out as HTML source again. But, in my experience, by far the most common programming task that Perl programmers face with HTML is in trying to extract some piece of information from a larger document. Since that's so common (and also since it involves concepts that are basic to more complex tasks), that is what the rest of this article will be about. =head2 Scanning HTML trees Suppose you have a thousand HTML documents, each of them a press release. They all start out: [...lots of leading images and junk...] <h1>ConGlomCo to Open New Corporate Office in Ougadougou</h1> BAKERSFIELD, CA, 2000-04-24 -- ConGlomCo's vice president in charge of world conquest, Rock Feldspar, announced today the opening of a new office in Ougadougou, the capital city of Burkino Faso, gateway to the bustling "Silicon Sahara" of Africa... [...etc...] ...and what you've got to do is, for each document, copy whatever text is in the "h1" element, so that you can, for example, make a table of contents of it. Now, there are three ways to do this: =over =item * You can just use a regexp to scan the file for a text pattern. For many very simple tasks, this will do fine. Many HTML documents are, in practice, very consistently formatted as far as placement of linebreaks and whitespace, so you could just get away with scanning the file like so: sub get_heading { my $filename = $_[0]; local *HTML; open(HTML, $filename) or die "Couldn't open $filename); my $heading; Line: while(<HTML>) { if( m{<h1>(.*?)</h1>}i ) { # match it! $heading = $1; last Line; } } close(HTML); warn "No heading in $filename?" unless defined $heading; return $heading; } This is quick and fast, but awfully fragile -- if there's a newline in the middle of a heading's text, it won't match the above regexp, and you'll get an error. The regexp will also fail if the "h1" element's start-tag has any attributes. If you have to adapt your code to fit more kinds of start-tags, you'll end up basically reinventing part of HTML::Parser, at which point you should probably just stop, and use HTML::Parser itself: =item * You can use HTML::Parser to scan the file for an "h1" start-tag token, then capture all the text tokens until the "h1" close-tag. This approach is extensively covered in the Ken MacFarlane's TPJ17 article "Parsing HTML with HTML::Parser". (A variant of this approach is to use HTML::TokeParser, which presents a different and rather handier interface to the tokens that HTML::Parser picks out.) Using HTML::Parser is less fragile than our first approach, since it's not sensitive to the exact internal formatting of the start-tag (much less whether it's split across two lines). However, when you need more information about the context of the "h1" element, or if you're having to deal with any of the tricky bits of HTML, such as parsing of tables, you'll find out the flat list of tokens that HTML::Parser returns isn't immediately useful. To get something useful out of those tokens, you'll need to write code that knows some things about what elements take no content (as with "hr" elements), and that a "</p>" end-tags are omissible, so a "<p>" will end any currently open paragraph -- and you're well on your way to pointlessly reinventing much of the code in HTML::TreeBuilder =over Footnote: And, as the person who last rewrote that module, I can attest that it wasn't terribly easy to get right! Never underestimate the perversity of people coding HTML. =back , at which point you should probably just stop, and use HTML::TreeBuilder itself: =item * You can use HTML::Treebuilder, and scan the tree of element objects that you get back. =back The last approach, using HTML::TreeBuilder, is the diametric opposite of first approach: The first approach involves just elementary Perl and one regexp, whereas the TreeBuilder approach involves being at home with the concept of tree-shaped data structures and modules with object-oriented interfaces, as well as with the particular interfaces that HTML::TreeBuilder and HTML::Element provide. However, what the TreeBuilder approach has going for it is that it's the most robust, because it involves dealing with HTML in its "native" format -- it deals with the tree structure that HTML code represents, without any consideration of how the source is coded and with what tags omitted. So, to extract the text from the "h1" elements of an HTML document: sub get_heading { my $tree = HTML::TreeBuilder->new; $tree->parse_file($_[0]); # ! my $heading; my $h1 = $tree->look_down('_tag', 'h1'); # ! if($h1) { $heading = $h1->as_text; # ! } else { warn "No heading in $_[0]?"; } $tree->delete; # clear memory! return $heading; } This uses some unfamiliar methods that need explaining. The C<parse_file> method that we've seen before, builds a tree based on source from the file given. The C<delete> method is for marking a tree's contents as available for garbage collection, when you're done with the tree. The C<as_text> method returns a string that contains all the text bits that are children (or otherwise descendants) of the given node -- to get the text content of the C<$h1> object, we could just say: $heading = join '', $h1->content_list; but that will work only if we're sure that the "h1" element's children will be only text bits -- if the document contained: <h1>Local Man Sees <cite>Blade</cite> Again</h1> then the sub-tree would be: . h1 . "Local Man Sees " . cite . "Blade" . " Again' so C<join '', $h1-E<gt>content_list> will be something like: Local Man Sees HTML::Element=HASH(0x15424040) Again whereas C<$h1-E<gt>as_text> would yield: Local Man Sees Blade Again and depending on what you're doing with the heading text, you might want the C<as_HTML> method instead. It returns the (sub)tree represented as HTML source. C<$h1-E<gt>as_HTML> would yield: <h1>Local Man Sees <cite>Blade</cite> Again</h1> However, if you wanted the contents of C<$h1> as HTML, but not the C<$h1> itself, you could say: join '', map( ref($_) ? $_->as_HTML : $_, $h1->content_list ) This C<map> iterates over the nodes in C<$h1>'s list of children; and for each node that's just a text bit (as "Local Man Sees " is), it just passes through that string value, and for each node that's an actual object (causing C<ref> to be true), C<as_HTML> will used instead of the string value of the object itself (which would be something quite useless, as most object values are). So that C<as_HTML> for the "cite" element will be the string "<cite>BladeE<lt>/cite>". And then, finally, C<join> just puts into one string all the strings that the C<map> returns. Last but not least, the most important method in our C<get_heading> sub is the C<look_down> method. This method looks down at the subtree starting at the given object (C<$h1>), looking for elements that meet criteria you provide. The criteria are specified in the method's argument list. Each criterion can consist of two scalars, a key and a value, which express that you want elements that have that attribute (like "_tag", or "src") with the given value ("h1"); or the criterion can be a reference to a subroutine that, when called on the given element, returns true if that is a node you're looking for. If you specify several criteria, then that's taken to mean that you want all the elements that each satisfy I<all> the criteria. (In other words, there's an "implicit AND".) And finally, there's a bit of an optimization -- if you call the C<look_down> method in a scalar context, you get just the I<first> node (or undef if none) -- and, in fact, once C<look_down> finds that first matching element, it doesn't bother looking any further. So the example: $h1 = $tree->look_down('_tag', 'h1'); returns the first element at-or-under C<$tree> whose C<"_tag"> attribute has the value C<"h1">. =head2 Complex Criteria in Tree Scanning Now, the above C<look_down> code looks like a lot of bother, with barely more benefit than just grepping the file! But consider if your criteria were more complicated -- suppose you found that some of the press releases that you were scanning had several "h1" elements, possibly before or after the one you actually want. For example: <h1><center>Visit Our Corporate Partner <br><a href="/dyna/clickthru" ><img src="/dyna/vend_ad"></a> </center></h1> <h1><center>ConGlomCo President Schreck to Visit Regional HQ <br><a href="/photos/Schreck_visit_large.jpg" ><img src="/photos/Schreck_visit.jpg"></a> </center></h1> Here, you want to ignore the first "h1" element because it contains an ad, and you want the text from the second "h1". The problem is in formalizing the way you know that it's an ad. Since ad banners are always entreating you to "visit" the sponsoring site, you could exclude "h1" elements that contain the word "visit" under them: my $real_h1 = $tree->look_down( '_tag', 'h1', sub { $_[0]->as_text !~ m/\bvisit/i } ); The first criterion looks for "h1" elements, and the second criterion limits those to only the ones whose text content doesn't match C<m/\bvisit/>. But unfortunately, that won't work for our example, since the second "h1" mentions "ConGlomCo President Schreck to I<Visit> Regional HQ". Instead you could try looking for the first "h1" element that doesn't contain an image: my $real_h1 = $tree->look_down( '_tag', 'h1', sub { not $_[0]->look_down('_tag', 'img') } ); This criterion sub might seem a bit odd, since it calls C<look_down> as part of a larger C<look_down> operation, but that's fine. Note that when considered as a boolean value, a C<look_down> in a scalar context value returns false (specifically, undef) if there's no matching element at or under the given element; and it returns the first matching element (which, being a reference and object, is always a true value), if any matches. So, here, sub { not $_[0]->look_down('_tag', 'img') } means "return true only if this element has no 'img' element as descendants (and isn't an 'img' element itself)." This correctly filters out the first "h1" that contains the ad, but it also incorrectly filters out the second "h1" that contains a non-advertisement photo besides the headline text you want. There clearly are detectable differences between the first and second "h1" elements -- the only second one contains the string "Schreck", and we could just test for that: my $real_h1 = $tree->look_down( '_tag', 'h1', sub { $_[0]->as_text =~ m{Schreck} } ); And that works fine for this one example, but unless all thousand of your press releases have "Schreck" in the headline, that's just not a general solution. However, if all the ads-in-"h1"s that you want to exclude involve a link whose URL involves "/dyna/", then you can use that: my $real_h1 = $tree->look_down( '_tag', 'h1', sub { my $link = $_[0]->look_down('_tag','a'); return 1 unless $link; # no link means it's fine return 0 if $link->attr('href') =~ m{/dyna/}; # a link to there is bad return 1; # otherwise okay } ); Or you can look at it another way and say that you want the first "h1" element that either contains no images, or else whose image has a "src" attribute whose value contains "/photos/": my $real_h1 = $tree->look_down( '_tag', 'h1', sub { my $img = $_[0]->look_down('_tag','img'); return 1 unless $img; # no image means it's fine return 1 if $img->attr('src') =~ m{/photos/}; # good if a photo return 0; # otherwise bad } ); Recall that this use of C<look_down> in a scalar context means to return the first element at or under C<$tree> that matches all the criteria. But if you notice that you can formulate criteria that'll match several possible "h1" elements, some of which may be bogus but the I<last> one of which is always the one you want, then you can use C<look_down> in a list context, and just use the last element of that list: my @h1s = $tree->look_down( '_tag', 'h1', ...maybe more criteria... ); die "What, no h1s here?" unless @h1s; my $real_h1 = $h1s[-1]; # last or only =head2 A Case Study: Scanning Yahoo News's HTML The above (somewhat contrived) case involves extracting data from a bunch of pre-existing HTML files. In that sort of situation, if your code works for all the files, then you know that the code I<works> -- since the data it's meant to handle won't go changing or growing; and, typically, once you've used the program, you'll never need to use it again. The other kind of situation faced in many data extraction tasks is where the program is used recurringly to handle new data -- such as from ever-changing Web pages. As a real-world example of this, consider a program that you could use (suppose it's crontabbed) to extract headline-links from subsections of Yahoo News (C<http://dailynews.yahoo.com/>). Yahoo News has several subsections: =over =item http://dailynews.yahoo.com/h/tc/ for technology news =item http://dailynews.yahoo.com/h/sc/ for science news =item http://dailynews.yahoo.com/h/hl/ for health news =item http://dailynews.yahoo.com/h/wl/ for world news =item http://dailynews.yahoo.com/h/en/ for entertainment news =back and others. All of them are built on the same basic HTML template -- and a scarily complicated template it is, especially when you look at it with an eye toward making up rules that will select where the real headline-links are, while screening out all the links to other parts of Yahoo, other news services, etc. You will need to puzzle over the HTML source, and scrutinize the output of C<$tree-E<gt>dump> on the parse tree of that HTML. Sometimes the only way to pin down what you're after is by position in the tree. For example, headlines of interest may be in the third column of the second row of the second table element in a page: my $table = ( $tree->look_down('_tag','table') )[1]; my $row2 = ( $table->look_down('_tag', 'tr' ) )[1]; my $col3 = ( $row2->look-down('_tag', 'td') )[2]; ...then do things with $col3... Or they may be all the links in a "p" element that has at least three "br" elements as children: my $p = $tree->look_down( '_tag', 'p', sub { 2 < grep { ref($_) and $_->tag eq 'br' } $_[0]->content_list } ); @links = $p->look_down('_tag', 'a'); But almost always, you can get away with looking for properties of the of the thing itself, rather than just looking for contexts. Now, if you're lucky, the document you're looking through has clear semantic tagging, such is as useful in CSS -- note the class="headlinelink" bit here: <a href="...long_news_url..." class="headlinelink">Elvis seen in tortilla</a> If you find anything like that, you could leap right in and select links with: @links = $tree->look_down('class','headlinelink'); Regrettably, your chances of seeing any sort of semantic markup principles really being followed with actual HTML are pretty thin. =over Footnote: In fact, your chances of finding a page that is simply free of HTML errors are even thinner. And surprisingly, sites like Amazon or Yahoo are typically worse as far as quality of code than personal sites whose entire production cycle involves simply being saved and uploaded from Netscape Composer. =back The code may be sort of "accidentally semantic", however -- for example, in a set of pages I was scanning recently, I found that looking for "td" elements with a "width" attribute value of "375" got me exactly what I wanted. No-one designing that page ever conceived of "width=375" as I<meaning> "this is a headline", but if you impute it to mean that, it works. An approach like this happens to work for the Yahoo News code, because the headline-links are distinguished by the fact that they (and they alone) contain a "b" element: <a href="...long_news_url..."><b>Elvis seen in tortilla</b></a> or, diagrammed as a part of the parse tree: . a [href="...long_news_url..."] . b . "Elvis seen in tortilla" A rule that matches these can be formalized as "look for any 'a' element that has only one daughter node, which must be a 'b' element". And this is what it looks like when cooked up as a C<look_down> expression and prefaced with a bit of code that retrieves the text of the given Yahoo News page and feeds it to TreeBuilder: use strict; use HTML::TreeBuilder 2.97; use LWP::UserAgent; sub get_headlines { my $url = $_[0] || die "What URL?"; my $response = LWP::UserAgent->new->request( HTTP::Request->new( GET => $url ) ); unless($response->is_success) { warn "Couldn't get $url: ", $response->status_line, "\n"; return; } my $tree = HTML::TreeBuilder->new(); $tree->parse($response->content); $tree->eof; my @out; foreach my $link ( $tree->look_down( # ! '_tag', 'a', sub { return unless $_[0]->attr('href'); my @c = $_[0]->content_list; @c == 1 and ref $c[0] and $c[0]->tag eq 'b'; } ) ) { push @out, [ $link->attr('href'), $link->as_text ]; } warn "Odd, fewer than 6 stories in $url!" if @out < 6; $tree->delete; return @out; } ...and add a bit of code to actually call that routine and display the results... foreach my $section (qw[tc sc hl wl en]) { my @links = get_headlines( "http://dailynews.yahoo.com/h/$section/" ); print $section, ": ", scalar(@links), " stories\n", map((" ", $_->[0], " : ", $_->[1], "\n"), @links), "\n"; } And we've got our own headline-extractor service! This in and of itself isn't no amazingly useful (since if you want to see the headlines, you I<can> just look at the Yahoo News pages), but it could easily be the basis for quite useful features like filtering the headlines for matching certain keywords of interest to you. Now, one of these days, Yahoo News will decide to change its HTML template. When this happens, this will appear to the above program as there being no links that meet the given criteria; or, less likely, dozens of erroneous links will meet the criteria. In either case, the criteria will have to be changed for the new template; they may just need adjustment, or you may need to scrap them and start over. =head2 I<Regardez, duvet!> It's often quite a challenge to write criteria to match the desired parts of an HTML parse tree. Very often you I<can> pull it off with a simple C<$tree-E<gt>look_down('_tag', 'h1')>, but sometimes you do have to keep adding and refining criteria, until you might end up with complex filters like what I've shown in this article. The benefit to learning how to deal with HTML parse trees is that one main search tool, the C<look_down> method, can do most of the work, making simple things easy, while still making hard things possible. B<[end body of article]> =head2 [Author Credit] Sean M. Burke (C<sburke@cpan.org>) is the current maintainer of C<HTML::TreeBuilder> and C<HTML::Element>, both originally by Gisle Aas. Sean adds: "I'd like to thank the folks who listened to me ramble incessantly about HTML::TreeBuilder and HTML::Element at this year's Yet Another Perl Conference and O'Reilly Open Source Software Convention." =head1 BACK Return to the L<HTML::Tree|HTML::Tree> docs. =cut Tree/AboutTrees.pod 0000644 00000140675 00000000000 0010202 0 ustar 00 #Time-stamp: "2001-02-23 20:09:47 MST" -*-Text-*- # This document contains text in Perl "POD" format. # Use a POD viewer like perldoc or perlman to render it. =head1 NAME HTML::Tree::AboutTrees -- article on tree-shaped data structures in Perl =head1 SYNOPSIS # This an article, not a module. =head1 DESCRIPTION The following article by Sean M. Burke first appeared in I<The Perl Journal> #18 and is copyright 2000 The Perl Journal. It appears courtesy of Jon Orwant and The Perl Journal. This document may be distributed under the same terms as Perl itself. =head1 Trees -- Sean M. Burke =over "AaaAAAaauugh! Watch out for that tree!" -- I<George of the Jungle theme> =back Perl's facility with references, combined with its automatic management of memory allocation, makes it straightforward to write programs that store data in structures of arbitrary form and complexity. But I've noticed that many programmers, especially those who started out with more restrictive languages, seem at home with complex but uniform data structures -- N-dimensional arrays, or more struct-like things like hashes-of-arrays(-of-hashes(-of-hashes), etc.) -- but they're often uneasy with building more freeform, less tabular structures, like tree-shaped data structures. But trees are easy to build and manage in Perl, as I'll demonstrate by showing off how the HTML::Element class manages elements in an HTML document tree, and by walking you through a from-scratch implementation of game trees. But first we need to nail down what we mean by a "tree". =head2 Socratic Dialogues: "What is a Tree?" My first brush with tree-shaped structures was in linguistics classes, where tree diagrams are used to describe the syntax underlying natural language sentences. After learning my way around I<those> trees, I started to wonder -- are what I'm used to calling "trees" the same as what programmers call "trees"? So I asked lots of helpful and patient programmers how they would define a tree. Many replied with a answer in jargon that they could not really explain (understandable, since explaining things, especially defining things, is harder than people think): =over -- So what I<is> a "tree", a tree-shaped data structure? -- A tree is a special case of an acyclic directed graph! -- What's a "graph"? -- Um... lines... and... you draw it... with... arcs! nodes! um... =back The most helpful were folks who couldn't explain directly, but with whom I could get into a rather Socratic dialog (where I<I> asked the half-dim half-earnest questions), often with much doodling of illustrations... Question: so what's a tree? Answer: A tree is a collection of nodes that are linked together in a, well, tree-like way! Like this I<[drawing on a napkin]:> A / \ B C / | \ D E F Q: So what do these letters represent? A: Each is a different node, a bunch of data. Maybe C is a bunch of data that stores a number, maybe a hash table, maybe nothing at all besides the fact that it links to D, E, and F (which are other nodes). Q: So what're the lines between the nodes? A: Links. Also called "arcs". They just symbolize the fact that each node holds a list of nodes it links to. Q: So what if I draw nodes and links, like this... B -- E / \ / \ A C \ / E Is that still a tree? A: No, not at all. There's a lot of un-treelike things about that. First off, E has a link coming off of it going into nowhere. You can't have a link to nothing -- you can only link to another node. Second off, I don't know what that sideways link between B and E means... Q: Okay, let's work our way up from something simpler. Is this a tree...? A A: Yes, I suppose. It's a tree of just one node. Q: And how about... A B A: No, you can't just have nodes floating there, unattached. Q: Okay, I'll link A and B. How's this? A | B A: Yup, that's a tree. There's a node A, and a node B, and they're linked. Q: How is that tree any different from this one...? B | A A: Well, in both cases A and B are linked. But it's in a different direction. Q: Direction? What does the direction mean? A: Well, it depends what the tree represents. If it represents a categorization, like this: citrus / | \ orange lemon kumquat ... then you mean to say that oranges, lemons, kumquats, etc., are a kind of citrus. But if you drew it upside down, you'd be saying, falsely, that citrus is a kind of kumquat, a kind of lemon, and a kind of orange. If the tree represented cause-and-effect (or at least what situations could follow others), or represented what's a part of what, you wouldn't want to get those backwards, either. So with the nodes you draw together on paper, one has to be over the other, so you can tell which way the relationship in the tree works. Q: So are these two trees the same? A A / \ / \ B C B \ C A: Yes, although by convention we often try to line up things in the same generation, like it is in the diagram on the left. Q: "generation"? This is a family tree? A: No, not unless it's a family tree for just yeast cells or something else that reproduces asexually. But for sake of having lots of terms to use, we just pretend that links in the tree represent the "is a child of" relationship, instead of "is a kind of" or "is a part of", or "could result from", or whatever the real relationship is. So we get to borrow a lot of kinship words for describing trees -- B and C are "children" (or "daughters") of A; A is the "parent" (or "mother") of B and C. Node C is a "sibling" (or "sister") of node C; and so on, with terms like "descendants" (a node's children, children's children, etc.), and "generation" (all the nodes at the same "level" in the tree, i.e., are either all grandchildren of the top node, or all great-grand-children, etc.), and "lineage" or "ancestors" (parents, and parent's parents, etc., all the way to the topmost node). So then we get to express rules in terms like "B<A node cannot have more than one parent>", which means that this is not a valid tree: A / \ B C \ / E And: "B<A node can't be its own parent>", which excludes this looped-up connection: /\ A | \/ Or, put more generally: "B<A node can't be its own ancestor>", which excludes the above loop, as well as the one here: /\ Z | / | A | / \ | B C | \/ That tree is excluded because A is a child of Z, and Z is a child of C, and C is a child of A, which means A is its own great-grandparent. So this whole network can't be a tree, because it breaks the sort of meta-rule: B<once any node in the supposed tree breaks the rules for trees, you don't have a tree anymore.> Q: Okay, now, are these two trees the same? A A / | \ / | \ B C D D C B A: It depends whether you're basing your concept of trees on each node having a set (unordered list) of children, or an (ordered) list of children. It's a question of whether ordering is important for what you're doing. With my diagram of citrus types, ordering isn't important, so these tree diagrams express the same thing: citrus / | \ orange lemon kumquat citrus / | \ kumquat orange lemon because it doesn't make sense to say that oranges are "before" or "after" kumquats in the whole botanical scheme of things. (Unless, of course, you I<are> using ordering to mean something, like a degree of genetic similarity.) But consider a tree that's a diagram of what steps are comprised in an activity, to some degree of specificity: make tea / | \ pour infuse serve hot water / \ in cup/pot / \ add let tea sit leaves This means that making tea consists of putting hot water in a cup or put, infusing it (which itself consists of adding tea leaves and letting it sit), then serving it -- I<in that order>. If you serve an empty dry pot (sipping from empty cups, etc.), let it sit, add tea leaves, and pour in hot water, then what you're doing is performance art, not tea preparation: performance art / | \ serve infuse pour / \ hot water / \ in cup/pot let add sit tea leaves Except for my having renamed the root, this tree is the same as the making-tea tree as far as what's under what, but it differs in order, and what the tree means makes the order important. Q: Wait -- "root"? What's a root? A: Besides kinship terms like "mother" and "daughter", the jargon for tree parts also has terms from real-life tree parts: the part that everything else grows from is called the root; and nodes that don't have nodes attached to them (i.e., childless nodes) are called "leaves". Q: But you've been drawing all your trees with the root at the top and leaves at the bottom. A: Yes, but for some reason, that's the way everyone seems to think of trees. They can draw trees as above; or they can draw them sort of sideways with indenting representing what nodes are children of what: * make tea * pour hot water in cup/pot * infuse * add tea leaves * let sit * serve ...but folks almost never seem to draw trees with the root at the bottom. So imagine it's based on spider plant in a hanging pot. Unfortunately, spider plants I<aren't> botanically trees, they're plants; but "spider plant diagram" is rather a mouthful, so let's just call them trees. =head2 Trees Defined Formally In time, I digested all these assorted facts about programmers' ideas of trees (which turned out to be just a more general case of linguistic ideas of trees) into a single rule: * A node is an item that contains ("is over", "is parent of", etc.) zero or more other nodes. From this you can build up formal definitions for useful terms, like so: * A node's B<descendants> are defined as all its children, and all their children, and so on. Or, stated recursively: a node's descendants are all its children, and all its children's descendants. (And if it has no children, it has no descendants.) * A node's B<ancestors> consist of its parent, and its parent's parent, etc, up to the root. Or, recursively: a node's ancestors consist of its parent and its parent's ancestors. (If it has no parent, it has no ancestors.) * A B<tree> is a root node and all the root's descendants. And you can add a proviso or two to clarify exactly what I impute to the word "other" in "other nodes": * A node cannot contain itself, or contain any node that contains it, etc. Looking at it the other way: a node cannot be its own parent or ancestor. * A node can be root (i.e., no other node contains it) or can be contained by only one parent; no node can be the child of two or more parents. Add to this the idea that children are sometimes ordered, and sometimes not, and that's about all you need to know about defining what a tree is. From there it's a matter of using them. =head2 Markup Language Trees: HTML-Tree While not I<all> markup languages are inherently tree-like, the best-known family of markup languages, HTML, SGML, and XML, are about as tree-like as you can get. In these languages, a document consists of elements and character data in a tree structure where there is one root element, and elements can contain either other elements, or character data. =over Footnote: For sake of simplicity, I'm glossing over comments (<!-- ... -->), processing instructions (<?xml version='1.0'>), and declarations (<!ELEMENT ...>, <!DOCTYPE ...>). And I'm not bothering to distinguish entity references (<, @) or CDATA sections (<![CDATA[ ...]]>) from normal text. =back For example, consider this HTML document: <html lang="en-US"> <head> <title> Blank Document! </title> </head> <body bgcolor="#d010ff"> I've got <em> something to saaaaay </em> ! </body> </html> I've indented this to point out what nodes (elements or text items) are children of what, with each node on a line of its own. The HTML::TreeBuilder module (in the CPAN distribution HTML-Tree) does the work of taking HTML source and building in memory the tree that the document source represents. =over Footnote: it requires the HTML::Parser module, which tokenizes the source -- i.e., identifies each tag, bit of text, comment, etc. =back The trees structures that it builds represent bits of text with normal Perl scalar string values; but elements are represented with objects -- that is, chunks of data that belong to a class (in this case, HTML::Element), a class that provides methods (routines) for accessing the pieces of data in each element, and otherwise doing things with elements. (See my article in TPJ#17 for a quick explanation of objects, the POD document C<perltoot> for a longer explanation, or Damian Conway's excellent book I<Object-Oriented Perl> for the full story.) Each HTML::Element object contains a number of pieces of data: * its element name ("html", "h1", etc., accessed as $element->tag) * a list of elements (or text segments) that it contains, if any (accessed as $element->content_list or $element->content, depending on whether you want a list, or an arrayref) * what element, if any, contains it (accessed as $element->parent) * and any SGML attributes that the element has, such as C<lang="en-US">, C<align="center">, etc. (accessed as $element->attr('lang'), $element->attr('center'), etc.) So, for example, when HTML::TreeBuilder builds the tree for the above HTML document source, the object for the "body" element has these pieces of data: * element name: "body" * nodes it contains: the string "I've got " the object for the "em" element the string "!" * its parent: the object for the "html" element * bgcolor: "#d010ff" Now, once you have this tree of objects, almost anything you'd want to do with it starts with searching the tree for some bit of information in some element. Accessing a piece of information in, say, a hash of hashes of hashes, is straightforward: $password{'sean'}{'sburke1'}{'hpux'} because you know that all data points in that structure are accessible with that syntax, but with just different keys. Now, the "em" element in the above HTML tree does happen to be accessible as the root's child #1's child #1: $root->content->[1]->content->[1] But with trees, you typically don't know the exact location (via indexes) of the data you're looking for. Instead, finding what you want will typically involve searching through the tree, seeing if every node is the kind you want. Searching the whole tree is simple enough -- look at a given node, and if it's not what you want, look at its children, and so on. HTML-Tree provides several methods that do this for you, such as C<find_by_tag_name>, which returns the elements (or the first element, if called in scalar context) under a given node (typically the root) whose tag name is whatever you specify. For example, that "em" node can be found as: my $that_em = $root->find_by_tag_name('em'); or as: @ems = $root->find_by_tag_name('em'); # will only have one element for this particular tree Now, given an HTML document of whatever structure and complexity, if you wanted to do something like change every =over E<lt>emE<gt>I<stuff>E<lt>/emE<gt> =back to =over E<lt>em class="funky"E<gt> B<E<lt>bE<gt>[-E<lt>/bE<gt>> I<stuff> B<E<lt>bE<gt>-]E<lt>/bE<gt>> E<lt>/emE<gt> =back the first step is to frame this operation in terms of what you're doing to the tree. You're changing this: em | ... to this: em / | \ b ... b | | "[-" "-]" In other words, you're finding all elements whose tag name is "em", setting its class attribute to "funky", and adding one child to the start of its content list -- a new "b" element whose content is the text string "[-" -- and one to the end of its content list -- a new "b" element whose content is the text string "-]". Once you've got it in these terms, it's just a matter of running to the HTML::Element documentation, and coding this up with calls to the appropriate methods, like so: use HTML::Element 1.53; use HTML::TreeBuilder 2.96; # Build the tree by parsing the document my $root = HTML::TreeBuilder->new; $root->parse_file('whatever.html'); # source file # Now make new nodes where needed foreach my $em ($root->find_by_tag_name('em')) { $em->attr('class', 'funky'); # Set that attribute # Make the two new B nodes my $new1 = HTML::Element->new('b'); my $new2 = HTML::Element->new('b'); # Give them content (they have none at first) $new1->push_content('[-'); $new2->push_content('-]'); # And put 'em in place! $em->unshift_content($new1); $em->push_content($new2); } print "<!-- Looky see what I did! -->\n", $root->as_HTML(), "\n"; The class HTML::Element provides just about every method I can image you needing, for manipulating trees made of HTML::Element objects. (And what it doesn't directly provide, it will give you the components to build it with.) =head2 Building Your Own Trees Theoretically, any tree is pretty much like any other tree, so you could use HTML::Element for anything you'd ever want to do with tree-arranged objects. However, as its name implies, HTML::Element is basically I<for> HTML elements; it has lots of features that make sense only for HTML elements (like the idea that every element must have a tag-name). And it lacks some features that might be useful for general applications -- such as any sort of checking to make sure that you're not trying to arrange objects in a non-treelike way. For a general-purpose tree class that does have such features, you can use Tree::DAG_Node, also available from CPAN. However, if your task is simple enough, you might find it overkill to bother using Tree::DAG_Node. And, in any case, I find that the best way to learn how something works is to implement it (or something like it, but simpler) yourself. So I'll here discuss how you'd implement a tree structure, I<without> using any of the existing classes for tree nodes. =head2 Implementation: Game Trees for Alak Suppose that the task at hand is to write a program that can play against a human opponent at a strategic board game (as opposed to a board game where there's an element of chance). For most such games, a "game tree" is an essential part of the program (as I will argue, below), and this will be our test case for implementing a tree structure from scratch. For sake of simplicity, our game is not chess or backgammon, but instead a much simpler game called Alak. Alak was invented by the mathematician A. K. Dewdney, and described in his 1984 book I<Planiverse>. The rules of Alak are simple: =over Footnote: Actually, I'm describing only my interpretation of the rules Dewdney describes in I<Planiverse>. Many other interpretations are possible. =back * Alak is a two-player game played on a one-dimensional board with eleven slots on it. Each slot can hold at most one piece at a time. There's two kinds of pieces, which I represent here as "x" and "o" -- x's belong to one player (called X), o's to the other (called O). * The initial configuration of the board is: xxxx___oooo For sake of the article, the slots are numbered from 1 (on the left) to 11 (on the right), and X always has the first move. * The players take turns moving. At each turn, each player can move only one piece, once. (This unlike checkers, where you move one piece per move but get to keep moving it if you jump an your opponent's piece.) A player cannot pass up on his turn. A player can move any one of his pieces to the next unoccupied slot to its right or left, which may involve jumping over occupied slots. A player cannot move a piece off the side of the board. * If a move creates a pattern where the opponent's pieces are surrounded, on both sides, by two pieces of the mover's color (with no intervening unoccupied blank slot), then those surrounded pieces are removed from the board. * The goal of the game is to remove all of your opponent's pieces, at which point the game ends. Removing all-but-one ends the game as well, since the opponent can't surround you with one piece, and so will always lose within a few moves anyway. Consider, then, this rather short game where X starts: xxxx___oooo ^ Move 1: X moves from 3 (shown with caret) to 5 (Note that any of X's pieces could move, but that the only place they could move to is 5.) xx_xx__oooo ^ Move 2: O moves from 9 to 7. xx_xx_oo_oo ^ Move 3: X moves from 4 to 6. xx__xxoo_oo ^ Move 4: O (stupidly) moves from 10 to 9. xx__xxooo_o ^ Move 5: X moves from 5 to 10, making the board "xx___xoooxo". The three o's that X just surrounded are removed. xx___x___xo O has only one piece, so has lost. Now, move 4 could have gone quite the other way: xx__xxoo_oo Move 4: O moves from 8 to 4, making the board "xx_oxxo__oo". The surrounded x's are removed. xx_o__o__oo ^ Move 5: X moves from 1 to 2. _xxo__o__oo ^ Move 6: O moves from 7 to 6. _xxo_o___oo ^ Move 7: X moves from 2 to 5, removing the o at 4. __x_xo___oo ...and so on. To teach a computer program to play Alak (as player X, say), it needs to be able to look at the configuration of the board, figure out what moves it can make, and weigh the benefit or costs, immediate or eventual, of those moves. So consider the board from just before move 3, and figure all the possible moves X could make. X has pieces in slots 1, 2, 4, and 5. The leftmost two x's (at 1 and 2) are up against the end of the board, so they can move only right. The other two x's (at 4 and 5) can move either right or left: Starting board: xx_xx_oo_oo moving 1 to 3 gives _xxxx_oo_oo moving 2 to 3 gives x_xxx_oo_oo moving 4 to 3 gives xxx_x_oo_oo moving 5 to 3 gives xxxx__oo_oo moving 4 to 6 gives xx__xxoo_oo moving 5 to 6 gives xx_x_xoo_oo For the computer to decide which of these is the best move to make, it needs to quantify the benefit of these moves as a number -- call that the "payoff". The payoff of a move can be figured as just the number of x pieces removed by the most recent move, minus the number of o pieces removed by the most recent move. (It so happens that the rules of the game mean that no move can delete both o's and x's, but the formula still applies.) Since none of these moves removed any pieces, all these moves have the same immediate payoff: 0. Now, we could race ahead and write an Alak-playing program that could use the immediate payoff to decide which is the best move to make. And when there's more than one best move (as here, where all the moves are equally good), it could choose randomly between the good alternatives. This strategy is simple to implement; but it makes for a very dumb program. Consider what O's response to each of the potential moves (above) could be. Nothing immediately suggests itself for the first four possibilities (X having moved something to position 3), but either of the last two (illustrated below) are pretty perilous, because in either case O has the obvious option (which he would be foolish to pass up) of removing x's from the board: xx_xx_oo_oo ^ X moves 4 to 6. xx__xxoo_oo ^ O moves 8 to 4, giving "xx_oxxo__oo". The two surrounded x's are removed. xx_o__o__oo or xx_xx_oo_oo ^ X moves 5 to 6. xx_x_xoo_oo ^ O moves 8 to 5, giving "xx_xoxo__oo". The one surrounded x is removed. xx_xo_o__oo Both contingencies are quite bad for X -- but this is not captured by the fact that they start out with X thinking his move will be harmless, having a payoff of zero. So what's needed is for X to think I<more> than one step ahead -- to consider not merely what it can do in this move, and what the payoff is, but to consider what O might do in response, and the payoff of those potential moves, and so on with X's possible responses to those cases could be. All these possibilities form a game tree -- a tree where each node is a board, and its children are successors of that node -- i.e., the boards that could result from every move possible, given the parent's board. But how to represent the tree, and how to represent the nodes? Well, consider that a node holds several pieces of data: 1) the configuration of the board, which, being nice and simple and one-dimensional, can be stored as just a string, like "xx_xx_oo_oo". 2) whose turn it is, X or O. (Or: who moved last, from which we can figure whose turn it is). 3) the successors (child nodes). 4) the immediate payoff of having moved to this board position from its predecessor (parent node). 5) and what move gets us from our predecessor node to here. (Granted, knowing the board configuration before and after the move, it's easy to figure out the move; but it's easier still to store it as one is figuring out a node's successors.) 6) whatever else we might want to add later. These could be stored equally well in an array or in a hash, but it's my experience that hashes are best for cases where you have more than just two or three bits of data, or especially when you might need to add new bits of data. Moreover, hash key names are mnemonic -- $node->{'last_move_payoff'} is plain as day, whereas it's not so easy having to remember with an array that $node->[3] is where you decided to keep the payoff. =over Footnote: Of course, there are ways around that problem: just swear you'll never use a real numeric index to access data in the array, and instead use constants with mnemonic names: use strict; use constant idx_PAYOFF => 3; ... $n->[idx_PAYOFF] Or use a pseudohash. But I prefer to keep it simple, and use a hash. These are, incidentally, the same arguments that people weigh when trying to decide whether their object-oriented modules should be based on blessed hashes, blessed arrays, or what. Essentially the only difference here is that we're not blessing our nodes or talking in terms of classes and methods. [end footnote] =back So, we might as well represent nodes like so: $node = { # hashref 'board' => ...board string, e.g., "xx_x_xoo_oo" 'last_move_payoff' => ...payoff of the move that got us here. 'last_move_from' => ...the start... 'last_move_to' => ...and end point of the move that got us here. E.g., 5 and 6, representing a move from 5 to 6. 'whose_turn' => ...whose move it then becomes. just an 'x' or 'o'. 'successors' => ...the successors }; Note that we could have a field called something like 'last_move_who' to denote who last moved, but since turns in Alak always alternate (and no-one can pass), storing whose move it is now I<and> who last moved is redundant -- if X last moved, it's O turn now, and vice versa. I chose to have a 'whose_turn' field instead of a 'last_move_who', but it doesn't really matter. Either way, we'll end up inferring one from the other at several points in the program. When we want to store the successors of a node, should we use an array or a hash? On the one hand, the successors to $node aren't essentially ordered, so there's no reason to use an array per se; on the other hand, if we used a hash, with successor nodes as values, we don't have anything particularly meaningful to use as keys. (And we can't use the successors themselves as keys, since the nodes are referred to by hash references, and you can't use a reference as a hash key.) Given no particularly compelling reason to do otherwise, I choose to just use an array to store all a node's successors, although the order is never actually used for anything: $node = { ... 'successors' => [ ...nodes... ], ... }; In any case, now that we've settled on what should be in a node, let's make a little sample tree out of a few nodes and see what we can do with it: # Board just before move 3 in above game my $n0 = { 'board' => 'xx_xx_oo_oo', 'last_move_payoff' => 0, 'last_move_from' => 9, 'last_move_to' => 7, 'whose_turn' => 'x', 'successors' => [], }; # And, for now, just two of the successors: # X moves 4 to 6, giving xx__xxoo_oo my $n1 = { 'board' => 'xx__xxoo_oo', 'last_move_payoff' => 0, 'last_move_from' => 4, 'last_move_to' => 6, 'whose_turn' => 'o', 'successors' => [], }; # or X moves 5 to 6, giving xx_x_xoo_oo my $n2 = { 'board' => 'xx_x_xoo_oo', 'last_move_payoff' => 0, 'last_move_from' => 5, 'last_move_to' => 6, 'whose_turn' => 'o', 'successors' => [], }; # Now connect them... push @{$n0->{'successors'}}, $n1, $n2; =head2 Digression: Links to Parents In comparing what we store in an Alak game tree node to what HTML::Element stores in HTML element nodes, you'll note one big difference: every HTML::Element node contains a link to its parent, whereas we don't have our Alak nodes keeping a link to theirs. The reason this can be an important difference is because it can affect how Perl knows when you're not using pieces of memory anymore. Consider the tree we just built, above: node 0 / \ node 1 node 2 There's two ways Perl knows you're using a piece of memory: 1) it's memory that belongs directly to a variable (i.e., is necessary to hold that variable's value, or valueI<s> in the case of a hash or array), or 2) it's a piece of memory that something holds a reference to. In the above code, Perl knows that the hash for node 0 (for board "xx_xx_oo_oo") is in use because something (namely, the variable C<$n0>) holds a reference to it. Now, even if you followed the above code with this: $n1 = $n2 = 'whatever'; to make your variables C<$n1> and C<$n2> stop holding references to the hashes for the two successors of node 0, Perl would still know that those hashes are still in use, because node 0's successors array holds a reference to those hashes. And Perl knows that node 0 is still in use because something still holds a reference to it. Now, if you added: my $root = $n0; This would change nothing -- there's just be I<two> things holding a reference to the node 0 hash, which in turn holds a reference to the node 1 and node 2 hashes. And if you then added: $n0 = 'stuff'; still nothing would change, because something (C<$root>) still holds a reference to the node 0 hash. But once I<nothing> holds a reference to the node 0 hash, Perl will know it can destroy that hash (and reclaim the memory for later use, say), and once it does that, nothing will hold a reference to the node 1 or the node 2 hashes, and those will be destroyed too. But consider if the node 1 and node 2 hashes each had an attribute "parent" (or "predecessor") that held a reference to node 0. If your program stopped holding a reference to the node 0 hash, Perl could I<not> then say that I<nothing> holds a reference to node 0 -- because node 1 and node 2 still do. So, the memory for nodes 0, 1, and 2 would never get reclaimed (until your program ended, at which point Perl destroys I<everything>). If your program grew and discarded lots of nodes in the game tree, but didn't let Perl know it could reclaim their memory, your program could grow to use immense amounts of memory -- never a nice thing to have happen. There's three ways around this: 1) When you're finished with a node, delete the reference each of its children have to it (in this case, deleting $n1->{'parent'}, say). When you're finished with a whole tree, just go through the whole tree erasing links that children have to their children. 2) Reconsider whether you really need to have each node hold a reference to its parent. Just not having those links will avoid the whole problem. 3) use the WeakRef module with Perl 5.6 or later. This allows you to "weaken" some references (like the references that node 1 and 2 could hold to their parent) so that they don't count when Perl goes asking whether anything holds a reference to a given piece of memory. This wonderful new module eliminates the headaches that can often crop up with either of the two previous methods. It so happens that our Alak program is simple enough that we don't need for our nodes to have links to their parents, so the second solution is fine. But in a more advanced program, the first or third solutions might be unavoidable. =head2 Recursively Printing the Tree I don't like working blind -- if I have any kind of a complex data structure in memory for a program I'm working on, the first thing I do is write something that can dump that structure to the screen so I can make sure that what I I<think> is in memory really I<is> what's in memory. Now, I could just use the "x" pretty-printer command in Perl's interactive debugger, or I could have the program use the C<Data::Dumper> module. But in this case, I think the output from those is rather too verbose. Once we have trees with dozens of nodes in them, we'll really want a dump of the tree to be as concise as possible, hopefully just one line per node. What I'd like is something that can print C<$n0> and its successors (see above) as something like: xx_xx_oo_oo (O moved 9 to 7, 0 payoff) xx__xxoo_oo (X moved 4 to 6, 0 payoff) xx_x_xoo_oo (X moved 5 to 6, 0 payoff) A subroutine to print a line for a given node, and then do that again for each successor, would look something like: sub dump_tree { my $n = $_[0]; # "n" is for node print ...something expressing $n'n content... foreach my $s (@{$n->{'successors'}}) { # "s for successor dump($s); } } And we could just start that out with a call to C<dump_tree($n0)>. Since this routine... =over Footnote: I first wrote this routine starting out with "sub dump {". But when I tried actually calling C<dump($n0)>, Perl would dump core! Imagine my shock when I discovered that this is absolutely to be expected -- Perl provides a built-in function called C<dump>, the purpose of which is to, yes, make Perl dump core. Calling our routine "dump_tree" instead of "dump" neatly avoids that problem. =back ...does its work (dumping the subtree at and under the given node) by calling itself, it's B<recursive>. However, there's a special term for this kind of recursion across a tree: traversal. To B<traverse> a tree means to do something to a node, and to traverse its children. There's two prototypical ways to do this, depending on what happens when: traversing X in pre-order: * do something to X * then traverse X's children traversing X in post-order: * traverse X's children * then do something to X Dumping the tree to the screen the way we want it happens to be a matter of pre-order traversal, since the thing we do (print a description of the node) happens before we recurse into the successors. When we try writing the C<print> statement for our above C<dump_tree>, we can get something like: sub dump_tree { my $n = $_[0]; # "xx_xx_oo_oo (O moved 9 to 7, 0 payoff)" print $n->{'board'}, " (", ($n->{'whose_turn'} eq 'o' ? 'X' : 'O'), # Infer who last moved from whose turn it is now. " moved ", $n->{'last_move_from'}, " to ", $n->{'last_move_to'}, ", ", $n->{'last_move_payoff'}, " payoff)\n", ; foreach my $s (@{$n->{'successors'}}) { dump_tree($s); } } If we run this on $n0 from above, we get this: xx_xx_oo_oo (O moved 9 to 7, 0 payoff) xx__xxoo_oo (X moved 4 to 6, 0 payoff) xx_x_xoo_oo (X moved 5 to 6, 0 payoff) Each line on its own is fine, but we forget to allow for indenting, and without that we can't tell what's a child of what. (Imagine if the first successor had successors of its own -- you wouldn't be able to tell if it were a child, or a sibling.) To get indenting, we'll need to have the instances of the C<dump_tree> routine know how far down in the tree they're being called, by passing a depth parameter between them: sub dump_tree { my $n = $_[0]; my $depth = $_[1]; $depth = 0 unless defined $depth; print " " x $depth, ...stuff... foreach my $s (@{$n->{'successors'}}) { dump_tree($s, $depth + 1); } } When we call C<dump_tree($n0)>, C<$depth> (from C<$_[1]>) is undefined, so gets set to 0, which translates into an indenting of no spaces. But when C<dump_tree> invokes itself on C<$n0>'s children, those instances see C<$depth> + 1 as their C<$_[1]>, giving appropriate indenting. =over Footnote: Passing values around between different invocations of a recursive routine, as shown, is a decent way to share the data. Another way to share the data is by keeping it in a global variable, like C<$Depth>, initially set to 0. Each time C<dump_tree> is about to recurse, it must C<++$Depth>, and when it's back, it must C<--$Depth>. Or, if the reader is familiar with closures, consider this approach: sub dump_tree { # A wrapper around calls to a recursive closure: my $start_node = $_[0]; my $depth = 0; # to be shared across calls to $recursor. my $recursor; $recursor = sub { my $n = $_[0]; print " " x $depth, ...stuff... ++$depth; foreach my $s (@{$n->{'successors'}}) { $recursor->($s); } --$depth; } $recursor->($start_node); # start recursing undef $recursor; } The reader with an advanced understanding of Perl's reference-count-based garbage collection is invited to consider why it is currently necessary to undef $recursor (or otherwise change its value) after all recursion is done. The reader whose mind is perverse in other ways is invited to consider how (or when!) passing a depth parameter around is unnecessary because of information that Perl's C<caller(N)> function reports! [end footnote] =back =head2 Growing the Tree Our C<dump_tree> routine works fine for the sample tree we've got, so now we should get the program working on making its own trees, starting from a given board. In C<Games::Alak> (the CPAN-released version of Alak that uses essentially the same code that we're currently discussing the tree-related parts of), there is a routine called C<figure_successors> that, given one childless node, will figure out all its possible successors. That is, it looks at the current board, looks at every piece belonging to the player whose turn it is, and considers the effect of moving each piece every possible way -- notably, it figures out the immediate payoff, and if that move would end the game, it notes that by setting an "endgame" entry in that node's hash. (That way, we know that that's a node that I<can't> have successors.) In the code for C<Games::Alak>, C<figure_successors> does all these things, in a rather straightforward way. I won't walk you through the details of the C<figure_successors> code I've written, since the code has nothing much to do with trees, and is all just implementation of the Alak rules for what can move where, with what result. Especially interested readers can puzzle over that part of code in the source listing in the archive from CPAN, but others can just assume that it works as described above. But consider that C<figure_successors>, regardless of its inner workings, does not grow the I<tree>; it only makes one set of successors for one node at a time. It has to be up to a different routine to call C<figure_successors>, and to keep applying it as needed, in order to make a nice big tree that our game-playing program can base its decisions on. Now, we could do this by just starting from one node, applying C<figure_successors> to it, then applying C<figure_successors> on all the resulting children, and so on: sub grow { # Just a first attempt at this! my $n = $_[0]; figure_successors($n); unless @{$n->{'successors'}} # already has successors. or $n->{'endgame'} # can't have successors. } foreach my $s (@{$n->{'successors'}}) { grow($s); # recurse } } If you have a game tree for tic-tac-toe, and you grow it without limitation (as above), you will soon enough have a fully "solved" tree, where every node that I<can> have successors I<does>, and all the leaves of the tree are I<all> the possible endgames (where, in each case, the board is filled). But a game of Alak is different from tic-tac-toe, because it can, in theory, go on forever. For example, the following sequence of moves is quite possible: xxxx___oooo xxx_x__oooo xxx_x_o_ooo xxxx__o_ooo (x moved back) xxxx___oooo (o moved back) ...repeat forever... So if you tried using our above attempt at a C<grow> routine, Perl would happily start trying to construct an infinitely deep tree, containing an infinite number of nodes, consuming an infinite amount of memory, and requiring an infinite amount of time. As the old saying goes: "You can't have everything -- where would you put it?" So we have to place limits on how much we'll grow the tree. There's more than one way to do this: 1. We could grow the tree until we hit some limit on the number of nodes we'll allow in the tree. 2. We could grow the tree until we hit some limit on the amount of time we're willing to spend. 3. Or we could grow the tree until it is fully fleshed out to a certain depth. Since we already know to track depth (as we did in writing C<dump_tree>), we'll do it that way, the third way. The implementation for that third approach is also pretty straightforward: $Max_depth = 3; sub grow { my $n = $_[0]; my $depth = $_[1] || 0; figure_successors($n) unless $depth >= $Max_depth or @{$n->{'successors'}} or $n->{'endgame'} } foreach my $s (@{$n->{'successors'}}) { grow($s, $depth + 1); } # If we're at $Max_depth, then figure_successors # didn't get called, so there's no successors # to recurse under -- that's what stops recursion. } If we start from a single node (whether it's a node for the starting board "xxxx___oooo", or for whatever board the computer is faced with), set C<$Max_depth> to 4, and apply C<grow> to it, it will grow the tree to include several hundred nodes. =over Footnote: If at each move there are four pieces that can move, and they can each move right or left, the "branching factor" of the tree is eight, giving a tree with 1 (depth 0) + 8 (depth 1) + 8 ** 2 + 8 ** 3 + 8 ** 4 = 4681 nodes in it. But, in practice, not all pieces can move in both directions (none of the x pieces in "xxxx___oooo" can move left, for example), and there may be fewer than four pieces, if some were lost. For example, there are 801 nodes in a tree of depth four starting from "xxxx___oooo", suggesting an average branching factor of about five (801 ** (1/4) is about 5.3), not eight. =back What we need to derive from that tree is the information about what are the best moves for X. The simplest way to consider the payoff of different successors is to just average them -- but what we average isn't always their immediate payoffs (because that'd leave us using only one generation of information), but the average payoff of I<their> successors, if any. We can formalize this as: To figure a node's average payoff: If the node has successors: Figure each successor's average payoff. My average payoff is the average of theirs. Otherwise: My average payoff is my immediate payoff. Since this involves recursing into the successors I<before> doing anything with the current node, this will traverse the tree I<in post-order>. We could work that up as a routine of its own, and apply that to the tree after we've applied C<grow> to it. But since we'd never grow the tree without also figuring the average benefit, we might as well make that figuring part of the C<grow> routine itself: $Max_depth = 3; sub grow { my $n = $_[0]; my $depth = $_[1] || 0; figure_successors($n); unless $depth >= $Max_depth or @{$n->{'successors'}} or $n->{'endgame'} } if(@{$n->{'successors'}}) { my $a_payoff_sum = 0; foreach my $s (@{$n->{'successors'}}) { grow($s, $depth + 1); # RECURSE $a_payoff_sum += $s->{'average_payoff'}; } $n->{'average_payoff'} = $a_payoff_sum / @{$n->{'successors'}}; } else { $n->{'average_payoff'} = $n->{'last_move_payoff'}; } } So, by time C<grow> has applied to a node (wherever in the tree it is), it will have figured successors if possible (which, in turn, sets C<last_move_payoff> for each node it creates), and will have set C<average_benefit>. Beyond this, all that's needed is to start the board out with a root note of "xxxx___oooo", and have the computer (X) take turns with the user (O) until someone wins. Whenever it's O's turn, C<Games::Alak> presents a prompt to the user, letting him know the state of the current board, and asking what move he selects. When it's X's turn, the computer grows the game tree as necessary (using just the C<grow> routine from above), then selects the move with the highest average payoff (or one of the highest, in case of a tie). In either case, "selecting" a move means just setting that move's node as the new root of the program's game tree. Its sibling nodes and their descendants (the boards that I<didn't> get selected) and its parent node will be erased from memory, since they will no longer be in use (as Perl can tell by the fact that nothing holds references to them anymore). The interface code in C<Games::Alak> (the code that prompts the user for his move) actually supports quite a few options besides just moving -- including dumping the game tree to a specified depth (using a slightly fancier version of C<dump_tree>, above), resetting the game, changing C<$Max_depth> in the middle of the game, and quitting the game. Like C<figure_successors>, it's a bit too long to print here, but interested users are welcome to peruse (and freely modify) the code, as well as to enjoy just playing the game. Now, in practice, there's more to game trees than this: for games with a larger branching factor than Alak has (which is most!), game trees of depth four or larger would contain too many nodes to be manageable, most of those nodes being strategically quite uninteresting for either player; dealing with game trees specifically is therefore a matter of recognizing uninteresting contingencies and not bothering to grow the tree under them. =over Footnote: For example, to choose a straightforward case: if O has a choice between moves that put him in immediate danger of X winning and moves that don't, then O won't ever choose the dangerous moves (and if he does, the computer will know enough to end the game), so there's no point in growing the tree any further beneath those nodes. =back But this sample implementation should illustrate the basics of how to build and manipulate a simple tree structure in memory. And once you've understood the basics of tree storage here, you should be ready to better understand the complexities and peculiarities of other systems for creating, accessing, and changing trees, including Tree::DAG_Node, HTML::Element, XML::DOM, or related formalisms like XPath and XSL. B<[end body of article]> =head2 [Author Credit] Sean M. Burke (C<sburke@cpan.org>) is a tree-dwelling hominid. =head2 References Dewdney, A[lexander] K[eewatin]. 1984. I<Planiverse: Computer Contact with a Two-Dimensional World.> Poseidon Press, New York. Knuth, Donald Ervin. 1997. I<Art of Computer Programming, Volume 1, Third Edition: Fundamental Algorithms>. Addison-Wesley, Reading, MA. Wirth, Niklaus. 1976. I<Algorithms + Data Structures = Programs> Prentice-Hall, Englewood Cliffs, NJ. Worth, Stan and Allman Sheldon. Circa 1967. I<George of the Jungle> theme. [music by Jay Ward.] Wirth's classic, currently and lamentably out of print, has a good section on trees. I find it clearer than Knuth's (if not quite as encyclopedic), probably because Wirth's example code is in a block-structured high-level language (basically Pascal), instead of in assembler (MIX). I believe the book was re-issued in the 1980s under the titles I<Algorithms and Data Structures> and, in a German edition, I<Algorithmen und Datenstrukturen>. Cheap copies of these editions should be available through used book services such as C<abebooks.com>. Worth's classic, however, is available on the soundtrack to the 1997 I<George of the Jungle> movie, as performed by The Presidents of the United States of America. =head1 BACK Return to the L<HTML::Tree|HTML::Tree> docs. =cut Tree/AboutObjects.pod 0000644 00000075313 00000000000 0010505 0 ustar 00 #Time-stamp: "2001-02-23 20:07:25 MST" -*-Text-*- # This document contains text in Perl "POD" format. # Use a POD viewer like perldoc or perlman to render it. =head1 NAME HTML::Tree::AboutObjects -- article: "User's View of Object-Oriented Modules" =head1 SYNOPSIS # This an article, not a module. =head1 DESCRIPTION The following article by Sean M. Burke first appeared in I<The Perl Journal> #17 and is copyright 2000 The Perl Journal. It appears courtesy of Jon Orwant and The Perl Journal. This document may be distributed under the same terms as Perl itself. =head1 A User's View of Object-Oriented Modules -- Sean M. Burke The first time that most Perl programmers run into object-oriented programming when they need to use a module whose interface is object-oriented. This is often a mystifying experience, since talk of "methods" and "constructors" is unintelligible to programmers who thought that functions and variables was all there was to worry about. Articles and books that explain object-oriented programming (OOP), do so in terms of how to program that way. That's understandable, and if you learn to write object-oriented code of your own, you'd find it easy to use object-oriented code that others write. But this approach is the I<long> way around for people whose immediate goal is just to use existing object-oriented modules, but who don't yet want to know all the gory details of having to write such modules for themselves. This article is for those programmers -- programmers who want to know about objects from the perspective of using object-oriented modules. =head2 Modules and Their Functional Interfaces Modules are the main way that Perl provides for bundling up code for later use by yourself or others. As I'm sure you can't help noticing from reading I<The Perl Journal>, CPAN (the Comprehensive Perl Archive Network) is the repository for modules (or groups of modules) that others have written, to do anything from composing music to accessing Web pages. A good deal of those modules even come with every installation of Perl. One module that you may have used before, and which is fairly typical in its interface, is Text::Wrap. It comes with Perl, so you don't even need to install it from CPAN. You use it in a program of yours, by having your program code say early on: use Text::Wrap; and after that, you can access a function called C<wrap>, which inserts line-breaks in text that you feed it, so that the text will be wrapped to seventy-two (or however many) columns. The way this C<use Text::Wrap> business works is that the module Text::Wrap exists as a file "Text/Wrap.pm" somewhere in one of your library directories. That file contains Perl code... =over Footnote: And mixed in with the Perl code, there's documentation, which is what you read with "perldoc Text::Wrap". The perldoc program simply ignores the code and formats the documentation text, whereas "use Text::Wrap" loads and runs the code while ignoring the documentation. =back ...which, among other things, defines a function called C<Text::Wrap::wrap>, and then C<exports> that function, which means that when you say C<wrap> after having said "use Text::Wrap", you'll be actually calling the C<Text::Wrap::wrap> function. Some modules don't export their functions, so you have to call them by their full name, like C<Text::Wrap::wrap(...parameters...)>. Regardless of whether the typical module exports the functions it provides, a module is basically just a container for chunks of code that do useful things. The way the module allows for you to interact with it, is its I<interface>. And when, like with Text::Wrap, its interface consists of functions, the module is said to have a B<functional interface>. =over Footnote: the term "function" (and therefore "functionI<al>") has various senses. I'm using the term here in its broadest sense, to refer to routines -- bits of code that are called by some name and which take parameters and return some value. =back Using modules with functional interfaces is straightforward -- instead of defining your own "wrap" function with C<sub wrap { ... }>, you entrust "use Text::Wrap" to do that for you, along with whatever other functions its defines and exports, according to the module's documentation. Without too much bother, you can even write your own modules to contain your frequently used functions; I suggest having a look at the C<perlmod> man page for more leads on doing this. =head2 Modules with Object-Oriented Interfaces So suppose that one day you want to write a program that will automate the process of C<ftp>ing a bunch of files from one server down to your local machine, and then off to another server. A quick browse through search.cpan.org turns up the module "Net::FTP", which you can download and install it using normal installation instructions (unless your sysadmin has already installed it, as many have). Like Text::Wrap or any other module with a familiarly functional interface, you start off using Net::FTP in your program by saying: use Net::FTP; However, that's where the similarity ends. The first hint of difference is that the documentation for Net::FTP refers to it as a B<class>. A class is a kind of module, but one that has an object-oriented interface. Whereas modules like Text::Wrap provide bits of useful code as I<functions>, to be called like C<function(...parameters...)> or like C<PackageName::function(...parameters...)>, Net::FTP and other modules with object-oriented interfaces provide B<methods>. Methods are sort of like functions in that they have a name and parameters; but methods look different, and are different, because you have to call them with a syntax that has a class name or an object as a special argument. I'll explain the syntax for method calls, and then later explain what they all mean. Some methods are meant to be called as B<class methods>, with the class name (same as the module name) as a special argument. Class methods look like this: ClassName->methodname(parameter1, parameter2, ...) ClassName->methodname() # if no parameters ClassName->methodname # same as above which you will sometimes see written: methodname ClassName (parameter1, parameter2, ...) methodname ClassName # if no parameters Basically all class methods are for making new objects, and methods that make objects are called "B<constructors>" (and the process of making them is called "constructing" or "instantiating"). Constructor methods typically have the name "new", or something including "new" ("new_from_file", etc.); but they can conceivably be named anything -- DBI's constructor method is named "connect", for example. The object that a constructor method returns is typically captured in a scalar variable: $object = ClassName->new(param1, param2...); Once you have an object (more later on exactly what that is), you can use the other kind of method call syntax, the syntax for B<object method> calls. Calling object methods is just like class methods, except that instead of the ClassName as the special argument, you use an expression that yields an "object". Usually this is just a scalar variable that you earlier captured the output of the constructor in. Object method calls look like this: $object->methodname(parameter1, parameter2, ...); $object->methodname() # if no parameters $object->methodname # same as above which is occasionally written as: methodname $object (parameter1, parameter2, ...) methodname $object # if no parameters Examples of method calls are: my $session1 = Net::FTP->new("ftp.myhost.com"); # Calls a class method "new", from class Net::FTP, # with the single parameter "ftp.myhost.com", # and saves the return value (which is, as usual, # an object), in $session1. # Could also be written: # new Net::FTP('ftp.myhost.com') $session1->login("sburke","aoeuaoeu") || die "failed to login!\n"; # calling the object method "login" print "Dir:\n", $session1->dir(), "\n"; $session1->quit; # same as $session1->quit() print "Done\n"; exit; Incidentally, I suggest always using the syntaxes with parentheses and "->" in them, =over Footnote: the character-pair "->" is supposed to look like an arrow, not "negative greater-than"! =back and avoiding the syntaxes that start out "methodname $object" or "methodname ModuleName". When everything's going right, they all mean the same thing as the "->" variants, but the syntax with "->" is more visually distinct from function calls, as well as being immune to some kinds of rare but puzzling ambiguities that can arise when you're trying to call methods that have the same name as subroutines you've defined. But, syntactic alternatives aside, all this talk of constructing objects and object methods begs the question -- what I<is> an object? There are several angles to this question that the rest of this article will answer in turn: what can you do with objects? what's in an object? what's an object value? and why do some modules use objects at all? =head2 What Can You Do with Objects? You've seen that you can make objects, and call object methods with them. But what are object methods for? The answer depends on the class: A Net::FTP object represents a session between your computer and an FTP server. So the methods you call on a Net::FTP object are for doing whatever you'd need to do across an FTP connection. You make the session and log in: my $session = Net::FTP->new('ftp.aol.com'); die "Couldn't connect!" unless defined $session; # The class method call to "new" will return # the new object if it goes OK, otherwise it # will return undef. $session->login('sburke', 'p@ssw3rD') || die "Did I change my password again?"; # The object method "login" will give a true # return value if actually logs in, otherwise # it'll return false. You can use the session object to change directory on that session: $session->cwd("/home/sburke/public_html") || die "Hey, that was REALLY supposed to work!"; # if the cwd fails, it'll return false ...get files from the machine at the other end of the session... foreach my $f ('log_report_ua.txt', 'log_report_dom.txt', 'log_report_browsers.txt') { $session->get($f) || warn "Getting $f failed!" }; ...and plenty else, ending finally with closing the connection: $session->quit(); In short, object methods are for doing things related to (or with) whatever the object represents. For FTP sessions, it's about sending commands to the server at the other end of the connection, and that's about it -- there, methods are for doing something to the world outside the object, and the objects is just something that specifies what bit of the world (well, what FTP session) to act upon. With most other classes, however, the object itself stores some kind of information, and it typically makes no sense to do things with such an object without considering the data that's in the object. =head2 What's I<in> an Object? An object is (with rare exceptions) a data structure containing a bunch of attributes, each of which has a value, as well as a name that you use when you read or set the attribute's value. Some of the object's attributes are private, meaning you'll never see them documented because they're not for you to read or write; but most of the object's documented attributes are at least readable, and usually writeable, by you. Net::FTP objects are a bit thin on attributes, so we'll use objects from the class Business::US_Amort for this example. Business::US_Amort is a very simple class (available from CPAN) that I wrote for making calculations to do with loans (specifically, amortization, using US-style algorithms). An object of the class Business::US_Amort represents a loan with particular parameters, i.e., attributes. The most basic attributes of a "loan object" are its interest rate, its principal (how much money it's for), and it's term (how long it'll take to repay). You need to set these attributes before anything else can be done with the object. The way to get at those attributes for loan objects is just like the way to get at attributes for any class's objects: through accessors. An B<accessor> is simply any method that accesses (whether reading or writing, AKA getting or putting) some attribute in the given object. Moreover, accessors are the B<only> way that you can change an object's attributes. (If a module's documentation wants you to know about any other way, it'll tell you.) Usually, for simplicity's sake, an accessor is named after the attribute it reads or writes. With Business::US_Amort objects, the accessors you need to use first are C<principal>, C<interest_rate>, and C<term>. Then, with at least those attributes set, you can call the C<run> method to figure out several things about the loan. Then you can call various accessors, like C<total_paid_toward_interest>, to read the results: use Business::US_Amort; my $loan = Business::US_Amort->new; # Set the necessary attributes: $loan->principal(123654); $loan->interest_rate(9.25); $loan->term(20); # twenty years # NOW we know enough to calculate: $loan->run; # And see what came of that: print "Total paid toward interest: A WHOPPING ", $loan->total_paid_interest, "!!\n"; This illustrates a convention that's common with accessors: calling the accessor with no arguments (as with $loan->total_paid_interest) usually means to read the value of that attribute, but providing a value (as with $loan->term(20)) means you want that attribute to be set to that value. This stands to reason: why would you be providing a value, if not to set the attribute to that value? Although a loan's term, principal, and interest rates are all single numeric values, an objects values can any kind of scalar, or an array, or even a hash. Moreover, an attribute's value(s) can be objects themselves. For example, consider MIDI files (as I wrote about in TPJ#13): a MIDI file usually consists of several tracks. A MIDI file is complex enough to merit being an object with attributes like its overall tempo, the file-format variant it's in, and the list of instrument tracks in the file. But tracks themselves are complex enough to be objects too, with attributes like their track-type, a list of MIDI commands if they're a MIDI track, or raw data if they're not. So I ended up writing the MIDI modules so that the "tracks" attribute of a MIDI::Opus object is an array of objects from the class MIDI::Track. This may seem like a runaround -- you ask what's in one object, and get I<another> object, or several! But in this case, it exactly reflects what the module is for -- MIDI files contain MIDI tracks, which then contain data. =head2 What is an Object Value? When you call a constructor like Net::FTP->new(I<hostname>), you get back an object value, a value you can later use, in combination with a method name, to call object methods. Now, so far we've been pretending, in the above examples, that the variables $session or $loan I<are> the objects you're dealing with. This idea is innocuous up to a point, but it's really a misconception that will, at best, limit you in what you know how to do. The reality is not that the variables $session or $query are objects; it's a little more indirect -- they I<hold> values that symbolize objects. The kind of value that $session or $query hold is what I'm calling an object value. To understand what kind of value this is, first think about the other kinds of scalar values you know about: The first two scalar values you probably ever ran into in Perl are B<numbers> and B<strings>, which you learned (or just assumed) will usually turn into each other on demand; that is, the three-character string "2.5" can become the quantity two and a half, and vice versa. Then, especially if you started using C<perl -w> early on, you learned about the B<undefined value>, which can turn into 0 if you treat it as a number, or the empty-string if you treat it as a string. =over Footnote: You may I<also> have been learning about references, in which case you're ready to hear that object values are just a kind of reference, except that they reflect the class that created thing they point to, instead of merely being a plain old array reference, hash reference, etc. I<If> this makes makes sense to you, and you want to know more about how objects are implemented in Perl, have a look at the C<perltoot> man page. =back And now you're learning about B<object values>. An object value is a value that points to a data structure somewhere in memory, which is where all the attributes for this object are stored. That data structure as a whole belongs to a class (probably the one you named in the constructor method, like ClassName->new), so that the object value can be used as part of object method calls. If you want to actually I<see> what an object value is, you might try just saying "print $object". That'll get you something like this: Net::FTP=GLOB(0x20154240) or Business::US_Amort=HASH(0x15424020) That's not very helpful if you wanted to really get at the object's insides, but that's because the object value is only a symbol for the object. This may all sound very abstruse and metaphysical, so a real-world allegory might be very helpful: =over You get an advertisement in the mail saying that you have been (im)personally selected to have the rare privilege of applying for a credit card. For whatever reason, I<this> offer sounds good to you, so you fill out the form and mail it back to the credit card company. They gleefully approve the application and create your account, and send you a card with a number on it. Now, you can do things with the number on that card -- clerks at stores can ring up things you want to buy, and charge your account by keying in the number on the card. You can pay for things you order online by punching in the card number as part of your online order. You can pay off part of the account by sending the credit card people some of your money (well, a check) with some note (usually the pre-printed slip) that has the card number for the account you want to pay toward. And you should be able to call the credit card company's computer and ask it things about the card, like its balance, its credit limit, its APR, and maybe an itemization of recent purchases ad payments. Now, what you're I<really> doing is manipulating a credit card I<account>, a completely abstract entity with some data attached to it (balance, APR, etc). But for ease of access, you have a credit card I<number> that is a symbol for that account. Now, that symbol is just a bunch of digits, and the number is effectively meaningless and useless in and of itself -- but in the appropriate context, it's understood to I<mean> the credit card account you're accessing. =back This is exactly the relationship between objects and object values, and from this analogy, several facts about object values are a bit more explicable: * An object value does nothing in and of itself, but it's useful when you use it in the context of an $object->method call, the same way that a card number is useful in the context of some operation dealing with a card account. Moreover, several copies of the same object value all refer to the same object, the same way that making several copies of your card number won't change the fact that they all still refer to the same single account (this is true whether you're "copying" the number by just writing it down on different slips of paper, or whether you go to the trouble of forging exact replicas of your own plastic credit card). That's why this: $x = Net::FTP->new("ftp.aol.com"); $x->login("sburke", "aoeuaoeu"); does the same thing as this: $x = Net::FTP->new("ftp.aol.com"); $y = $x; $z = $y; $z->login("sburke", "aoeuaoeu"); That is, $z and $y and $x are three different I<slots> for values, but what's in those slots are all object values pointing to the same object -- you don't have three different FTP connections, just three variables with values pointing to the some single FTP connection. * You can't tell much of anything about the object just by looking at the object value, any more than you can see your credit account balance by holding the plastic card up to the light, or by adding up the digits in your credit card number. * You can't just make up your own object values and have them work -- they can come only from constructor methods of the appropriate class. Similarly, you get a credit card number I<only> by having a bank approve your application for a credit card account -- at which point I<they> let I<you> know what the number of your new card is. Now, there's even more to the fact that you can't just make up your own object value: even though you can print an object value and get a string like "Net::FTP=GLOB(0x20154240)", that string is just a I<representation> of an object value. Internally, an object value has a basically different type from a string, or a number, or the undefined value -- if $x holds a real string, then that value's slot in memory says "this is a value of type I<string>, and its characters are...", whereas if it's an object value, the value's slot in memory says, "this is a value of type I<reference>, and the location in memory that it points to is..." (and by looking at what's at that location, Perl can tell the class of what's there). Perl programmers typically don't have to think about all these details of Perl's internals. Many other languages force you to be more conscious of the differences between all of these (and also between types of numbers, which are stored differently depending on their size and whether they have fractional parts). But Perl does its best to hide the different types of scalars from you -- it turns numbers into strings and back as needed, and takes the string or number representation of undef or of object values as needed. However, you can't go from a string representation of an object value, back to an object value. And that's why this doesn't work: $x = Net::FTP->new('ftp.aol.com'); $y = Net::FTP->new('ftp.netcom.com'); $z = Net::FTP->new('ftp.qualcomm.com'); $all = join(' ', $x,$y,$z); # !!! ...later... ($aol, $netcom, $qualcomm) = split(' ', $all); # !!! $aol->login("sburke", "aoeuaoeu"); $netcom->login("sburke", "qjkxqjkx"); $qualcomm->login("smb", "dhtndhtn"); This fails because $aol ends up holding merely the B<string representation> of the object value from $x, not the object value itself -- when C<join> tried to join the characters of the "strings" $x, $y, and $z, Perl saw that they weren't strings at all, so it gave C<join> their string representations. Unfortunately, this distinction between object values and their string representations doesn't really fit into the analogy of credit card numbers, because credit card numbers really I<are> numbers -- even thought they don't express any meaningful quantity, if you stored them in a database as a quantity (as opposed to just an ASCII string), that wouldn't stop them from being valid as credit card numbers. This may seem rather academic, but there's there's two common mistakes programmers new to objects often make, which make sense only in terms of the distinction between object values and their string representations: The first common error involves forgetting (or never having known in the first place) that when you go to use a value as a hash key, Perl uses the string representation of that value. When you want to use the numeric value two and a half as a key, Perl turns it into the three-character string "2.5". But if you then want to use that string as a number, Perl will treat it as meaning two and a half, so you're usually none the wiser that Perl converted the number to a string and back. But recall that Perl can't turn strings back into objects -- so if you tried to use a Net::FTP object value as a hash key, Perl actually used its string representation, like "Net::FTP=GLOB(0x20154240)", but that string is unusable as an object value. (Incidentally, there's a module Tie::RefHash that implements hashes that I<do> let you use real object-values as keys.) The second common error with object values is in trying to save an object value to disk (whether printing it to a file, or storing it in a conventional database file). All you'll get is the string, which will be useless. When you want to save an object and restore it later, you may find that the object's class already provides a method specifically for this. For example, MIDI::Opus provides methods for writing an object to disk as a standard MIDI file. The file can later be read back into memory by a MIDI::Opus constructor method, which will return a new MIDI::Opus object representing whatever file you tell it to read into memory. Similar methods are available with, for example, classes that manipulate graphic images and can save them to files, which can be read back later. But some classes, like Business::US_Amort, provide no such methods for storing an object in a file. When this is the case, you can try using any of the Data::Dumper, Storable, or FreezeThaw modules. Using these will be unproblematic for objects of most classes, but it may run into limitations with others. For example, a Business::US_Amort object can be turned into a string with Data::Dumper, and that string written to a file. When it's restored later, its attributes will be accessible as normal. But in the unlikely case that the loan object was saved in mid-calculation, the calculation may not be resumable. This is because of the way that that I<particular> class does its calculations, but similar limitations may occur with objects from other classes. But often, even I<wanting> to save an object is basically wrong -- what would saving an ftp I<session> even mean? Saving the hostname, username, and password? current directory on both machines? the local TCP/IP port number? In the case of "saving" a Net::FTP object, you're better off just saving whatever details you actually need for your own purposes, so that you can make a new object later and just set those values for it. =head2 So Why Do Some Modules Use Objects? All these details of using objects are definitely enough to make you wonder -- is it worth the bother? If you're a module author, writing your module with an object-oriented interface restricts the audience of potential users to those who understand the basic concepts of objects and object values, as well as Perl's syntax for calling methods. Why complicate things by having an object-oriented interface? A somewhat esoteric answer is that a module has an object-oriented interface because the module's insides are written in an object-oriented style. This article is about the basics of object-oriented I<interfaces>, and it'd be going far afield to explain what object-oriented I<design> is. But the short story is that object-oriented design is just one way of attacking messy problems. It's a way that many programmers find very helpful (and which others happen to find to be far more of a hassle than it's worth, incidentally), and it just happens to show up for you, the module user, as merely the style of interface. But a simpler answer is that a functional interface is sometimes a hindrance, because it limits the number of things you can do at once -- limiting it, in fact, to one. For many problems that some modules are meant to solve, doing without an object-oriented interface would be like wishing that Perl didn't use filehandles. The ideas are rather simpler -- just imagine that Perl let you access files, but I<only> one at a time, with code like: open("foo.txt") || die "Can't open foo.txt: $!"; while(readline) { print $_ if /bar/; } close; That hypothetical kind of Perl would be simpler, by doing without filehandles. But you'd be out of luck if you wanted to read from one file while reading from another, or read from two and print to a third. In the same way, a functional FTP module would be fine for just uploading files to one server at a time, but it wouldn't allow you to easily write programs that make need to use I<several> simultaneous sessions (like "look at server A and server B, and if A has a file called X.dat, then download it locally and then upload it to server B -- except if B has a file called Y.dat, in which case do it the other way around"). Some kinds of problems that modules solve just lend themselves to an object-oriented interface. For those kinds of tasks, a functional interface would be more familiar, but less powerful. Learning to use object-oriented modules' interfaces does require becoming comfortable with the concepts from this article. But in the end it will allow you to use a broader range of modules and, with them, to write programs that can do more. B<[end body of article]> =head2 [Author Credit] Sean M. Burke has contributed several modules to CPAN, about half of them object-oriented. [The next section should be in a greybox:] =head2 The Gory Details For sake of clarity of explanation, I had to oversimplify some of the facts about objects. Here's a few of the gorier details: * Every example I gave of a constructor was a class method. But object methods can be constructors, too, if the class was written to work that way: $new = $old->copy, $node_y = $node_x->new_subnode, or the like. * I've given the impression that there's two kinds of methods: object methods and class methods. In fact, the same method can be both, because it's not the kind of method it is, but the kind of calls it's written to accept -- calls that pass an object, or calls that pass a class-name. * The term "object value" isn't something you'll find used much anywhere else. It's just my shorthand for what would properly be called an "object reference" or "reference to a blessed item". In fact, people usually say "object" when they properly mean a reference to that object. * I mentioned creating objects with I<con>structors, but I didn't mention destroying them with I<de>structor -- a destructor is a kind of method that you call to tidy up the object once you're done with it, and want it to neatly go away (close connections, delete temporary files, free up memory, etc). But because of the way Perl handles memory, most modules won't require the user to know about destructors. * I said that class method syntax has to have the class name, as in $session = B<Net::FTP>->new($host). Actually, you can instead use any expression that returns a class name: $ftp_class = 'Net::FTP'; $session = B<$ftp_class>->new($host). Moreover, instead of the method name for object- or class-method calls, you can use a scalar holding the method name: $foo->B<$method>($host). But, in practice, these syntaxes are rarely useful. And finally, to learn about objects from the perspective of writing your own classes, see the C<perltoot> documentation, or Damian Conway's exhaustive and clear book I<Object Oriented Perl> (Manning Publications 1999, ISBN 1-884777-79-1). =head1 BACK Return to the L<HTML::Tree|HTML::Tree> docs. =cut Functions.pod 0000644 00000166160 00000000000 0007173 0 ustar 00 =head1 NAME CGI::HTML::Functions - Documentation for CGI.pm Legacy HTML Functionality =head1 SYNOPSIS Nothing here - please do not use this functionality, it is considered to be legacy and essentially deprecated. This documentation exists solely to aid in maintenance and migration of legacy code using this functionality and you are strongly encouraged to migrate away from it. If you are working on new code you should be using a template engine. For more information see L<CGI::Alternatives>. If you really want to continue using the HTML generation functionality of CGI.pm then you should take a look at L<HTML::Tiny> instead, which may give you a migration path away from CGI.pm's html generation functions; i strongly encourage you to move towards template driven page generation for anything involving markup as it will make porting your app to other frameworks much easier in the long run. =head1 DESCRIPTION The documentation here should be considered an addendum to the sections in the L<CGI> documentation - the sections here are named the same as those within the CGI perldoc. =head1 Calling CGI.pm routines HTML tag functions have both attributes (the attribute="value" pairs within the tag itself) and contents (the part between the opening and closing pairs). To distinguish between attributes and contents, CGI.pm uses the convention of passing HTML attributes as a hash reference as the first argument, and the contents, if any, as any subsequent arguments. It works out like this: Code Generated HTML ---- -------------- h1() <h1 /> h1('some','contents'); <h1>some contents</h1> h1({-align=>left}); <h1 align="LEFT"> h1({-align=>left},'contents'); <h1 align="LEFT">contents</h1> Many newcomers to CGI.pm are puzzled by the difference between the calling conventions for the HTML shortcuts, which require curly braces around the HTML tag attributes, and the calling conventions for other routines, which manage to generate attributes without the curly brackets. Don't be confused. As a convenience the curly braces are optional in all but the HTML shortcuts. If you like, you can use curly braces when calling any routine that takes named arguments. For example: print $q->header( { -type => 'image/gif', -expires => '+3d' } ); If you use warnings, you will be warned that some CGI.pm argument names conflict with built-in perl functions. The most frequent of these is the -values argument, used to create multi-valued menus, radio button clusters and the like. To get around this warning, you have several choices: =over 4 =item 1. Use another name for the argument, if one is available. For example, -value is an alias for -values. =item 2. Change the capitalization, e.g. -Values =item 3. Put quotes around the argument name, e.g. '-values' =back =head2 Function-oriented interface HTML exports Here is a list of the HTML related function sets you can import: =over 4 =item B<:form> Import all fill-out form generating methods, such as B<textfield()>. =item B<:html2> Import all methods that generate HTML 2.0 standard elements. =item B<:html3> Import all methods that generate HTML 3.0 elements (such as <table>, <super> and <sub>). =item B<:html4> Import all methods that generate HTML 4 elements (such as <abbrev>, <acronym> and <thead>). =item B<:netscape> Import the <blink>, <fontsize> and <center> tags. =item B<:html> Import all HTML-generating shortcuts (i.e. 'html2', 'html3', 'html4' and 'netscape') =item B<:standard> Import "standard" features, 'html2', 'html3', 'html4', 'ssl', 'form' and 'cgi'. =back If you import any of the state-maintaining CGI or form-generating methods, a default CGI object will be created and initialized automatically the first time you use any of the methods that require one to be present. This includes B<param()>, B<textfield()>, B<submit()> and the like. (If you need direct access to the CGI object, you can find it in the global variable B<$CGI::Q>). =head2 Pragmas Additional HTML generation related pragms: =over 4 =item -nosticky By default the CGI module implements a state-preserving behavior called "sticky" fields. The way this works is that if you are regenerating a form, the methods that generate the form field values will interrogate param() to see if similarly-named parameters are present in the query string. If they find a like-named parameter, they will use it to set their default values. Sometimes this isn't what you want. The B<-nosticky> pragma prevents this behavior. You can also selectively change the sticky behavior in each element that you generate. =item -tabindex Automatically add tab index attributes to each form field. With this option turned off, you can still add tab indexes manually by passing a -tabindex option to each field-generating method. =item -no_xhtml By default, CGI.pm versions 2.69 and higher emit XHTML (http://www.w3.org/TR/xhtml1/). The -no_xhtml pragma disables this feature. Thanks to Michalis Kabrianis <kabrianis@hellug.gr> for this feature. If start_html()'s -dtd parameter specifies an HTML 2.0, 3.2, 4.0 or 4.01 DTD, XHTML will automatically be disabled without needing to use this pragma. =back =head2 Special forms for importing HTML-tag functions Many of the methods generate HTML tags. As described below, tag functions automatically generate both the opening and closing tags. For example: print h1('Level 1 Header'); produces <h1>Level 1 Header</h1> There will be some times when you want to produce the start and end tags yourself. In this case, you can use the form start_I<tag_name> and end_I<tag_name>, as in: print start_h1,'Level 1 Header',end_h1; =head2 Creating the HTML document header print start_html( -title => 'Secrets of the Pyramids', -author => 'fred@capricorn.org', -base => 'true', -target => '_blank', -meta => {'keywords'=>'pharaoh secret mummy', 'copyright' => 'copyright 1996 King Tut'}, -style => {'src'=>'/styles/style1.css'}, -BGCOLOR => 'blue' ); The start_html() routine creates the top of the page, along with a lot of optional information that controls the page's appearance and behavior. This method returns a canned HTML header and the opening <body> tag. All parameters are optional. In the named parameter form, recognized parameters are -title, -author, -base, -xbase, -dtd, -lang and -target (see below for the explanation). Any additional parameters you provide, such as the unofficial BGCOLOR attribute, are added to the <body> tag. Additional parameters must be proceeded by a hyphen. The argument B<-xbase> allows you to provide an HREF for the <base> tag different from the current location, as in -xbase => "http://home.mcom.com/" All relative links will be interpreted relative to this tag. The argument B<-target> allows you to provide a default target frame for all the links and fill-out forms on the page. B<This is a non-standard HTTP feature> B<which only works with some browsers!> -target => "answer_window" All relative links will be interpreted relative to this tag. You add arbitrary meta information to the header with the B<-meta> argument. This argument expects a reference to a hash containing name/value pairs of meta information. These will be turned into a series of header <meta> tags that look something like this: <meta name="keywords" content="pharaoh secret mummy"> <meta name="description" content="copyright 1996 King Tut"> To create an HTTP-EQUIV type of <meta> tag, use B<-head>, described below. The B<-style> argument is used to incorporate cascading stylesheets into your code. See the section on CASCADING STYLESHEETS for more information. The B<-lang> argument is used to incorporate a language attribute into the <html> tag. For example: print $q->start_html( -lang => 'fr-CA' ); The default if not specified is "en-US" for US English, unless the -dtd parameter specifies an HTML 2.0 or 3.2 DTD, in which case the lang attribute is left off. You can force the lang attribute to left off in other cases by passing an empty string (-lang=>''). The B<-encoding> argument can be used to specify the character set for XHTML. It defaults to iso-8859-1 if not specified. The B<-dtd> argument can be used to specify a public DTD identifier string. For example: -dtd => '-//W3C//DTD HTML 4.01 Transitional//EN') Alternatively, it can take public and system DTD identifiers as an array: -dtd => [ '-//W3C//DTD HTML 4.01 Transitional//EN', 'http://www.w3.org/TR/html4/loose.dtd' ] For the public DTD identifier to be considered, it must be valid. Otherwise it will be replaced by the default DTD. If the public DTD contains 'XHTML', CGI.pm will emit XML. The B<-declare_xml> argument, when used in conjunction with XHTML, will put a <?xml> declaration at the top of the HTML header. The sole purpose of this declaration is to declare the character set encoding. In the absence of -declare_xml, the output HTML will contain a <meta> tag that specifies the encoding, allowing the HTML to pass most validators. The default for -declare_xml is false. You can place other arbitrary HTML elements to the <head> section with the B<-head> tag. For example, to place a <link> element in the head section, use this: print start_html( -head => Link({ -rel => 'shortcut icon', -href => 'favicon.ico' }) ); To incorporate multiple HTML elements into the <head> section, just pass an array reference: print start_html( -head => [ Link({ -rel => 'next', -href => 'http://www.capricorn.com/s2.html' }), Link({ -rel => 'previous', -href => 'http://www.capricorn.com/s1.html' }) ] ); And here's how to create an HTTP-EQUIV <meta> tag: print start_html( -head => meta({ -http_equiv => 'Content-Type', -content => 'text/html' }) ); JAVASCRIPTING: The B<-script>, B<-noScript>, B<-onLoad>, B<-onMouseOver>, B<-onMouseOut> and B<-onUnload> parameters are used to add JavaScript calls to your pages. B<-script> should point to a block of text containing JavaScript function definitions. This block will be placed within a <script> block inside the HTML (not HTTP) header. The block is placed in the header in order to give your page a fighting chance of having all its JavaScript functions in place even if the user presses the stop button before the page has loaded completely. CGI.pm attempts to format the script in such a way that JavaScript-naive browsers will not choke on the code: unfortunately there are some browsers that get confused by it nevertheless. The B<-onLoad> and B<-onUnload> parameters point to fragments of JavaScript code to execute when the page is respectively opened and closed by the browser. Usually these parameters are calls to functions defined in the B<-script> field: $q = CGI->new; print header; $JSCRIPT = <<END; // Ask a silly question function riddle_me_this() { var r = prompt( "What walks on four legs in the morning, " + "two legs in the afternoon, " + "and three legs in the evening?" ); response(r); } // Get a silly answer function response(answer) { if (answer == "man") alert("Right you are!"); else alert("Wrong! Guess again."); } END print start_html( -title => 'The Riddle of the Sphinx', -script => $JSCRIPT ); Use the B<-noScript> parameter to pass some HTML text that will be displayed on browsers that do not have JavaScript (or browsers where JavaScript is turned off). The <script> tag, has several attributes including "type", "charset" and "src". "src" allows you to keep JavaScript code in an external file. To use these attributes pass a HASH reference in the B<-script> parameter containing one or more of -type, -src, or -code: print $q->start_html( -title => 'The Riddle of the Sphinx', -script => { -type => 'JAVASCRIPT', -src => '/javascript/sphinx.js'} ); print $q->( -title => 'The Riddle of the Sphinx', -script => { -type => 'PERLSCRIPT', -code => 'print "hello world!\n;"' } ); A final feature allows you to incorporate multiple <script> sections into the header. Just pass the list of script sections as an array reference. This allows you to specify different source files for different dialects of JavaScript. Example: print $q->start_html( -title => 'The Riddle of the Sphinx', -script => [ { -type => 'text/javascript', -src => '/javascript/utilities10.js' }, { -type => 'text/javascript', -src => '/javascript/utilities11.js' }, { -type => 'text/jscript', -src => '/javascript/utilities12.js' }, { -type => 'text/ecmascript', -src => '/javascript/utilities219.js' } ] ); The option "-language" is a synonym for -type, and is supported for backwards compatibility. The old-style positional parameters are as follows: B<Parameters:> =over 4 =item 1. The title =item 2. The author's e-mail address (will create a <link rev="MADE"> tag if present =item 3. A 'true' flag if you want to include a <base> tag in the header. This helps resolve relative addresses to absolute ones when the document is moved, but makes the document hierarchy non-portable. Use with care! =back Other parameters you want to include in the <body> tag may be appended to these. This is a good place to put HTML extensions, such as colors and wallpaper patterns. =head2 Ending the Html document: print $q->end_html; This ends an HTML document by printing the </body></html> tags. =head1 CREATING STANDARD HTML ELEMENTS: CGI.pm defines general HTML shortcut methods for many HTML tags. HTML shortcuts are named after a single HTML element and return a fragment of HTML text. Example: print $q->blockquote( "Many years ago on the island of", $q->a({href=>"http://crete.org/"},"Crete"), "there lived a Minotaur named", $q->strong("Fred."), ), $q->hr; This results in the following HTML code (extra newlines have been added for readability): <blockquote> Many years ago on the island of <a href="http://crete.org/">Crete</a> there lived a minotaur named <strong>Fred.</strong> </blockquote> <hr> If you find the syntax for calling the HTML shortcuts awkward, you can import them into your namespace and dispense with the object syntax completely (see the next section for more details): use CGI ':standard'; print blockquote( "Many years ago on the island of", a({href=>"http://crete.org/"},"Crete"), "there lived a minotaur named", strong("Fred."), ), hr; =head2 Providing arguments to HTML shortcuts The HTML methods will accept zero, one or multiple arguments. If you provide no arguments, you get a single tag: print hr; # <hr> If you provide one or more string arguments, they are concatenated together with spaces and placed between opening and closing tags: print h1("Chapter","1"); # <h1>Chapter 1</h1>" If the first argument is a hash reference, then the keys and values of the hash become the HTML tag's attributes: print a({-href=>'fred.html',-target=>'_new'}, "Open a new frame"); <a href="fred.html",target="_new">Open a new frame</a> You may dispense with the dashes in front of the attribute names if you prefer: print img {src=>'fred.gif',align=>'LEFT'}; <img align="LEFT" src="fred.gif"> Sometimes an HTML tag attribute has no argument. For example, ordered lists can be marked as COMPACT. The syntax for this is an argument that that points to an undef string: print ol({compact=>undef},li('one'),li('two'),li('three')); Prior to CGI.pm version 2.41, providing an empty ('') string as an attribute argument was the same as providing undef. However, this has changed in order to accommodate those who want to create tags of the form <img alt="">. The difference is shown in these two pieces of code: CODE RESULT img({alt=>undef}) <img alt> img({alt=>''}) <img alt=""> =head2 The distributive property of HTML shortcuts One of the cool features of the HTML shortcuts is that they are distributive. If you give them an argument consisting of a B<reference> to a list, the tag will be distributed across each element of the list. For example, here's one way to make an ordered list: print ul( li({-type=>'disc'},['Sneezy','Doc','Sleepy','Happy']) ); This example will result in HTML output that looks like this: <ul> <li type="disc">Sneezy</li> <li type="disc">Doc</li> <li type="disc">Sleepy</li> <li type="disc">Happy</li> </ul> This is extremely useful for creating tables. For example: print table({-border=>undef}, caption('When Should You Eat Your Vegetables?'), Tr({-align=>'CENTER',-valign=>'TOP'}, [ th(['Vegetable', 'Breakfast','Lunch','Dinner']), td(['Tomatoes' , 'no', 'yes', 'yes']), td(['Broccoli' , 'no', 'no', 'yes']), td(['Onions' , 'yes','yes', 'yes']) ] ) ); =head2 HTML shortcuts and list interpolation Consider this bit of code: print blockquote(em('Hi'),'mom!')); It will ordinarily return the string that you probably expect, namely: <blockquote><em>Hi</em> mom!</blockquote> Note the space between the element "Hi" and the element "mom!". CGI.pm puts the extra space there using array interpolation, which is controlled by the magic $" variable. Sometimes this extra space is not what you want, for example, when you are trying to align a series of images. In this case, you can simply change the value of $" to an empty string. { local($") = ''; print blockquote(em('Hi'),'mom!')); } I suggest you put the code in a block as shown here. Otherwise the change to $" will affect all subsequent code until you explicitly reset it. =head2 Non-standard HTML shortcuts A few HTML tags don't follow the standard pattern for various reasons. B<comment()> generates an HTML comment (<!-- comment -->). Call it like print comment('here is my comment'); Because of conflicts with built-in perl functions, the following functions begin with initial caps: Select Tr Link Delete Accept Sub In addition, start_html(), end_html(), start_form(), end_form(), start_multipart_form() and all the fill-out form tags are special. See their respective sections. =head2 Autoescaping HTML By default, all HTML that is emitted by the form-generating functions is passed through a function called escapeHTML(): =over 4 =item $escaped_string = escapeHTML("unescaped string"); Escape HTML formatting characters in a string. Internally this calls L<HTML::Entities> (encode_entities) so really you should just use that instead - the default list of chars that will be encoded (passed to the HTML::Entities encode_entities method) is: & < > " \x8b \x9b ' you can control this list by setting the value of $CGI::ENCODE_ENTITIES: # only encode < > $CGI::ENCODE_ENTITIES = q{<>} if you want to encode B<all> entities then undef $CGI::ENCODE_ENTITIES: # encode all entities $CGI::ENCODE_ENTITIES = undef; =back The automatic escaping does not apply to other shortcuts, such as h1(). You should call escapeHTML() yourself on untrusted data in order to protect your pages against nasty tricks that people may enter into guestbooks, etc.. To change the character set, use charset(). To turn autoescaping off completely, use autoEscape(0): =over 4 =item $charset = charset([$charset]); Get or set the current character set. =item $flag = autoEscape([$flag]); Get or set the value of the autoescape flag. =back =head1 CREATING FILL-OUT FORMS: I<General note> The various form-creating methods all return strings to the caller, containing the tag or tags that will create the requested form element. You are responsible for actually printing out these strings. It's set up this way so that you can place formatting tags around the form elements. I<Another note> The default values that you specify for the forms are only used the B<first> time the script is invoked (when there is no query string). On subsequent invocations of the script (when there is a query string), the former values are used even if they are blank. If you want to change the value of a field from its previous value, you have two choices: (1) call the param() method to set it. (2) use the -override (alias -force) parameter (a new feature in version 2.15). This forces the default value to be used, regardless of the previous value: print textfield(-name=>'field_name', -default=>'starting value', -override=>1, -size=>50, -maxlength=>80); I<Yet another note> By default, the text and labels of form elements are escaped according to HTML rules. This means that you can safely use "<CLICK ME>" as the label for a button. However, it also interferes with your ability to incorporate special HTML character sequences, such as Á, into your fields. If you wish to turn off automatic escaping, call the autoEscape() method with a false value immediately after creating the CGI object: $q = CGI->new; $q->autoEscape(0); Note that autoEscape() is exclusively used to effect the behavior of how some CGI.pm HTML generation functions handle escaping. Calling escapeHTML() explicitly will always escape the HTML. I<A Lurking Trap!> Some of the form-element generating methods return multiple tags. In a scalar context, the tags will be concatenated together with spaces, or whatever is the current value of the $" global. In a list context, the methods will return a list of elements, allowing you to modify them if you wish. Usually you will not notice this behavior, but beware of this: printf("%s\n",end_form()) end_form() produces several tags, and only the first of them will be printed because the format only expects one value. <p> =head2 Creating an isindex tag print isindex(-action=>$action); -or- print isindex($action); Prints out an <isindex> tag. Not very exciting. The parameter -action specifies the URL of the script to process the query. The default is to process the query with the current script. =head2 Starting and ending a form print start_form(-method=>$method, -action=>$action, -enctype=>$encoding); <... various form stuff ...> print end_form; -or- print start_form($method,$action,$encoding); <... various form stuff ...> print end_form; start_form() will return a <form> tag with the optional method, action and form encoding that you specify. The defaults are: method: POST action: this script enctype: application/x-www-form-urlencoded for non-XHTML multipart/form-data for XHTML, see multipart/form-data below. end_form() returns the closing </form> tag. start_form()'s enctype argument tells the browser how to package the various fields of the form before sending the form to the server. Two values are possible: =over 4 =item B<application/x-www-form-urlencoded> This is the older type of encoding. It is compatible with many CGI scripts and is suitable for short fields containing text data. For your convenience, CGI.pm stores the name of this encoding type in B<&CGI::URL_ENCODED>. =item B<multipart/form-data> This is the newer type of encoding. It is suitable for forms that contain very large fields or that are intended for transferring binary data. Most importantly, it enables the "file upload" feature. For your convenience, CGI.pm stores the name of this encoding type in B<&CGI::MULTIPART> Forms that use this type of encoding are not easily interpreted by CGI scripts unless they use CGI.pm or another library designed to handle them. If XHTML is activated (the default), then forms will be automatically created using this type of encoding. =back The start_form() method uses the older form of encoding by default unless XHTML is requested. If you want to use the newer form of encoding by default, you can call B<start_multipart_form()> instead of B<start_form()>. The method B<end_multipart_form()> is an alias to B<end_form()>. JAVASCRIPTING: The B<-name> and B<-onSubmit> parameters are provided for use with JavaScript. The -name parameter gives the form a name so that it can be identified and manipulated by JavaScript functions. -onSubmit should point to a JavaScript function that will be executed just before the form is submitted to your server. You can use this opportunity to check the contents of the form for consistency and completeness. If you find something wrong, you can put up an alert box or maybe fix things up yourself. You can abort the submission by returning false from this function. Usually the bulk of JavaScript functions are defined in a <script> block in the HTML header and -onSubmit points to one of these function call. See start_html() for details. =head2 Form elements After starting a form, you will typically create one or more textfields, popup menus, radio groups and other form elements. Each of these elements takes a standard set of named arguments. Some elements also have optional arguments. The standard arguments are as follows: =over 4 =item B<-name> The name of the field. After submission this name can be used to retrieve the field's value using the param() method. =item B<-value>, B<-values> The initial value of the field which will be returned to the script after form submission. Some form elements, such as text fields, take a single scalar -value argument. Others, such as popup menus, take a reference to an array of values. The two arguments are synonyms. =item B<-tabindex> A numeric value that sets the order in which the form element receives focus when the user presses the tab key. Elements with lower values receive focus first. =item B<-id> A string identifier that can be used to identify this element to JavaScript and DHTML. =item B<-override> A boolean, which, if true, forces the element to take on the value specified by B<-value>, overriding the sticky behavior described earlier for the B<-nosticky> pragma. =item B<-onChange>, B<-onFocus>, B<-onBlur>, B<-onMouseOver>, B<-onMouseOut>, B<-onSelect> These are used to assign JavaScript event handlers. See the JavaScripting section for more details. =back Other common arguments are described in the next section. In addition to these, all attributes described in the HTML specifications are supported. =head2 Creating a text field print textfield(-name=>'field_name', -value=>'starting value', -size=>50, -maxlength=>80); -or- print textfield('field_name','starting value',50,80); textfield() will return a text input field. B<Parameters> =over 4 =item 1. The first parameter is the required name for the field (-name). =item 2. The optional second parameter is the default starting value for the field contents (-value, formerly known as -default). =item 3. The optional third parameter is the size of the field in characters (-size). =item 4. The optional fourth parameter is the maximum number of characters the field will accept (-maxlength). =back As with all these methods, the field will be initialized with its previous contents from earlier invocations of the script. When the form is processed, the value of the text field can be retrieved with: $value = param('foo'); If you want to reset it from its initial value after the script has been called once, you can do so like this: param('foo',"I'm taking over this value!"); =head2 Creating a big text field print textarea(-name=>'foo', -default=>'starting value', -rows=>10, -columns=>50); -or print textarea('foo','starting value',10,50); textarea() is just like textfield, but it allows you to specify rows and columns for a multiline text entry box. You can provide a starting value for the field, which can be long and contain multiple lines. =head2 Creating a password field print password_field(-name=>'secret', -value=>'starting value', -size=>50, -maxlength=>80); -or- print password_field('secret','starting value',50,80); password_field() is identical to textfield(), except that its contents will be starred out on the web page. =head2 Creating a file upload field print filefield(-name=>'uploaded_file', -default=>'starting value', -size=>50, -maxlength=>80); -or- print filefield('uploaded_file','starting value',50,80); filefield() will return a file upload field. In order to take full advantage of this I<you must use the new multipart encoding scheme> for the form. You can do this either by calling B<start_form()> with an encoding type of B<&CGI::MULTIPART>, or by calling the new method B<start_multipart_form()> instead of vanilla B<start_form()>. B<Parameters> =over 4 =item 1. The first parameter is the required name for the field (-name). =item 2. The optional second parameter is the starting value for the field contents to be used as the default file name (-default). For security reasons, browsers don't pay any attention to this field, and so the starting value will always be blank. Worse, the field loses its "sticky" behavior and forgets its previous contents. The starting value field is called for in the HTML specification, however, and possibly some browser will eventually provide support for it. =item 3. The optional third parameter is the size of the field in characters (-size). =item 4. The optional fourth parameter is the maximum number of characters the field will accept (-maxlength). =back JAVASCRIPTING: The B<-onChange>, B<-onFocus>, B<-onBlur>, B<-onMouseOver>, B<-onMouseOut> and B<-onSelect> parameters are recognized. See textfield() for details. =head2 Creating a popup menu print popup_menu('menu_name', ['eenie','meenie','minie'], 'meenie'); -or- %labels = ('eenie'=>'your first choice', 'meenie'=>'your second choice', 'minie'=>'your third choice'); %attributes = ('eenie'=>{'class'=>'class of first choice'}); print popup_menu('menu_name', ['eenie','meenie','minie'], 'meenie',\%labels,\%attributes); -or (named parameter style)- print popup_menu(-name=>'menu_name', -values=>['eenie','meenie','minie'], -default=>['meenie','minie'], -labels=>\%labels, -attributes=>\%attributes); popup_menu() creates a menu. Please note that the -multiple option will be ignored if passed - use scrolling_list() if you want to create a menu that supports multiple selections =over 4 =item 1. The required first argument is the menu's name (-name). =item 2. The required second argument (-values) is an array B<reference> containing the list of menu items in the menu. You can pass the method an anonymous array, as shown in the example, or a reference to a named array, such as "\@foo". =item 3. The optional third parameter (-default) is the name of the default menu choice. If not specified, the first item will be the default. The values of the previous choice will be maintained across queries. Pass an array reference to select multiple defaults. =item 4. The optional fourth parameter (-labels) is provided for people who want to use different values for the user-visible label inside the popup menu and the value returned to your script. It's a pointer to an hash relating menu values to user-visible labels. If you leave this parameter blank, the menu values will be displayed by default. (You can also leave a label undefined if you want to). =item 5. The optional fifth parameter (-attributes) is provided to assign any of the common HTML attributes to an individual menu item. It's a pointer to a hash relating menu values to another hash with the attribute's name as the key and the attribute's value as the value. =back When the form is processed, the selected value of the popup menu can be retrieved using: $popup_menu_value = param('menu_name'); =head2 Creating an option group Named parameter style print popup_menu(-name=>'menu_name', -values=>[qw/eenie meenie minie/, optgroup(-name=>'optgroup_name', -values => ['moe','catch'], -attributes=>{'catch'=>{'class'=>'red'}})], -labels=>{'eenie'=>'one', 'meenie'=>'two', 'minie'=>'three'}, -default=>'meenie'); Old style print popup_menu('menu_name', ['eenie','meenie','minie', optgroup('optgroup_name', ['moe', 'catch'], {'catch'=>{'class'=>'red'}})],'meenie', {'eenie'=>'one','meenie'=>'two','minie'=>'three'}); optgroup() creates an option group within a popup menu. =over 4 =item 1. The required first argument (B<-name>) is the label attribute of the optgroup and is B<not> inserted in the parameter list of the query. =item 2. The required second argument (B<-values>) is an array reference containing the list of menu items in the menu. You can pass the method an anonymous array, as shown in the example, or a reference to a named array, such as \@foo. If you pass a HASH reference, the keys will be used for the menu values, and the values will be used for the menu labels (see -labels below). =item 3. The optional third parameter (B<-labels>) allows you to pass a reference to a hash containing user-visible labels for one or more of the menu items. You can use this when you want the user to see one menu string, but have the browser return your program a different one. If you don't specify this, the value string will be used instead ("eenie", "meenie" and "minie" in this example). This is equivalent to using a hash reference for the -values parameter. =item 4. An optional fourth parameter (B<-labeled>) can be set to a true value and indicates that the values should be used as the label attribute for each option element within the optgroup. =item 5. An optional fifth parameter (-novals) can be set to a true value and indicates to suppress the val attribute in each option element within the optgroup. See the discussion on optgroup at W3C (http://www.w3.org/TR/REC-html40/interact/forms.html#edef-OPTGROUP) for details. =item 6. An optional sixth parameter (-attributes) is provided to assign any of the common HTML attributes to an individual menu item. It's a pointer to a hash relating menu values to another hash with the attribute's name as the key and the attribute's value as the value. =back =head2 Creating a scrolling list print scrolling_list('list_name', ['eenie','meenie','minie','moe'], ['eenie','moe'],5,'true',{'moe'=>{'class'=>'red'}}); -or- print scrolling_list('list_name', ['eenie','meenie','minie','moe'], ['eenie','moe'],5,'true', \%labels,%attributes); -or- print scrolling_list(-name=>'list_name', -values=>['eenie','meenie','minie','moe'], -default=>['eenie','moe'], -size=>5, -multiple=>'true', -labels=>\%labels, -attributes=>\%attributes); scrolling_list() creates a scrolling list. B<Parameters:> =over 4 =item 1. The first and second arguments are the list name (-name) and values (-values). As in the popup menu, the second argument should be an array reference. =item 2. The optional third argument (-default) can be either a reference to a list containing the values to be selected by default, or can be a single value to select. If this argument is missing or undefined, then nothing is selected when the list first appears. In the named parameter version, you can use the synonym "-defaults" for this parameter. =item 3. The optional fourth argument is the size of the list (-size). =item 4. The optional fifth argument can be set to true to allow multiple simultaneous selections (-multiple). Otherwise only one selection will be allowed at a time. =item 5. The optional sixth argument is a pointer to a hash containing long user-visible labels for the list items (-labels). If not provided, the values will be displayed. =item 6. The optional sixth parameter (-attributes) is provided to assign any of the common HTML attributes to an individual menu item. It's a pointer to a hash relating menu values to another hash with the attribute's name as the key and the attribute's value as the value. When this form is processed, all selected list items will be returned as a list under the parameter name 'list_name'. The values of the selected items can be retrieved with: @selected = param('list_name'); =back =head2 Creating a group of related checkboxes print checkbox_group(-name=>'group_name', -values=>['eenie','meenie','minie','moe'], -default=>['eenie','moe'], -linebreak=>'true', -disabled => ['moe'], -labels=>\%labels, -attributes=>\%attributes); print checkbox_group('group_name', ['eenie','meenie','minie','moe'], ['eenie','moe'],'true',\%labels, {'moe'=>{'class'=>'red'}}); HTML3-COMPATIBLE BROWSERS ONLY: print checkbox_group(-name=>'group_name', -values=>['eenie','meenie','minie','moe'], -rows=2,-columns=>2); checkbox_group() creates a list of checkboxes that are related by the same name. B<Parameters:> =over 4 =item 1. The first and second arguments are the checkbox name and values, respectively (-name and -values). As in the popup menu, the second argument should be an array reference. These values are used for the user-readable labels printed next to the checkboxes as well as for the values passed to your script in the query string. =item 2. The optional third argument (-default) can be either a reference to a list containing the values to be checked by default, or can be a single value to checked. If this argument is missing or undefined, then nothing is selected when the list first appears. =item 3. The optional fourth argument (-linebreak) can be set to true to place line breaks between the checkboxes so that they appear as a vertical list. Otherwise, they will be strung together on a horizontal line. =back The optional B<-labels> argument is a pointer to a hash relating the checkbox values to the user-visible labels that will be printed next to them. If not provided, the values will be used as the default. The optional parameters B<-rows>, and B<-columns> cause checkbox_group() to return an HTML3 compatible table containing the checkbox group formatted with the specified number of rows and columns. You can provide just the -columns parameter if you wish; checkbox_group will calculate the correct number of rows for you. The option B<-disabled> takes an array of checkbox values and disables them by greying them out (this may not be supported by all browsers). The optional B<-attributes> argument is provided to assign any of the common HTML attributes to an individual menu item. It's a pointer to a hash relating menu values to another hash with the attribute's name as the key and the attribute's value as the value. The optional B<-tabindex> argument can be used to control the order in which radio buttons receive focus when the user presses the tab button. If passed a scalar numeric value, the first element in the group will receive this tab index and subsequent elements will be incremented by one. If given a reference to an array of radio button values, then the indexes will be jiggered so that the order specified in the array will correspond to the tab order. You can also pass a reference to a hash in which the hash keys are the radio button values and the values are the tab indexes of each button. Examples: -tabindex => 100 # this group starts at index 100 and counts up -tabindex => ['moe','minie','eenie','meenie'] # tab in this order -tabindex => {meenie=>100,moe=>101,minie=>102,eenie=>200} # tab in this order The optional B<-labelattributes> argument will contain attributes attached to the <label> element that surrounds each button. When the form is processed, all checked boxes will be returned as a list under the parameter name 'group_name'. The values of the "on" checkboxes can be retrieved with: @turned_on = param('group_name'); The value returned by checkbox_group() is actually an array of button elements. You can capture them and use them within tables, lists, or in other creative ways: @h = checkbox_group(-name=>'group_name',-values=>\@values); &use_in_creative_way(@h); =head2 Creating a standalone checkbox print checkbox(-name=>'checkbox_name', -checked=>1, -value=>'ON', -label=>'CLICK ME'); -or- print checkbox('checkbox_name','checked','ON','CLICK ME'); checkbox() is used to create an isolated checkbox that isn't logically related to any others. B<Parameters:> =over 4 =item 1. The first parameter is the required name for the checkbox (-name). It will also be used for the user-readable label printed next to the checkbox. =item 2. The optional second parameter (-checked) specifies that the checkbox is turned on by default. Synonyms are -selected and -on. =item 3. The optional third parameter (-value) specifies the value of the checkbox when it is checked. If not provided, the word "on" is assumed. =item 4. The optional fourth parameter (-label) is the user-readable label to be attached to the checkbox. If not provided, the checkbox name is used. =back The value of the checkbox can be retrieved using: $turned_on = param('checkbox_name'); =head2 Creating a radio button group print radio_group(-name=>'group_name', -values=>['eenie','meenie','minie'], -default=>'meenie', -linebreak=>'true', -labels=>\%labels, -attributes=>\%attributes); -or- print radio_group('group_name',['eenie','meenie','minie'], 'meenie','true',\%labels,\%attributes); HTML3-COMPATIBLE BROWSERS ONLY: print radio_group(-name=>'group_name', -values=>['eenie','meenie','minie','moe'], -rows=2,-columns=>2); radio_group() creates a set of logically-related radio buttons (turning one member of the group on turns the others off) B<Parameters:> =over 4 =item 1. The first argument is the name of the group and is required (-name). =item 2. The second argument (-values) is the list of values for the radio buttons. The values and the labels that appear on the page are identical. Pass an array I<reference> in the second argument, either using an anonymous array, as shown, or by referencing a named array as in "\@foo". =item 3. The optional third parameter (-default) is the name of the default button to turn on. If not specified, the first item will be the default. You can provide a nonexistent button name, such as "-" to start up with no buttons selected. =item 4. The optional fourth parameter (-linebreak) can be set to 'true' to put line breaks between the buttons, creating a vertical list. =item 5. The optional fifth parameter (-labels) is a pointer to an associative array relating the radio button values to user-visible labels to be used in the display. If not provided, the values themselves are displayed. =back All modern browsers can take advantage of the optional parameters B<-rows>, and B<-columns>. These parameters cause radio_group() to return an HTML3 compatible table containing the radio group formatted with the specified number of rows and columns. You can provide just the -columns parameter if you wish; radio_group will calculate the correct number of rows for you. To include row and column headings in the returned table, you can use the B<-rowheaders> and B<-colheaders> parameters. Both of these accept a pointer to an array of headings to use. The headings are just decorative. They don't reorganize the interpretation of the radio buttons -- they're still a single named unit. The optional B<-tabindex> argument can be used to control the order in which radio buttons receive focus when the user presses the tab button. If passed a scalar numeric value, the first element in the group will receive this tab index and subsequent elements will be incremented by one. If given a reference to an array of radio button values, then the indexes will be jiggered so that the order specified in the array will correspond to the tab order. You can also pass a reference to a hash in which the hash keys are the radio button values and the values are the tab indexes of each button. Examples: -tabindex => 100 # this group starts at index 100 and counts up -tabindex => ['moe','minie','eenie','meenie'] # tab in this order -tabindex => {meenie=>100,moe=>101,minie=>102,eenie=>200} # tab in this order The optional B<-attributes> argument is provided to assign any of the common HTML attributes to an individual menu item. It's a pointer to a hash relating menu values to another hash with the attribute's name as the key and the attribute's value as the value. The optional B<-labelattributes> argument will contain attributes attached to the <label> element that surrounds each button. When the form is processed, the selected radio button can be retrieved using: $which_radio_button = param('group_name'); The value returned by radio_group() is actually an array of button elements. You can capture them and use them within tables, lists, or in other creative ways: @h = radio_group(-name=>'group_name',-values=>\@values); &use_in_creative_way(@h); =head2 Creating a submit button print submit(-name=>'button_name', -value=>'value'); -or- print submit('button_name','value'); submit() will create the query submission button. Every form should have one of these. B<Parameters:> =over 4 =item 1. The first argument (-name) is optional. You can give the button a name if you have several submission buttons in your form and you want to distinguish between them. =item 2. The second argument (-value) is also optional. This gives the button a value that will be passed to your script in the query string. The name will also be used as the user-visible label. =item 3. You can use -label as an alias for -value. I always get confused about which of -name and -value changes the user-visible label on the button. =back You can figure out which button was pressed by using different values for each one: $which_one = param('button_name'); =head2 Creating a reset button print reset reset() creates the "reset" button. Note that it restores the form to its value from the last time the script was called, NOT necessarily to the defaults. Note that this conflicts with the perl reset() built-in. Use CORE::reset() to get the original reset function. =head2 Creating a default button print defaults('button_label') defaults() creates a button that, when invoked, will cause the form to be completely reset to its defaults, wiping out all the changes the user ever made. =head2 Creating a hidden field print hidden(-name=>'hidden_name', -default=>['value1','value2'...]); -or- print hidden('hidden_name','value1','value2'...); hidden() produces a text field that can't be seen by the user. It is useful for passing state variable information from one invocation of the script to the next. B<Parameters:> =over 4 =item 1. The first argument is required and specifies the name of this field (-name). =item 2. The second argument is also required and specifies its value (-default). In the named parameter style of calling, you can provide a single value here or a reference to a whole list =back Fetch the value of a hidden field this way: $hidden_value = param('hidden_name'); Note, that just like all the other form elements, the value of a hidden field is "sticky". If you want to replace a hidden field with some other values after the script has been called once you'll have to do it manually: param('hidden_name','new','values','here'); =head2 Creating a clickable image button print image_button(-name=>'button_name', -src=>'/source/URL', -align=>'MIDDLE'); -or- print image_button('button_name','/source/URL','MIDDLE'); image_button() produces a clickable image. When it's clicked on the position of the click is returned to your script as "button_name.x" and "button_name.y", where "button_name" is the name you've assigned to it. B<Parameters:> =over 4 =item 1. The first argument (-name) is required and specifies the name of this field. =item 2. The second argument (-src) is also required and specifies the URL =item 3. The third option (-align, optional) is an alignment type, and may be TOP, BOTTOM or MIDDLE =back Fetch the value of the button this way: $x = param('button_name.x'); $y = param('button_name.y'); =head2 Creating a javascript action button print button(-name=>'button_name', -value=>'user visible label', -onClick=>"do_something()"); -or- print button('button_name',"user visible value","do_something()"); button() produces an C<< <input> >> tag with C<type="button">. When it's pressed the fragment of JavaScript code pointed to by the B<-onClick> parameter will be executed. =head1 WORKING WITH FRAMES It's possible for CGI.pm scripts to write into several browser panels and windows using the HTML 4 frame mechanism. There are three techniques for defining new frames programmatically: =over 4 =item 1. Create a <Frameset> document After writing out the HTTP header, instead of creating a standard HTML document using the start_html() call, create a <frameset> document that defines the frames on the page. Specify your script(s) (with appropriate parameters) as the SRC for each of the frames. There is no specific support for creating <frameset> sections in CGI.pm, but the HTML is very simple to write. =item 2. Specify the destination for the document in the HTTP header You may provide a B<-target> parameter to the header() method: print header(-target=>'ResultsWindow'); This will tell the browser to load the output of your script into the frame named "ResultsWindow". If a frame of that name doesn't already exist, the browser will pop up a new window and load your script's document into that. There are a number of magic names that you can use for targets. See the HTML C<< <frame> >> documentation for details. =item 3. Specify the destination for the document in the <form> tag You can specify the frame to load in the FORM tag itself. With CGI.pm it looks like this: print start_form(-target=>'ResultsWindow'); When your script is reinvoked by the form, its output will be loaded into the frame named "ResultsWindow". If one doesn't already exist a new window will be created. =back The script "frameset.cgi" in the examples directory shows one way to create pages in which the fill-out form and the response live in side-by-side frames. =head1 SUPPORT FOR JAVASCRIPT The usual way to use JavaScript is to define a set of functions in a <SCRIPT> block inside the HTML header and then to register event handlers in the various elements of the page. Events include such things as the mouse passing over a form element, a button being clicked, the contents of a text field changing, or a form being submitted. When an event occurs that involves an element that has registered an event handler, its associated JavaScript code gets called. The elements that can register event handlers include the <BODY> of an HTML document, hypertext links, all the various elements of a fill-out form, and the form itself. There are a large number of events, and each applies only to the elements for which it is relevant. Here is a partial list: =over 4 =item B<onLoad> The browser is loading the current document. Valid in: + The HTML <BODY> section only. =item B<onUnload> The browser is closing the current page or frame. Valid for: + The HTML <BODY> section only. =item B<onSubmit> The user has pressed the submit button of a form. This event happens just before the form is submitted, and your function can return a value of false in order to abort the submission. Valid for: + Forms only. =item B<onClick> The mouse has clicked on an item in a fill-out form. Valid for: + Buttons (including submit, reset, and image buttons) + Checkboxes + Radio buttons =item B<onChange> The user has changed the contents of a field. Valid for: + Text fields + Text areas + Password fields + File fields + Popup Menus + Scrolling lists =item B<onFocus> The user has selected a field to work with. Valid for: + Text fields + Text areas + Password fields + File fields + Popup Menus + Scrolling lists =item B<onBlur> The user has deselected a field (gone to work somewhere else). Valid for: + Text fields + Text areas + Password fields + File fields + Popup Menus + Scrolling lists =item B<onSelect> The user has changed the part of a text field that is selected. Valid for: + Text fields + Text areas + Password fields + File fields =item B<onMouseOver> The mouse has moved over an element. + Text fields + Text areas + Password fields + File fields + Popup Menus + Scrolling lists =item B<onMouseOut> The mouse has moved off an element. + Text fields + Text areas + Password fields + File fields + Popup Menus + Scrolling lists =back In order to register a JavaScript event handler with an HTML element, just use the event name as a parameter when you call the corresponding CGI method. For example, to have your validateAge() JavaScript code executed every time the textfield named "age" changes, generate the field like this: print textfield(-name=>'age',-onChange=>"validateAge(this)"); This example assumes that you've already declared the validateAge() function by incorporating it into a <SCRIPT> block. The CGI.pm start_html() method provides a convenient way to create this section. Similarly, you can create a form that checks itself over for consistency and alerts the user if some essential value is missing by creating it this way: print start_form(-onSubmit=>"validateMe(this)"); See the javascript.cgi script for a demonstration of how this all works. =head1 LIMITED SUPPORT FOR CASCADING STYLE SHEETS CGI.pm has limited support for HTML3's cascading style sheets (css). To incorporate a stylesheet into your document, pass the start_html() method a B<-style> parameter. The value of this parameter may be a scalar, in which case it is treated as the source URL for the stylesheet, or it may be a hash reference. In the latter case you should provide the hash with one or more of B<-src> or B<-code>. B<-src> points to a URL where an externally-defined stylesheet can be found. B<-code> points to a scalar value to be incorporated into a <style> section. Style definitions in B<-code> override similarly-named ones in B<-src>, hence the name "cascading." You may also specify the type of the stylesheet by adding the optional B<-type> parameter to the hash pointed to by B<-style>. If not specified, the style defaults to 'text/css'. To refer to a style within the body of your document, add the B<-class> parameter to any HTML element: print h1({-class=>'Fancy'},'Welcome to the Party'); Or define styles on the fly with the B<-style> parameter: print h1({-style=>'Color: red;'},'Welcome to Hell'); You may also use the new B<span()> element to apply a style to a section of text: print span({-style=>'Color: red;'}, h1('Welcome to Hell'), "Where did that handbasket get to?" ); Note that you must import the ":html3" definitions to have the B<span()> method available. Here's a quick and dirty example of using CSS's. See the CSS specification at http://www.w3.org/Style/CSS/ for more information. use CGI qw/:standard :html3/; #here's a stylesheet incorporated directly into the page $newStyle=<<END; <!-- P.Tip { margin-right: 50pt; margin-left: 50pt; color: red; } P.Alert { font-size: 30pt; font-family: sans-serif; color: red; } --> END print header(); print start_html( -title=>'CGI with Style', -style=>{-src=>'http://www.capricorn.com/style/st1.css', -code=>$newStyle} ); print h1('CGI with Style'), p({-class=>'Tip'}, "Better read the cascading style sheet spec before playing with this!"), span({-style=>'color: magenta'}, "Look Mom, no hands!", p(), "Whooo wee!" ); print end_html; Pass an array reference to B<-code> or B<-src> in order to incorporate multiple stylesheets into your document. Should you wish to incorporate a verbatim stylesheet that includes arbitrary formatting in the header, you may pass a -verbatim tag to the -style hash, as follows: print start_html (-style => {-verbatim => '@import url("/server-common/css/'.$cssFile.'");', -src => '/server-common/css/core.css'}); This will generate an HTML header that contains this: <link rel="stylesheet" type="text/css" href="/server-common/css/core.css"> <style type="text/css"> @import url("/server-common/css/main.css"); </style> Any additional arguments passed in the -style value will be incorporated into the <link> tag. For example: start_html(-style=>{-src=>['/styles/print.css','/styles/layout.css'], -media => 'all'}); This will give: <link rel="stylesheet" type="text/css" href="/styles/print.css" media="all"/> <link rel="stylesheet" type="text/css" href="/styles/layout.css" media="all"/> <p> To make more complicated <link> tags, use the Link() function and pass it to start_html() in the -head argument, as in: @h = (Link({-rel=>'stylesheet',-type=>'text/css',-src=>'/ss/ss.css',-media=>'all'}), Link({-rel=>'stylesheet',-type=>'text/css',-src=>'/ss/fred.css',-media=>'paper'})); print start_html({-head=>\@h}) To create primary and "alternate" stylesheet, use the B<-alternate> option: start_html(-style=>{-src=>[ {-src=>'/styles/print.css'}, {-src=>'/styles/alt.css',-alternate=>1} ] }); =head2 Dumping out all the name/value pairs The Dump() method produces a string consisting of all the query's name/value pairs formatted nicely as a nested list. This is useful for debugging purposes: print Dump Produces something that looks like: <ul> <li>name1 <ul> <li>value1 <li>value2 </ul> <li>name2 <ul> <li>value1 </ul> </ul> As a shortcut, you can interpolate the entire CGI object into a string and it will be replaced with the a nice HTML dump shown above: $q=CGI->new; print "<h2>Current Values</h2> $q\n"; =head1 BUGS Address bug reports and comments to: L<https://github.com/leejo/CGI.pm/issues> See the L<https://github.com/leejo/CGI.pm/blob/master/CONTRIBUTING.md> file for information on raising issues and contributing The original bug tracker can be found at: L<https://rt.cpan.org/Public/Dist/Display.html?Queue=CGI.pm> =head1 SEE ALSO L<CGI> - The original source of this documentation / functionality =cut Functions.pm 0000644 00000000175 00000000000 0007016 0 ustar 00 package CGI::HTML::Functions; use strict; use warnings; # nothing here yet... may move functions here in the long term 1;
| ver. 1.4 |
Github
|
.
| PHP 8.3.30 | Generation time: 0.09 |
proxy
|
phpinfo
|
Settings