stas 02/03/27 18:19:48 Modified: lib/DocSet/Source HTML.pm Log: sync with DocSet 0.12, the latest change: - add code that parses source HTML and escapes unsafe URI/HTML chars in links. Revision Changes Path 1.4 +106 -19 modperl-docs/lib/DocSet/Source/HTML.pm Index: HTML.pm =================================================================== RCS file: /home/cvs/modperl-docs/lib/DocSet/Source/HTML.pm,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- HTML.pm 30 Jan 2002 16:55:04 -0000 1.3 +++ HTML.pm 28 Mar 2002 02:19:48 -0000 1.4 @@ -9,6 +9,8 @@ require DocSet::Doc; @ISA = qw(DocSet::Doc); +use constant ENCODE_CHARS => '<>&" '; + sub retrieve_meta_data { my($self) = @_; @@ -31,6 +33,26 @@ # there is no autogenerated TOC for HTML files } +my %linkElements = ( # from HTML::Element.pm + body => 'background', + base => 'href', + a => 'href', + img => [qw(src lowsrc usemap)], # lowsrc is a Netscape invention + form => 'action', + input => 'src', + 'link' => 'href', # need quoting since link is a perl builtin + frame => 'src', + applet => 'codebase', + area => 'href', +); +my %tag_attr; +for my $tag (keys %linkElements) { + my $tagval = $linkElements{$tag}; + for my $attr (ref $tagval ? @$tagval : $tagval) { + $tag_attr{"$tag $attr"}++; + } +} + # currently retrieves these parts from the source HTML # head.title # head.meta.description @@ -41,30 +63,95 @@ # already parsed return if exists $self->{parsed_tree} && $self->{parsed_tree}; - # this one retrievs the body and the title of the given html require HTML::Parser; - sub start_h { - my($self, $tagname, $attr) = @_; - if ($tagname eq 'meta' && lc $attr->{name} eq 'description') { - $self->{parsed_tree}->{abstract} = $attr->{content}; + require HTML::Entities; + + my $new_content; + + # this parsing is for fixing up unsafe chars in URLs + { + # accum_h(self, $text) + sub accum_h { + my $self = shift; + #print "[ @_ ]"; + $self->{content} .= join '', @_; + } + + # encode unsafe chars in the URL attributes + sub start_h { + my($self, $tagname, $attr, $text) = @_; + + # store away the HTML as is + unless ($linkElements{$tagname}) { + accum_h($self, $text); + return; + } + + # escape thpse that include link elements + accum_h($self, qq{<$tagname}); + for (keys %$attr) { + accum_h($self, qq{ $_="}); + my $val = $attr->{$_}; + if ($tag_attr{"$tagname $_"}) { + $val = HTML::Entities::encode($val, ENCODE_CHARS); + } + accum_h($self, $val); + } + accum_h($self, qq{">}); + } + + sub end_h { + my($self, $tagname) = @_; + accum_h($self, "</$tagname>"); } + + sub text_h { + my($self, $text) = @_; + accum_h($self, $text); + } + + my $p = HTML::Parser->new(api_version => 3, + start_h => [\&start_h, "self, tagname, attr, text"], + end_h => [\&end_h, "self, tagname"], + text_h => [\&text_h, "self, text"], + ); + # Parse document text chunk by chunk + $p->parse(${ $self->{content} }); + $p->eof; + $new_content = $p->{content}; + $self->{content} = \$new_content; + #print $new_content, "\n\n\n"; } - sub end_h { - my($self, $tagname, $skipped_text) = @_; - # use $p itself as a tmp storage (ok according to the docs) - $self->{parsed_tree}->{$tagname} = $skipped_text; + + { + # this one retrieves and stashes away the description (As 'abstract') + # and the body and the title of the given html + my $start_h = sub { + my($self, $tagname, $attr) = @_; + if ($tagname eq 'meta' && lc $attr->{name} eq 'description') { + $self->{parsed_tree}->{abstract} = $attr->{content}; + } + }; + + my $end_h = sub { + my($self, $tagname, $skipped_text) = @_; + # use $p itself as a tmp storage (ok according to the docs) + $self->{parsed_tree}->{$tagname} = $skipped_text; + }; + + my $p = HTML::Parser->new(api_version => 3, + report_tags => [qw(title meta body)], + start_h => [$start_h, "self, tagname, attr"], + end_h => [$end_h, "self, tagname, skipped_text"], + ); + # Parse document text chunk by chunk + $p->parse(${ $self->{content} }); + $p->eof; + + # store the tree away + $self->{parsed_tree} = $p->{parsed_tree}; } - my $p = HTML::Parser->new(api_version => 3, - report_tags => [qw(title body meta)], - start_h => [\&start_h, "self,tagname,attr"], - end_h => [\&end_h, "self,tagname,skipped_text"], - ); - # Parse document text chunk by chunk - $p->parse(${ $self->{content} }); - $p->eof; - # store the tree away - $self->{parsed_tree} = $p->{parsed_tree}; }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]