Source HTML.pm

stas 28 Mar 2002 02:19:49 -0000

stas        02/03/27 18:19:48

  Modified:    lib/DocSet/Source HTML.pm
  Log:
  sync with DocSet 0.12, the latest change:
  - add code that parses source HTML and escapes unsafe URI/HTML chars
    in links.
  
  Revision  Changes    Path
  1.4       +106 -19   modperl-docs/lib/DocSet/Source/HTML.pm
  
  Index: HTML.pm
  ===================================================================
  RCS file: /home/cvs/modperl-docs/lib/DocSet/Source/HTML.pm,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- HTML.pm   30 Jan 2002 16:55:04 -0000      1.3
  +++ HTML.pm   28 Mar 2002 02:19:48 -0000      1.4
  @@ -9,6 +9,8 @@
   require DocSet::Doc;
   @ISA = qw(DocSet::Doc);
   
  +use constant ENCODE_CHARS => '<>&" ';
  +
   sub retrieve_meta_data {
       my($self) = @_;
   
  @@ -31,6 +33,26 @@
       # there is no autogenerated TOC for HTML files
   }
   
  +my %linkElements =  (          # from HTML::Element.pm
  +    body   => 'background',
  +    base   => 'href',
  +    a      => 'href',
  +    img    => [qw(src lowsrc usemap)], # lowsrc is a Netscape invention
  +    form   => 'action',
  +    input  => 'src',
  +    'link'  => 'href',         # need quoting since link is a perl builtin
  +    frame  => 'src',
  +    applet => 'codebase',
  +    area   => 'href',
  +);
  +my %tag_attr;
  +for my $tag (keys %linkElements) {
  +    my $tagval = $linkElements{$tag};
  +    for my $attr (ref $tagval ? @$tagval : $tagval) {
  +        $tag_attr{"$tag $attr"}++;
  +    }
  +}
  +
   # currently retrieves these parts from the source HTML
   # head.title
   # head.meta.description
  @@ -41,30 +63,95 @@
       # already parsed
       return if exists $self->{parsed_tree} && $self->{parsed_tree};
   
  -    # this one retrievs the body and the title of the given html
       require HTML::Parser;
  -    sub start_h {
  -        my($self, $tagname, $attr) = @_;
  -        if ($tagname eq 'meta' && lc $attr->{name} eq 'description') {
  -            $self->{parsed_tree}->{abstract} = $attr->{content};
  +    require HTML::Entities;
  +
  +    my $new_content;
  +
  +    # this parsing is for fixing up unsafe chars in URLs
  +    {
  +        # accum_h(self, $text)
  +        sub accum_h { 
  +            my $self = shift;
  +            #print "[ @_ ]";
  +            $self->{content} .= join '', @_;
  +        }
  +
  +        # encode unsafe chars in the URL attributes
  +        sub start_h {
  +            my($self, $tagname, $attr, $text) = @_;
  +
  +            # store away the HTML as is
  +            unless ($linkElements{$tagname}) {
  +                accum_h($self, $text);
  +                return;
  +            }
  +
  +            # escape thpse that include link elements
  +            accum_h($self, qq{<$tagname});
  +            for (keys %$attr) {
  +                accum_h($self, qq{ $_="});
  +                my $val = $attr->{$_};
  +                if ($tag_attr{"$tagname $_"}) {
  +                    $val = HTML::Entities::encode($val, ENCODE_CHARS);
  +                }
  +                accum_h($self, $val);
  +            }
  +            accum_h($self, qq{">});
  +        }
  +
  +        sub end_h {
  +            my($self, $tagname) = @_;
  +            accum_h($self, "</$tagname>");
           }
  +
  +        sub text_h {
  +            my($self, $text) = @_;
  +            accum_h($self, $text);
  +        }
  +
  +        my $p = HTML::Parser->new(api_version => 3,
  +                                  start_h     => [\&start_h, "self, tagname, 
attr, text"],
  +                                  end_h   => [\&end_h,  "self, tagname"],
  +                                  text_h  => [\&text_h, "self, text"],
  +                                 );
  +        # Parse document text chunk by chunk
  +        $p->parse(${ $self->{content} });
  +        $p->eof;
  +        $new_content = $p->{content};
  +        $self->{content} = \$new_content;
  +        #print $new_content, "\n\n\n";
       }
  -    sub end_h {
  -        my($self, $tagname, $skipped_text) = @_;
  -        # use $p itself as a tmp storage (ok according to the docs)
  -        $self->{parsed_tree}->{$tagname} = $skipped_text;
  +
  +    {
  +        # this one retrieves and stashes away the description (As 
'abstract') 
  +        # and the body and the title of the given html
  +        my $start_h = sub {
  +            my($self, $tagname, $attr) = @_;
  +            if ($tagname eq 'meta' && lc $attr->{name} eq 'description') {
  +                $self->{parsed_tree}->{abstract} = $attr->{content};
  +            }
  +        };
  +    
  +        my $end_h = sub {
  +            my($self, $tagname, $skipped_text) = @_;
  +            # use $p itself as a tmp storage (ok according to the docs)
  +                $self->{parsed_tree}->{$tagname} = $skipped_text;
  +        };
  +
  +        my $p = HTML::Parser->new(api_version => 3,
  +                                  report_tags => [qw(title meta body)],
  +                                  start_h => [$start_h, "self, tagname, 
attr"],
  +                                  end_h   => [$end_h,   "self, tagname, 
skipped_text"],
  +                                 );
  +        # Parse document text chunk by chunk
  +        $p->parse(${ $self->{content} });
  +        $p->eof;
  +
  +        # store the tree away
  +        $self->{parsed_tree} = $p->{parsed_tree};
       }
  -    my $p = HTML::Parser->new(api_version => 3,
  -                              report_tags => [qw(title body meta)],
  -                              start_h => [\&start_h, "self,tagname,attr"],
  -                              end_h   => [\&end_h,   
"self,tagname,skipped_text"],
  -                             );
  -    # Parse document text chunk by chunk
  -    $p->parse(${ $self->{content} });
  -    $p->eof;
   
  -    # store the tree away
  -    $self->{parsed_tree} = $p->{parsed_tree};
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: modperl-docs/lib/DocSet/Source HTML.pm

Reply via email to