Source HTML.pm

stas 15 Jun 2002 18:20:34 -0000

stas        2002/06/15 11:20:33

  Modified:    lib/DocSet RunTime.pm
               lib/DocSet/Doc HTML2HTML.pm POD2HTML.pm
               lib/DocSet/Source HTML.pm
  Log:
  sync with DocSet
  - improve the parsing of the E<lt>headE<gt> and make base, meta and
    link elements available to the templates. [Per Einar Ellefsen
    <[EMAIL PROTECTED]>]
  
  - correct the mapping of config.cfg to autogenerated index.html, also
    use path2uri to convert from a path to uri. also fix the stripping
    of the full base path on the non unix system, by using abs2rel from
    File::Spec.
  
  Revision  Changes    Path
  1.8       +13 -23    modperl-docs/lib/DocSet/RunTime.pm
  
  Index: RunTime.pm
  ===================================================================
  RCS file: /home/cvs/modperl-docs/lib/DocSet/RunTime.pm,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- RunTime.pm        11 Jun 2002 15:02:10 -0000      1.7
  +++ RunTime.pm        15 Jun 2002 18:20:33 -0000      1.8
  @@ -6,7 +6,7 @@
   use strict;
   use warnings;
   
  -use File::Spec::Functions qw(catdir catfile splitdir);
  +use File::Spec::Functions qw(catdir catfile splitdir abs2rel);
   use File::Find;
   
   use DocSet::Util;
  @@ -84,7 +84,8 @@
   
       @search_paths = @{$ra_search_paths || []};
   
  -    %exts = map {$_ => 1} @{$ra_search_exts || []};
  +    # .cfg is for matching config.cfg to become index.html
  +    %exts = map {$_ => 1} @{$ra_search_exts || []}, 'cfg';
   
       my @ext_accept_pattern = map {quotemeta($_)."\$"} keys %exts;
       my $rsub_keep_ext = 
  @@ -92,34 +93,23 @@
   
       my %seen;
       for my $rel_path (@search_paths) {
  -        my $full_path = catdir $base, $rel_path;
  -        die "$full_path is not a dir" unless -d $full_path;
  +        my $full_base_path = catdir $base, $rel_path;
  +        die "$full_base_path is not a dir" unless -d $full_base_path;
   
           my @seen_pattern = map {"^".quotemeta($_)} keys %seen;
  -        my $rsub_skip_seen = 
  -            build_matchmany_sub(\@seen_pattern);
  +        my $rsub_skip_seen = build_matchmany_sub(\@seen_pattern);
   
  -        # rewrite non / paths to be / as in URI ($rel_path is no more
  -        # needed to read from the real fs, will need this fixup for
  -        # generating proper URIs.
  -        $rel_path = join "/", splitdir $rel_path;
  -
  -        my $full_path_regex = quotemeta $full_path;
  -        $src_docs{$rel_path} = {
  -            "index.html" => 1,              # base index.html
  -            map { m{(.*?/?)[^/]+$}          # add autogenerated index.html
  -                  ? ("$1index.html" => 1, $_ => 1)
  -                  : ($_ => 1);              # shouldn't happen, but just in case
  -            }
  -            map {join "/", splitdir $_}     # rewrite non / paths to be URI's /
  -            map {s|$full_path_regex/||; $_} # strip the leading path
  +        my $rel_uri = path2uri($rel_path);
  +        $src_docs{$rel_uri} = {
  +            map { s/config\.cfg$/index.html/; ($_ => 1) } # autogenerated index.html
  +            map path2uri( abs2rel($_, $full_base_path) ), # full path=>relative uri
               grep $rsub_keep_ext->($_),      # get files with wanted exts
               grep !$rsub_skip_seen->($_),    # skip seen base dirs
  -            @{ expand_dir($full_path) }
  +            @{ expand_dir($full_base_path) }
           };
   
  -        note "Scanning for src files: $full_path";
  -        $seen{$full_path}++;
  +        note "Scanning for src files: $full_base_path";
  +        $seen{$full_base_path}++;
       }
   
   #   dumper \%src_docs;
  
  
  
  1.3       +1 -0      modperl-docs/lib/DocSet/Doc/HTML2HTML.pm
  
  Index: HTML2HTML.pm
  ===================================================================
  RCS file: /home/cvs/modperl-docs/lib/DocSet/Doc/HTML2HTML.pm,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- HTML2HTML.pm      5 Feb 2002 10:27:19 -0000       1.2
  +++ HTML2HTML.pm      15 Jun 2002 18:20:33 -0000      1.3
  @@ -20,6 +20,7 @@
       my $vars = {
                   meta => $self->{meta},
                   body => [EMAIL PROTECTED],
  +                headers => $self->{parsed_tree}{head},
                   dir  => $self->{dir},
                   nav  => $self->{nav},
                   last_modified => $self->{timestamp},
  
  
  
  1.7       +1 -0      modperl-docs/lib/DocSet/Doc/POD2HTML.pm
  
  Index: POD2HTML.pm
  ===================================================================
  RCS file: /home/cvs/modperl-docs/lib/DocSet/Doc/POD2HTML.pm,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- POD2HTML.pm       13 Jun 2002 09:20:16 -0000      1.6
  +++ POD2HTML.pm       15 Jun 2002 18:20:33 -0000      1.7
  @@ -48,6 +48,7 @@
                   meta => $self->{meta},
                   toc  => $self->{toc},
                   body => [EMAIL PROTECTED],
  +                headers => {},
                   dir  => $self->{dir},
                   nav  => $self->{nav},
                   last_modified => $self->{timestamp},
  
  
  
  1.7       +62 -20    modperl-docs/lib/DocSet/Source/HTML.pm
  
  Index: HTML.pm
  ===================================================================
  RCS file: /home/cvs/modperl-docs/lib/DocSet/Source/HTML.pm,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- HTML.pm   14 Apr 2002 06:17:48 -0000      1.6
  +++ HTML.pm   15 Jun 2002 18:20:33 -0000      1.7
  @@ -53,10 +53,6 @@
       }
   }
   
  -# currently retrieves these parts from the source HTML
  -# head.title
  -# head.meta.description
  -# body
   sub parse {
       my($self) = @_;
       
  @@ -110,11 +106,12 @@
               accum_h($self, $text);
           }
   
  -        my $p = HTML::Parser->new(api_version => 3,
  -                                  start_h     => [\&start_h, "self, tagname, attr, text"],
  -                                  end_h   => [\&end_h,  "self, tagname"],
  -                                  text_h  => [\&text_h, "self, text"],
  -                                 );
  +        my $p = HTML::Parser->new(
  +            api_version => 3,
  +            start_h     => [\&start_h, "self, tagname, attr, text"],
  +            end_h       => [\&end_h,  "self, tagname"],
  +            text_h      => [\&text_h, "self, text"],
  +        );
           # Parse document text chunk by chunk
           $p->parse(${ $self->{content} });
           $p->eof;
  @@ -124,26 +121,70 @@
       }
   
       {
  -        # this one retrieves and stashes away the description (As 'abstract') 
  -        # and the body and the title of the given html
  +        # this parsing extracts the following elements and makes them
  +        # available to templates as:
  +        # meta.title
  +        # head.meta.* (+ renames: description -> abstract)
  +        # head.base
  +        # head.link
  +        # body
  +
  +        # init
           my $start_h = sub {
  -            my($self, $tagname, $attr) = @_;
  -            if ($tagname eq 'meta' && lc $attr->{name} eq 'description') {
  -                $self->{parsed_tree}->{abstract} = $attr->{content};
  +            my($self, $tagname, $attr, $text) = @_;
  +            my $meta = $self->{parsed_tree}{head}{meta};
  +
  +            # special treatment
  +            if ($tagname eq 'meta' && exists $attr->{name} && 
  +                lc $attr->{name} eq 'description') {
  +                $self->{parsed_tree}{abstract} = $attr->{content};
  +            }
  +            elsif ($tagname eq 'meta' && exists $attr->{content}) {
  +                # note: doesn't take into account the 'scheme' attr,
  +                # but that one isn't used much
  +                if (exists $attr->{name}) {
  +                    $meta->{name}{ $attr->{name} } = $attr->{content};
  +                }
  +                elsif (exists $attr->{'http-equiv'}) {
  +                    $meta->{'http-equiv'}{ $attr->{'http-equiv'} }
  +                        = $attr->{content};
  +                }
  +                else {
  +                    # unsupported head element?
  +                }
  +            }
  +            elsif ($tagname eq 'base') {
  +                # there is usually only one <base>
  +                $self->{parsed_tree}{head}{base} = $attr->{href}
  +                    if exists $attr->{href};
               }
  +            elsif ($tagname eq 'link') {
  +                # link elements won't overlap, because each is
  +                # additive -> easier to store text
  +                $self->{parsed_tree}{head}{link} .= $text if length $text;
  +            }
  +            # note: if adding other elements that also appear outside <head>,
  +            # you will need to check that you are inside <head>  by setting
  +            # a flag when entering it and unsetting it when exiting
           };
       
           my $end_h = sub {
               my($self, $tagname, $skipped_text) = @_;
               # use $p itself as a tmp storage (ok according to the docs)
  +            # <title> and <body> get special treatment
  +            if ($tagname eq 'title' or $tagname eq 'body') { 
                   $self->{parsed_tree}->{$tagname} = $skipped_text;
  +            }
           };
   
  -        my $p = HTML::Parser->new(api_version => 3,
  -                                  report_tags => [qw(title meta body)],
  -                                  start_h => [$start_h, "self, tagname, attr"],
  -                                  end_h   => [$end_h,   "self, tagname, skipped_text"],
  -                                 );
  +        my $p = HTML::Parser->new(
  +            api_version => 3,
  +            report_tags => [qw(title meta body base link)],
  +            start_h     => [$start_h, "self, tagname, attr, text"],
  +            end_h       => [$end_h, "self, tagname, skipped_text"],
  +        );
  +        # init
  +        $p->{parsed_tree}{head}{meta} = {};
           # Parse document text chunk by chunk
           $p->parse(${ $self->{content} });
           $p->eof;
  @@ -180,7 +221,8 @@
   
   Retrieve and set the meta data that describes the input document into
   the I<meta> object attribute. The I<title> and I<link> meta attributes
  -are getting set.
  +are getting set. the rest of the E<lt>headE<gt> is made available for
  +the templates too.
   
   =back


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: modperl-docs/lib/DocSet/Source HTML.pm

Reply via email to