Dan Muey wrote:

> 
> Very nice, although I'd like to keep html tags that are between the body
> tags as well except script & comment.
> 
> Also @body contains the attributes of the body tag as well as all of the
> text in the body :
> 
> my $new_title = join '', @title;
> my $new_body_atts = join(//,@body);
> 
> print "TITLE -$new_title- \n BODY ATTRIBUTES -$new_body_atts- \n";
> 
> Any ideas?
> 

so you want to:
1. get the title
2. get the body text, but without comments and scripts
3. include all other tags, except comments and scripts
4. keep the body tag's attributes out of the body text

#!/usr/bin/perl -w
use strict;

use HTML::Parser;

# Sample document used to exercise the parser.  It contains a title, stray
# text inside <head>, an attribute on <body>, an HTML comment, a <script>
# element and one ordinary tag (<font>) inside the body.
my $text = <<HTML;
<html><head>
<title> HI Title </title>
heaD STUFF
</head>
<body bodytag=attributes>
hI HERE'S CONTENT i WANT
<!-- i WANT TO STRIP COMMENTS OUT -->
<SCRIPT>

i DON'T WANT THIS SCRIPT EITHER
</SCRIPT>
<font>Hello world</font>

</BODY>
</HTMl>
HTML

# State shared between the handler subs (text, open_tag, close_tag).
my $body = 0;       # true while the parser is between <body> and </body>
my $title = 0;      # true while the parser is between <title> and </title>
my @body;           # text chunks collected inside the body
my @title;          # text chunks collected inside the title
my @tags;           # tags seen inside the body, stored as "<name>" / "</name>"
my %body_attr;      # attributes of the <body> tag, values wrapped in quotes

# Event-driven parser: decoded text, start tags (lower-cased name plus an
# attribute hash) and end tags are dispatched to the subs defined below.
my $html = HTML::Parser->new(api_version => 3,
                                text_h  => [\&text,'dtext'],
                                start_h => [\&open_tag,'tagname,attr'],
                                end_h   => [\&close_tag,'tagname']);

# Drop <script> elements together with everything inside them.  HTML comments
# need no entry here: ignore_elements() takes element names, and comments are
# never reported in the first place because no comment handler is registered.
$html->ignore_elements(qw(script));
$html->parse($text);
$html->eof;

print "title is: @title\n";
print "body text: @body\n";
print "body attr.:\n";
# Walk the keys in sorted order so the listing is stable from run to run
# (hash iteration order is not).
for my $k (sort keys %body_attr){
        print "$k=$body_attr{$k}\n";
}
print "Other tag inside body: @tags\n";

#-- DONE --#

# Text handler: receives one chunk of entity-decoded text from the parser and
# files it under the title or the body, depending on which flag is set.
# Chunks without a single word character (pure whitespace, punctuation) are
# discarded, as is any text seen outside both <title> and <body>.
sub text{
        my ($chunk) = @_;

        return if($chunk !~ /\w/);

        # The title flag is checked first, so it wins if both flags are set.
        my $sink = $title ? \@title
                 : $body  ? \@body
                 :          undef;

        push(@{$sink},$chunk) if(defined $sink);
}

# Start-tag handler: called with the tag name and a hash reference of its
# attributes.
#   - <title> raises the title flag.
#   - <body> raises the body flag and copies its attributes into %body_attr,
#     each value wrapped in single quotes; the tag itself is not recorded.
#   - any other tag seen while the body flag is up is recorded in @tags as
#     "<name>" (attributes are not kept).
sub open_tag{
        my ($tagname,$attr) = @_;

        $title = 1 if($tagname eq 'title');

        if($tagname ne 'body'){
                push(@tags,"<$tagname>") if($body);
                return;
        }

        $body = 1;
        for my $name (keys %{$attr}){
                $body_attr{$name} = "'$attr->{$name}'";
        }
}

# End-tag handler: called with the tag name.  Lowers the title or body flag
# when the matching end tag arrives, then records the tag in @tags as
# "</name>" if the body flag is still up.  Because the flag is lowered first,
# </body> itself is never recorded.
sub close_tag{
        my ($tagname) = @_;

        if($tagname eq 'title'){
                $title = 0;
        }elsif($tagname eq 'body'){
                $body = 0;
        }

        push(@tags,"</$tagname>") if($body);
}

__END__

prints:

title is:  HI Title
body text:
hI HERE'S CONTENT i WANT
 Hello world
body attr.:
bodytag='attributes'
Other tag inside body: <font> </font>

imagine having to do the same with regular expressions.

david

-- 
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to