Dan Muey wrote:
>
> print "body text: @body\n"; # this needs to keep the tags were they are**
>
That's fairly easy to do:
#!/usr/bin/perl -w
# Extract the <title> text, the <body> tag's attributes, and the body content
# (text plus its inline tags, minus <script> elements and comments) from an
# HTML string, then print all three.
use strict;
use HTML::Parser;    # was misspelled "HTMP::Parser", which cannot be loaded

# Sample document.  The heredoc is single-quoted so that a stray '$' or '@'
# in the markup can never be interpolated by Perl.
my $text = <<'HTML';
<html><head>
<title> HI Title </title>
heaD STUFF
</head>
<body bodytag=attributes>
<i> keep the I tag </i>
hI HERE'S CONTENT i WANT
<img src=""> IMaGE
<!-- i WANT TO STRIP COMMENTS OUT -->
<SCRIPT>
i DON'T WANT THIS SCRIPT EITHER
</SCRIPT>
<font>Hello world</font>
</BODY>
</HTMl>
HTML

# Parser state shared with the handler subs (text, open_tag, close_tag).
my $body  = 0;      # true while the parser is inside <body>
my $title = 0;      # true while the parser is inside <title>
my @body;           # collected body text chunks and re-serialized tags
my @title;          # collected title text chunks
my %body_attr;      # attributes of the <body> tag, values already quoted

my $html = HTML::Parser->new(
    api_version => 3,
    text_h      => [ \&text,      'dtext' ],
    start_h     => [ \&open_tag,  'tagname,attr' ],
    end_h       => [ \&close_tag, 'tagname' ],
);

# Skip <script> elements together with everything inside them.  Comments need
# no entry here: they are not elements, and since no comment handler is
# registered the parser never reports them to us in the first place.
$html->ignore_elements(qw(script));
$html->parse($text);
$html->eof;

print "title is:\n@title\n\n";
print "body text:\n@body\n\n";
print "body attr:\n";

# Sorted so the listing is identical from run to run (hash order is not).
for my $k (sort keys %body_attr) {
    print "$k=$body_attr{$k}\n";
}
# Text handler: receives one chunk of text from the parser and files it under
# the title or the body, depending on which of the file-level flags is set.
# Chunks without a single word character (e.g. pure whitespace between tags)
# are discarded, as is any text outside both <title> and <body>.
sub text {
    my ($chunk) = @_;

    return if $chunk !~ /\w/;

    if ($title) {
        push @title, $chunk;
    }
    elsif ($body) {
        push @body, $chunk;
    }
}
# Start-tag handler.
#
#   $tagname - name of the tag that was opened
#   $attr    - hash reference mapping attribute names to values
#
# Raises the $title flag on <title>.  On <body> it raises the $body flag and
# records the tag's attributes in %body_attr (values wrapped in single
# quotes).  Any other tag seen while inside the body is re-serialized as
# <name key='value' ...> and appended to @body.
sub open_tag {
    my ($tagname, $attr) = @_;
    $attr ||= {};    # tolerate a missing attribute hash

    $title = 1 if $tagname eq 'title';

    if ($tagname eq 'body') {
        $body = 1;
        for my $key (keys %{$attr}) {
            $body_attr{$key} = "'$attr->{$key}'";
        }
    }
    elsif ($body) {
        # Sort the attribute names so the rebuilt tag is the same on every
        # run; plain hash iteration order differs between runs.
        # NOTE: values are interpolated verbatim, so a value that itself
        # contains a single quote would yield malformed markup.
        my $attr_text = join ' ',
            map { "$_='$attr->{$_}'" } sort keys %{$attr};

        push @body, "<$tagname" . (length $attr_text ? " $attr_text>" : '>');
    }
}
# End-tag handler: lowers the $title / $body flag when the matching element
# closes, and echoes every other closing tag into @body while the parser is
# still inside the body.  The </body> tag itself is never echoed because the
# flag is cleared before the push is considered.
sub close_tag {
    my ($name) = @_;

    if ($name eq 'title') {
        $title = 0;
    }
    if ($name eq 'body') {
        $body = 0;
    }

    push @body, '</' . $name . '>' if $body;
}
__END__
prints:
title is:
HI Title
body text:
<i> keep the I tag </i>
hI HERE'S CONTENT i WANT
<img src=''> IMaGE
<font> Hello world </font>
body attr:
bodytag='attributes'
david
--
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]