Here is what I ended up with - this is a chunk from a much bigger
script. Suggestions gladly accepted. Haven't fixed the special
characters yet.
# Walk the token stream of $html and pull out the header date and every
# article that is delimited by marker comments in the page, e.g.
#   <!-- begin header date --> ... <!-- end header date -->
#   <!-- begin articleN title ... <!-- end article...
#   <!-- begin articleN body  ... <!-- end article...
#
# Results left in scope for the code that follows:
#   $date     - raw markup found between the header-date markers
#               (stays undef if no date section produced any text)
#   @articles - one hash ref per article: { title => ..., body => ... }
my ( $date, $p, @articles ) = ();
if ( ! defined( $p = HTML::TokeParser->new( $html )))
{
    # NOTE(review): $p is dereferenced below, so this presumably relies on
    # localError() not returning - confirm against its definition.
    localError( "Unable to parse $html : $!" );
}

# Append the original source text of every token up to (not including) the
# first comment matching $end_marker onto the scalar referenced by $buffer.
#
#   $parser          - parser object to read tokens from
#   $buffer          - reference to the scalar being accumulated; writing
#                      through a reference leaves it undef when nothing
#                      was collected
#   $end_marker      - compiled regex identifying the closing comment
#   $section         - label used in the error message (Date/Title/Body)
#   $strict_comments - true: a comment that is not the end marker is
#                      reported via localError(); false: it is skipped
#
# Reading stops silently at end of input if the end marker never shows up.
my $collect_section = sub
{
    my ( $parser, $buffer, $end_marker, $section, $strict_comments ) = @_;

    while ( my $piece = $parser->get_token())
    {
        my $type = $piece->[0];

        # Each token type keeps its original text in a different slot.
        if ( $type eq 'T' )
        {
            ${$buffer} .= $piece->[1];
        }
        elsif ( $type eq 'S' )
        {
            ${$buffer} .= $piece->[4];
        }
        elsif ( $type eq 'E' )
        {
            ${$buffer} .= $piece->[2];
        }
        elsif ( $type eq 'C' && $piece->[1] =~ $end_marker )
        {
            last;
        }
        elsif ( $type ne 'C' || $strict_comments )
        {
            localError( "$type : unrecognized HTML Token Type in $section : <PRE>" . Dumper( $piece ) . "</PRE>" );
        }
    }

    return;
};

my ( $title, $body ) = ();
while ( my $token = $p->get_token())
{
    # Only comments can open a section; everything else outside a section
    # is ignored.
    if ( $token->[0] eq 'C' )
    {
        my $comment = $token->[1];

        if ( $comment =~ m#<!-- begin header date --># )
        {
            # The date section reports stray comments (strict mode).
            $collect_section->( $p, \$date, qr#<!-- end header date -->#, 'Date', 1 );
        }
        elsif ( $comment =~ m#<!-- begin article\d* title# )
        {
            $collect_section->( $p, \$title, qr#<!-- end article#, 'Title', 0 );
        }
        elsif ( $comment =~ m#<!-- begin article\d* body# )
        {
            $collect_section->( $p, \$body, qr#<!-- end article#, 'Body', 0 );
        }
    }

    # An article is complete once both parts are non-empty; store it and
    # reset the accumulators for the next title/body pair.
    if ( defined( $title ) && defined( $body ) && $title ne "" && $body ne "" )
    {
        $title =~ s#\n##g;
        push( @articles, { 'title' => $title, 'body' => $body } );
        ( $body, $title ) = ();
    }
}
--
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]