Hi there,

The following produces a segfault using the latest version of libwww.

It seems that HTML::Parser is marking non-UTF-8 strings as UTF-8 strings.



use HTML::TokeParser;
use LWP::Simple;
use URI::URL;

# Reproduction script: fetch a page, walk its tokens with HTML::TokeParser,
# print the href of every <a> start tag and resolve it to an absolute URL.

# URL of the page being fetched; also used as the base for resolving
# relative links found in the page.
my $docurl = "http://www.aries.lu/site.php?section=movies";

# LWP::Simple::get() returns undef when the fetch fails, so stop early
# rather than handing undef to the parser.
my $data = get($docurl);
die "Could not fetch $docurl\n" unless defined $data;

my $tp = HTML::TokeParser->new(\$data);

while (my $token = $tp->get_token) {
    # The first element of a token is its type; "S" marks a start tag.
    my $ttype = shift @{ $token };
    next unless $ttype eq "S";

    # After the shift, the token holds the tag name and attribute hash
    # (followed by fields this script does not need).
    my ($tag, $attr) = @{ $token };
    $tag = lc($tag);
    next unless $tag eq "a";

    # An <a> without an href attribute yields undef; treat it as empty.
    my $a_href = $attr->{'href'};
    $a_href = "" unless defined $a_href;

    # Advance the parser past the link text up to the closing tag; the
    # text itself is not used.
    $tp->get_trimmed_text("/$tag");

    print "$a_href\n";

    # Resolve the link against the page URL.
    $a_href = url($a_href, $docurl)->abs if $a_href ne "";
}



Or, to see the problem directly:


#!/usr/bin/perl
use warnings;
use strict;
use Devel::Peek;
use HTML::Parser;
# Minimal reproduction: parse a single <img> tag whose title attribute
# mixes a character entity (&rsquo;) with a raw high byte (\260), and dump
# the internal representation of the decoded attribute value so that its
# flags can be inspected.
my $html = qq{<img title="&rsquo;\260">};

my $p = HTML::Parser->new(
    api_version => 3,
    # The "attr" argspec passes the attribute hash as the handler's first
    # argument; Devel::Peek's Dump prints the title value's internals.
    start_h     => [ sub { Dump(shift->{title}) }, "attr" ],
);

$p->parse($html);



Thibaut

Reply via email to