CC: Sean M. Burke, maintainer of HTML::TreeBuilder
"Josts Smokehouse" <[EMAIL PROTECTED]> wrote:
> FAIL webrobot-0.60 sun4-solaris-thread-multi 2.9
Confirmed on SunOS 5.7 / Perl 5.8.5
(luckily found a dusty, tucked away, old sun)
The fail reason is that HTML::TreeBuilder->as_XML behaves different on
linux vs. SunOs. I have attached a test script for HTML::TreeBuilder and
two necessary data files. The script will run standalone.
HTML::TreeBuilder->as_XML will return (see output section below):
source char linux, win2000 SunOs 5.7
-----------------------------------------------------
umlaut-a ä ä
an asian char ç›® 目
--- and note ----------------------------------------
utf-8 flag off on
On SunOS as_XML gives the character number encoded as entity, while on
linux its the utf-8 byte sequence, each byte itself encoded as entity.
The question are:
1. Is this intended in HTML::TreeBuilder? If so, why?
2. Is the entity encoding reliably attached to the utf-8 flag?
I can fix it, if the utf-8 flag reliably indicates the kind of entity
encoding.
Stefan
--------------------------------------------------------------------
Output of the script (search for 'DIRTY' and 'XML' in the script):
=== output on linux ===
| <erbse:stlocal>
| <erbse:stlocal> /save/st/tmp/html-parser.t
| 1..4
| DIRTY[utf-8]=<html>
| <head>
| <title>\x{C3}\x{A4}</title>
| <meta name="description" content="\x{C3}\x{A4}">
| </head>
| <body>
| 1111\x{C2}\x{A0}2222\x{C3}\x{86}3333
| </body>
| </html>
|
| XML[nativ]=<?xml version="1.0" encoding="UTF-8"?>
| <html><head> <title>ä</title> <meta
content="ä" name="description" /> </head><body> 1
| 111 2222Æ3333 </body> </html>
|
| ok 1 - iso-8859-1: No HTML entities
| ok 2 - Entities must be utf-8 encoded
| DIRTY[utf-8]=<html>
| <head>
| <title>\x{E7}\x{9B}\x{AE}</title>
| <meta name="description" content="\x{C3}\x{A4}"/>
| </head>
| </html>
|
| XML[nativ]=<?xml version="1.0" encoding="UTF-8"?>
| <html><head> <title>ç›®</title> <meta
content="ä" name="description" /> </head><body></body
| > </html>
|
| ok 3 - utf-8: chinese character
| ok 4 - utf-8: umlaut
| <erbse:stlocal>
=== output on sun ===
| <sun:st> /save/st/tmp/html-parser.t
| 1..4
| DIRTY[utf-8]=<html>
| <head>
| <title>\x{C3}\x{A4}</title>
| <meta name="description" content="\x{C3}\x{A4}">
| </head>
| <body>
| 1111\x{C2}\x{A0}2222\x{C3}\x{86}3333
| </body>
| </html>
|
| XML[utf-8]=<?xml version="1.0" encoding="UTF-8"?>
| <html><head> <title>ä</title> <meta content="ä"
name="description" /> </head><body>
1111 2222Æ3333 </body> </html>
|
| ok 1 - iso-8859-1: No HTML entities
| ok 2 - Entities must be utf-8 encoded
| DIRTY[utf-8]=<html>
| <head>
| <title>\x{E7}\x{9B}\x{AE}</title>
| <meta name="description" content="\x{C3}\x{A4}"/>
| </head>
| </html>
|
| XML[utf-8]=<?xml version="1.0" encoding="UTF-8"?>
| <html><head> <title>目</title> <meta content="ä"
name="description" /> </head><body></body> </html>
|
| ok 3 - utf-8: chinese character
| ok 4 - utf-8: umlaut
| <sun:st>
#!/usr/bin/perl -w
use strict;
use warnings;
use Encode;
use HTML::TreeBuilder;
use HTML::Entities;
use Test::More tests => 4;
sub debug {1}
my $XML_HEADER = qq(<?xml version="1.0" encoding="UTF-8"?>\n);
my %e2c =
map {$_ => pack("U", ord $HTML::Entities::entity2char{$_})}
grep {my $value = ord($HTML::Entities::entity2char{$_}); 128 <= $value &&
$value < 256}
keys %HTML::Entities::entity2char;
sub utf8 {Encode::is_utf8($_[0]) ? "utf-8" : "nativ"}
sub octet {
join("",
map {
$_ > 255 ? # if wide character...
sprintf("\\x{%04X}", $_) # \x{...}
: $_ > 127 ? # if 1xxxxxxx
sprintf("\\x{%02X}", $_) # \x..
: # else
chr($_) # as themselves
} unpack("C*", $_[0])
);
}
sub html_decode_entities_utf8 {
my ($value) = @_;
foreach ($value) {
s/(&\#(\d+);?)/ 128<=$2 && $2<256 ? pack("U", $2) : $1 /eg;
s/(&\#[xX]([0-9a-fA-F]+);?)/ my $c = hex($2); 128<=$c && $c<256 ?
pack("U", $c) : $1 /eg;
s/(&(\w+);?)/ $e2c{$2} || $1 /eg;
}
return $value;
}
sub to_xhtml {
my ($dirty_html, $encoding) = @_;
my $parser = new HTML::TreeBuilder();
$parser->no_space_compacting(1);
$parser->ignore_ignorable_whitespace(0);
# Encode $dirty_html to Perls internal encoding UTF-8.
#$dirty_html = octet_to_internal_utf8($encoding, $dirty_html);
$dirty_html = Encode::decode($encoding, $dirty_html);
# Decode HTML entities, because HTML::TreeBuilder doesn't handle it right.
# Can't use HTML::Entities::decode_entities because it uses 'chr($x)'
# instead of 'pack("U",$x)'
$dirty_html = html_decode_entities_utf8($dirty_html);
print STDERR "DIRTY[", utf8($dirty_html), "]=", octet($dirty_html), "\n" if
debug;
# Parse $dirty_html and encode all remaining bytes as html entities.
# That works because all non-ASCII UTF-8 character bytes are 1xxxxxxx
my $tree = $parser->parse($dirty_html);
my $xml = $XML_HEADER . $tree->as_XML();
# $xml has all byte encoded as &#xx;
$tree = $tree -> delete;
print STDERR "XML[", utf8($xml), "]=", octet($xml), "\n" if debug;
# Decode UTF-8 characters and control characters, $xml is ASCII
$xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("C", $2) /eg;
# Now we have an UTF-8 string and must Perl believe so too.
Encode::_utf8_on($xml);
return $xml;
}
sub check_file {
my ($filename, $encoding, $assert) = @_;
local *F;
open F, "<$filename" or die "Can't open file=$filename, $!";
binmode F;
my $content = do {local $/; <F>};
close F;
my $xhtml = to_xhtml($content, $encoding);
$assert->($xhtml);
}
MAIN: {
check_file("isolatin-simple.html", "iso-8859-1", sub {
my ($xhtml) = @_;
unlike($xhtml, qr/&#\d+;/, "iso-8859-1: No HTML entities");
$xhtml =~ m/(1111[^3]*3333)/;
my $out = join ":", unpack("C*", $1);
is($out, "49:49:49:49:194:160:50:50:50:50:195:134:51:51:51:51",
"Entities must be utf-8 encoded");
});
check_file("chinese.html", "utf-8", sub {
my ($xhtml) = @_;
like($xhtml, qr{<title>\x{76EE}</title>}, "utf-8: chinese character");
like($xhtml, qr{content="\xE4"}, "utf-8: umlaut");
});
}
1;
Title: Ã
1111 2222Æ3333
Title: ÃâÂ