CC: Sean M. Burke, maintainer of HTML::TreeBuilder

"Josts Smokehouse" <[EMAIL PROTECTED]> wrote:

> FAIL webrobot-0.60 sun4-solaris-thread-multi 2.9

Confirmed on SunOS 5.7 / Perl 5.8.5
(luckily found a dusty, tucked away, old sun)

The fail reason is that HTML::TreeBuilder->as_XML behaves different on
linux vs. SunOs. I have attached a test script for HTML::TreeBuilder and
two necessary data files. The script will run standalone.

HTML::TreeBuilder->as_XML will return (see output section below):

source char         linux, win2000          SunOs 5.7
-----------------------------------------------------
umlaut-a            &#195;&#164;            &#228;
an asian char       &#231;&#155;&#174;      &#30446;
--- and note ----------------------------------------
utf-8 flag          off                     on

On SunOS as_XML gives the character number encoded as entity, while on
linux its the utf-8 byte sequence, each byte itself encoded as entity.

The question are:

1. Is this intended in HTML::TreeBuilder?  If so, why?
2. Is the entity encoding reliably attached to the utf-8 flag?

I can fix it, if the utf-8 flag reliably indicates the kind of entity
encoding.

Stefan
--------------------------------------------------------------------

Output of the script (search for 'DIRTY' and 'XML' in the script):

=== output on linux ===
| <erbse:stlocal>
| <erbse:stlocal> /save/st/tmp/html-parser.t
| 1..4
| DIRTY[utf-8]=<html>
|   <head>
|     <title>\x{C3}\x{A4}</title>
|     <meta name="description" content="\x{C3}\x{A4}">
|   </head>
|   <body>
|       1111\x{C2}\x{A0}2222\x{C3}\x{86}3333
|   </body>
| </html>
| 
| XML[nativ]=<?xml version="1.0" encoding="UTF-8"?>
| <html><head>&#10;    <title>&#195;&#164;</title>&#10;    <meta 
content="&#195;&#164;" name="description" />&#10;  </head><body>&#10;      1
| 111&#194;&#160;2222&#195;&#134;3333&#10;  </body>&#10;  &#10;  &#10;</html>
| 
| ok 1 - iso-8859-1: No HTML entities
| ok 2 - Entities must be utf-8 encoded
| DIRTY[utf-8]=<html>
|   <head>
|     <title>\x{E7}\x{9B}\x{AE}</title>
|     <meta name="description" content="\x{C3}\x{A4}"/>
|   </head>
| </html>
| 
| XML[nativ]=<?xml version="1.0" encoding="UTF-8"?>
| <html><head>&#10;    <title>&#231;&#155;&#174;</title>&#10;    <meta 
content="&#195;&#164;" name="description" />&#10;  </head><body></body
| >&#10;  &#10;</html>
| 
| ok 3 - utf-8: chinese character
| ok 4 - utf-8: umlaut
| <erbse:stlocal>

=== output on sun ===
| <sun:st> /save/st/tmp/html-parser.t
| 1..4
| DIRTY[utf-8]=<html>
|   <head>
|     <title>\x{C3}\x{A4}</title>
|     <meta name="description" content="\x{C3}\x{A4}">
|   </head>
|   <body>
|       1111\x{C2}\x{A0}2222\x{C3}\x{86}3333
|   </body>
| </html>
| 
| XML[utf-8]=<?xml version="1.0" encoding="UTF-8"?>
| <html><head>&#10;    <title>&#228;</title>&#10;    <meta content="&#228;" 
name="description" />&#10;  </head><body>&#10;      
1111&#160;2222&#198;3333&#10;  </body>&#10;  &#10;  &#10;</html>
| 
| ok 1 - iso-8859-1: No HTML entities
| ok 2 - Entities must be utf-8 encoded
| DIRTY[utf-8]=<html>
|   <head>
|     <title>\x{E7}\x{9B}\x{AE}</title>
|     <meta name="description" content="\x{C3}\x{A4}"/>
|   </head>
| </html>
| 
| XML[utf-8]=<?xml version="1.0" encoding="UTF-8"?>
| <html><head>&#10;    <title>&#30446;</title>&#10;    <meta content="&#228;" 
name="description" />&#10;  </head><body></body>&#10;  &#10;</html>
| 
| ok 3 - utf-8: chinese character
| ok 4 - utf-8: umlaut
| <sun:st> 

#!/usr/bin/perl -w
use strict;
use warnings;

use Encode;
use HTML::TreeBuilder;
use HTML::Entities;
use Test::More tests => 4;

sub debug {1}


my $XML_HEADER = qq(<?xml version="1.0" encoding="UTF-8"?>\n);
my %e2c =
    map {$_ => pack("U", ord $HTML::Entities::entity2char{$_})}
    grep {my $value = ord($HTML::Entities::entity2char{$_}); 128 <= $value && 
$value < 256}
    keys %HTML::Entities::entity2char;

sub utf8 {Encode::is_utf8($_[0]) ? "utf-8" : "nativ"}

sub octet {
    join("",
        map {
            $_ > 255 ?                      # if wide character...
                sprintf("\\x{%04X}", $_)    #     \x{...}
            : $_ > 127 ?                    # if 1xxxxxxx
                sprintf("\\x{%02X}", $_)      #     \x..
            :                               # else
                chr($_)                     #     as themselves
        } unpack("C*", $_[0])
    );
}

sub html_decode_entities_utf8 {
    my ($value) = @_;
    foreach ($value) {
        s/(&\#(\d+);?)/ 128<=$2 && $2<256 ? pack("U", $2) : $1 /eg;
        s/(&\#[xX]([0-9a-fA-F]+);?)/ my $c = hex($2); 128<=$c && $c<256 ? 
pack("U", $c) : $1 /eg;
        s/(&(\w+);?)/ $e2c{$2} || $1 /eg;
    }
    return $value;
}

sub to_xhtml {
    my ($dirty_html, $encoding) = @_;

    my $parser = new HTML::TreeBuilder();
    $parser->no_space_compacting(1);
    $parser->ignore_ignorable_whitespace(0);

    # Encode $dirty_html to Perls internal encoding UTF-8.
    #$dirty_html = octet_to_internal_utf8($encoding, $dirty_html);
    $dirty_html = Encode::decode($encoding, $dirty_html);

    # Decode HTML entities, because HTML::TreeBuilder doesn't handle it right.
    # Can't use HTML::Entities::decode_entities because it uses 'chr($x)'
    # instead of 'pack("U",$x)'
    $dirty_html = html_decode_entities_utf8($dirty_html);
    print STDERR "DIRTY[", utf8($dirty_html), "]=", octet($dirty_html), "\n" if 
debug;

    # Parse $dirty_html and encode all remaining bytes as html entities.
    # That works because all non-ASCII UTF-8 character bytes are 1xxxxxxx
    my $tree = $parser->parse($dirty_html);
    my $xml = $XML_HEADER . $tree->as_XML();
    # $xml has all byte encoded as &#xx;
    $tree = $tree -> delete;

    print STDERR "XML[", utf8($xml), "]=", octet($xml), "\n" if debug;
    # Decode UTF-8 characters and control characters, $xml is ASCII
    $xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("C", $2) /eg;

    # Now we have an UTF-8 string and must Perl believe so too.
    Encode::_utf8_on($xml);

    return $xml;
}

sub check_file {
    my ($filename, $encoding, $assert) = @_;

    local *F;
    open F, "<$filename" or die "Can't open file=$filename, $!";
    binmode F;
    my $content = do {local $/; <F>};
    close F;

    my $xhtml = to_xhtml($content, $encoding);
    $assert->($xhtml);
}


MAIN: {
    check_file("isolatin-simple.html", "iso-8859-1", sub {
        my ($xhtml) = @_;
        unlike($xhtml, qr/&#\d+;/, "iso-8859-1: No HTML entities");
        $xhtml =~ m/(1111[^3]*3333)/;
        my $out = join ":", unpack("C*", $1);
        is($out, "49:49:49:49:194:160:50:50:50:50:195:134:51:51:51:51",
            "Entities must be utf-8 encoded");
    });
    check_file("chinese.html", "utf-8", sub {
        my ($xhtml) = @_;
        like($xhtml, qr{<title>\x{76EE}</title>}, "utf-8: chinese character");
        like($xhtml, qr{content="\xE4"}, "utf-8: umlaut");
    });
}


1;
Title: Ã
1111 2222Æ3333
Title: ÃâÂ

Reply via email to