I hope this is the right place to post this.
It looks to me as if HTML::Entities (or is it HTML::Parser?) decode_entities doesn't do the right thing with Unicode strings.
The code below gives warnings about malformed Unicode, but works if I replace decode_entities with HTML::Entities::decode_entities_old or don't force the string to Unicode.
This is with ActivePerl build 810, HTML::Entities 1.27 and HTML::Parser 3.36.
/Sune
*** output with decode_entities; note that Latin-1 code points are substituted for the entities
c:\MyDocs>perl -w t3.pl
&Agrave; l'exception des &Eacute;tats-Unis
$abs is utf8
$abs is utf8 after decode
â l'exception des âtats-Unis
\x{00C0}\ l\'exception\ des\ \x{00C9}tats\-Unis
Malformed UTF-8 character (unexpected non-continuation byte 0x20, immediately after start byte 0xc0) in substitution (s///) at t3.pl line 19.
Malformed UTF-8 character (unexpected non-continuation byte 0x74, immediately after start byte 0xc9) in substitution (s///) at t3.pl line 19.
Malformed UTF-8 character (unexpected non-continuation byte 0x74, immediately after start byte 0xc9) in substitution (s///) at t3.pl line 19.
*** code
# Reproduction script: decode_entities() applied to a string that has been
# forced to Perl's internal Unicode (UTF8-flagged) representation.
use strict;
use warnings;
use HTML::Entities;

# Print through the :utf8 layer so non-ASCII characters are encoded on output.
binmode( STDOUT, ':utf8' );

# NOTE(review): the archived copy of this script showed literal accented
# characters here, but the report is about entities being decoded, so the
# input must have contained entities. The named forms below are a
# reconstruction -- confirm against the original message.
my $abs = "&Agrave; l'exception des &Eacute;tats-Unis";

# make Unicode: rebuild the string from its 8-bit values as code points,
# which turns on the UTF8 flag (reported by utf8::is_utf8 below).
$abs = pack( "U*", unpack( "C*", $abs ) );
print "$abs\n";
print '$abs is ', utf8::is_utf8( $abs ) ? '' : 'not ', 'utf8', " \n";

# Called in void context: $abs is expected to be modified in place.
decode_entities( $abs );
#HTML::Entities::decode_entities_old( $abs );

print '$abs is ', utf8::is_utf8( $abs ) ? '' : 'not ', 'utf8', " after decode \n";
print "$abs\n";
print nice_string( $abs ), "\n";

# this gives warning about malformed Unicode
$abs =~ s/\s\w+\s*$//;
# Render a string as printable ASCII for debugging.
#
# The argument is unpacked with the "C*" template and each resulting
# value is rendered as:
#   value > 127        ->  \x{XXXX}  (at least four upper-case hex digits)
#   control character  ->  \xXX
#   anything else      ->  the character passed through quotemeta()
#
# Returns the concatenation of the rendered pieces (empty string for
# empty input).
#
# NOTE(review): "C" is the unsigned-char template, not the code-point
# template "U"; what it yields for a UTF8-flagged string differs between
# Perl versions. It is kept as-is because the report appears to rely on it
# to expose the stored bytes -- confirm before changing.
sub nice_string {
    my ($string) = @_;

    return join(
        "",
        map {
              $_ > 127                 ? sprintf( "\\x{%04X}", $_ )    # non-ASCII value
            : chr($_) =~ /[[:cntrl:]]/ ? sprintf( "\\x%02X",   $_ )    # control character
            :                            quotemeta( chr($_) )          # quoted or as itself
        } unpack( "C*", $string )
    );
}
--
Sune Karlsson                       | Fax:   + 46 8 34 81 61
Stockholm School of Economics       | Phone: + 46 8 736 92 39
Box 6501, 113 83 Stockholm, Sweden  | http://www.hhs.se/personal/SuneK/
http://econpapers.hhs.se/           | http://swopec.hhs.se/