The following patch should make sure that HTML::Parser does not produce badly encoded SVs. That avoids the problem demonstrated, but I still need to track down why perl itself segfaulted because of this.
Regards, Gisle Index: util.c =================================================================== RCS file: /cvsroot/libwww-perl/html-parser/util.c,v retrieving revision 2.20 retrieving revision 2.21 diff -u -p -r2.20 -r2.21 --- util.c 8 Nov 2004 14:14:35 -0000 2.20 +++ util.c 10 Nov 2004 13:32:56 -0000 2.21 @@ -209,23 +209,21 @@ decode_entities(pTHX_ SV* sv, HV* entity } if (!SvUTF8(sv) && repl_utf8) { - STRLEN len = t - SvPVX(sv); - if (len) { - /* need to upgrade the part that we have looked though */ - STRLEN old_len = len; - char *ustr = bytes_to_utf8(SvPVX(sv), &len); - STRLEN grow = len - old_len; - if (grow) { - /* XXX It might already be enough gap, so we don't need this, - but it should not hurt either. - */ - grow_gap(aTHX_ sv, grow, &t, &s, &end); - Copy(ustr, SvPVX(sv), len, char); - t = SvPVX(sv) + len; - } - Safefree(ustr); - } + /* need to upgrade sv before we continue */ + STRLEN before_gap_len = t - SvPVX(sv); + char *before_gap = bytes_to_utf8(SvPVX(sv), &before_gap_len); + STRLEN after_gap_len = end - s; + char *after_gap = bytes_to_utf8(s, &after_gap_len); + + sv_setpvn(sv, before_gap, before_gap_len); + sv_catpvn(sv, after_gap, after_gap_len); SvUTF8_on(sv); + + Safefree(before_gap); + Safefree(after_gap); + + s = t = SvPVX(sv) + before_gap_len; + end = SvPVX(sv) + before_gap_len + after_gap_len; } else if (SvUTF8(sv) && !repl_utf8) { repl = bytes_to_utf8(repl, &repl_len); Index: t/uentities.t =================================================================== RCS file: /cvsroot/libwww-perl/html-parser/t/uentities.t,v retrieving revision 1.8 retrieving revision 1.9 diff -u -p -r1.8 -r1.9 --- t/uentities.t 8 Nov 2004 14:14:42 -0000 1.8 +++ t/uentities.t 10 Nov 2004 13:33:03 -0000 1.9 @@ -14,7 +14,7 @@ unless (&HTML::Entities::UNICODE_SUPPORT exit; } -print "1..13\n"; +print "1..14\n"; print "not " unless decode_entities("&euro") eq "\x{20AC}"; print "ok 1\n"; @@ -90,3 +90,6 @@ print "ok 12\n"; print "not " unless decode_entities("�") eq 
chr(0xFFFD); print "ok 13\n"; + +print "not " unless decode_entities("\260’\260") eq "\x{b0}\x{2019}\x{b0}"; +print "ok 14\n";