The following patch should make sure that HTML::Parser does not
produce badly encoded SVs.  That avoids the problem demonstrated, but I
still need to track down why perl itself segfaulted because of this.

Regards,
Gisle

Index: util.c
===================================================================
RCS file: /cvsroot/libwww-perl/html-parser/util.c,v
retrieving revision 2.20
retrieving revision 2.21
diff -u -p -r2.20 -r2.21
--- util.c      8 Nov 2004 14:14:35 -0000       2.20
+++ util.c      10 Nov 2004 13:32:56 -0000      2.21
@@ -209,23 +209,21 @@ decode_entities(pTHX_ SV* sv, HV* entity
            }
 
            if (!SvUTF8(sv) && repl_utf8) {
-               STRLEN len = t - SvPVX(sv);
-               if (len) {
-                   /* need to upgrade the part that we have looked though */
-                   STRLEN old_len = len;
-                   char *ustr = bytes_to_utf8(SvPVX(sv), &len);
-                   STRLEN grow = len - old_len;
-                   if (grow) {
-                       /* XXX It might already be enough gap, so we don't need this,
-                          but it should not hurt either.
-                       */
-                       grow_gap(aTHX_ sv, grow, &t, &s, &end);
-                       Copy(ustr, SvPVX(sv), len, char);
-                       t = SvPVX(sv) + len;
-                   }
-                   Safefree(ustr);
-               }
+               /* need to upgrade sv before we continue */
+               STRLEN before_gap_len = t - SvPVX(sv);
+               char *before_gap = bytes_to_utf8(SvPVX(sv), &before_gap_len);
+               STRLEN after_gap_len = end - s;
+               char *after_gap = bytes_to_utf8(s, &after_gap_len);
+
+               sv_setpvn(sv, before_gap, before_gap_len);
+               sv_catpvn(sv, after_gap, after_gap_len);
                SvUTF8_on(sv);
+
+               Safefree(before_gap);
+               Safefree(after_gap);
+
+               s = t = SvPVX(sv) + before_gap_len;
+               end = SvPVX(sv) + before_gap_len + after_gap_len;
            }
            else if (SvUTF8(sv) && !repl_utf8) {
                repl = bytes_to_utf8(repl, &repl_len);
Index: t/uentities.t
===================================================================
RCS file: /cvsroot/libwww-perl/html-parser/t/uentities.t,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -p -r1.8 -r1.9
--- t/uentities.t       8 Nov 2004 14:14:42 -0000       1.8
+++ t/uentities.t       10 Nov 2004 13:33:03 -0000      1.9
@@ -14,7 +14,7 @@ unless (&HTML::Entities::UNICODE_SUPPORT
     exit;
 }
 
-print "1..13\n";
+print "1..14\n";
 
 print "not " unless decode_entities("&euro") eq "\x{20AC}";
 print "ok 1\n";
@@ -90,3 +90,6 @@ print "ok 12\n";
 
 print "not " unless decode_entities("&#56256") eq chr(0xFFFD);
 print "ok 13\n";
+
+print "not " unless decode_entities("\260&rsquo;\260") eq "\x{b0}\x{2019}\x{b0}";
+print "ok 14\n";

Reply via email to