Package: libxml-dom-perl
Version: 1.43-4
Severity: normal
Tags: patch

Hi there!

I've run into trouble with encoding support in XML::DOM. In short, it
fails to properly set the utf8 flag on long strings in non-ASCII
single-byte encodings. That is, everything works fine on XML like

<foo>Long string that is less than 1024 bytes when encoded to
UTF-8</foo>

But as soon as the character data exceeds 1024 bytes, perl stops recognising
it as utf8 data and requires manual intervention (Encode::decode_utf8)
to work.

The reason for this is the 'use bytes' pragma in DOM.pm.

The provided patch removes it from the global scope, adding it back only where
necessary (at least for the tests to pass). It also includes a test for this
situation, t/dom_encodings.t, and an input file for it, t/dom_encodings.xml.
I haven't tested it extensively, but it works for my applications.

Best regards,
Alex.

-- System Information:
Debian Release: 3.1
  APT prefers testing
  APT policy: (990, 'testing'), (500, 'unstable'), (1, 'experimental')
Architecture: i386 (i686)
Kernel: Linux 2.6.11-1-686-smp
Locale: LANG=ru_RU.UTF-8, LC_CTYPE=ru_RU.UTF-8 (charmap=UTF-8)

Versions of packages libxml-dom-perl depends on:
ii  libwww-perl                   5.803-3    WWW client/server library for Perl
ii  libxml-parser-perl            2.34-4     Perl module for parsing XML files
ii  libxml-perl                   0.08-1     Perl modules for working with XML
ii  libxml-regexp-perl            0.03-7     Perl module for regular expression
ii  perl                          5.8.4-8    Larry Wall's Practical Extraction 

-- no debconf information
diff -urN libxml-dom-perl-1.43.orig/lib/XML/DOM.pm libxml-dom-perl-1.43/lib/XML/DOM.pm
--- libxml-dom-perl-1.43.orig/lib/XML/DOM.pm    2003-07-29 05:46:43.000000000 +0700
+++ libxml-dom-perl-1.43/lib/XML/DOM.pm 2005-07-13 15:47:49.000000000 +0700
@@ -29,8 +29,6 @@
 
 use strict;
 
-use bytes;
-
 use vars qw( $VERSION @ISA @EXPORT
             $IgnoreReadOnly $SafeMode $TagStyle
             %DefaultEntities %DecodeDefaultEntity
@@ -405,6 +403,7 @@
 #
 sub forgiving_isValidName
 {
+    use bytes;
     $_[0] =~ /^$XML::RegExp::Name$/o;
 }
 
@@ -413,6 +412,7 @@
 #
 sub picky_isValidName
 {
+    use bytes;
     $_[0] =~ /^$XML::RegExp::Name$/o and $_[0] !~ /^xml/i;
 }
 
@@ -1243,6 +1247,7 @@
     my ($self, $str) = @_;
     my $doctype = $self->[_Doc]->getDoctype;
 
+    use bytes;
     $str =~ s/&($XML::RegExp::Name|(#([0-9]+)|#x([0-9a-fA-F]+)));/
        defined($2) ? XML::DOM::XmlUtf8Encode ($3 || hex ($4)) 
                    : expandEntityRef ($1, $doctype)/ego;
diff -urN libxml-dom-perl-1.43.orig/MANIFEST libxml-dom-perl-1.43/MANIFEST
--- libxml-dom-perl-1.43.orig/MANIFEST  2002-12-05 21:35:30.000000000 +0600
+++ libxml-dom-perl-1.43/MANIFEST       2005-07-13 14:36:21.000000000 +0700
@@ -42,6 +42,8 @@
 t/dom_cdata.t
 t/dom_documenttype.t
 t/dom_encode.t
+t/dom_encodings.t
+t/dom_encodings.xml
 t/dom_example.t
 t/dom_extent.dtd
 t/dom_extent.ent
diff -urN libxml-dom-perl-1.43.orig/t/dom_encodings.t libxml-dom-perl-1.43/t/dom_encodings.t
--- libxml-dom-perl-1.43.orig/t/dom_encodings.t 1970-01-01 07:00:00.000000000 +0700
+++ libxml-dom-perl-1.43/t/dom_encodings.t      2005-07-13 14:49:03.000000000 +0700
@@ -0,0 +1,78 @@
+BEGIN {print "1..5\n";}
+END {print "not ok 1\n" unless $loaded;}
+use XML::DOM;
+$loaded = 1;
+print "ok 1\n";
+
+my $test = 1;
+sub assert_ok
+{
+    my $ok = shift;
+    print "not " unless $ok;
+    ++$test;
+    print "ok $test\n";
+    $ok;
+}
+
+# Replaces the filepath separator if necessary (i.e for Macs and Windows/DOS)
+sub filename
+{
+    my $name = shift;
+
+    if ((defined $^O and
+        $^O =~ /MSWin32/i ||
+        $^O =~ /Windows_95/i ||
+        $^O =~ /Windows_NT/i) ||
+       (defined $ENV{OS} and
+        $ENV{OS} =~ /MSWin32/i ||
+        $ENV{OS} =~ /Windows_95/i ||
+        $ENV{OS} =~ /Windows_NT/i))
+    {
+       $name =~ s!/!\\!g;
+    }
+    elsif  ((defined $^O and $^O =~ /MacOS/i) ||
+           (defined $ENV{OS} and $ENV{OS} =~ /MacOS/i))
+    {
+       $name =~ s!/!:!g;
+       $name = ":$name";
+    }
+    $name;
+}
+
+######################### End of black magic.
+
+# Insert your test code below (better if it prints "ok 13"
+# (correspondingly "not ok 13") depending on the success of chunk 13
+# of the test code):
+
+# Test 2
+
+
+my $parser = new XML::DOM::Parser;
+unless (assert_ok ($parser))
+{
+    exit;
+}
+
+# Test 3
+my $doc;
+eval {
+    $doc = $parser->parsefile (filename ('t/dom_encodings.xml'));
+};
+print $@;
+assert_ok (not $@);
+
+# Test 4
+$elem = $doc->getElementsByTagName('foo')->item(0);
+$elem->normalize();
+$data = $elem->getFirstChild()->getData();
+#print STDERR "len: ", length $data, ": ", $data, "\n";
+assert_ok (length($data) == 513);
+
+# Test 5
+if ($] > 5.007) {
+    require Encode;
+    assert_ok(Encode::is_utf8($data));
+} else {
+    assert_ok(1);
+}
diff -urN libxml-dom-perl-1.43.orig/t/dom_encodings.xml libxml-dom-perl-1.43/t/dom_encodings.xml
--- libxml-dom-perl-1.43.orig/t/dom_encodings.xml       1970-01-01 07:00:00.000000000 +0700
+++ libxml-dom-perl-1.43/t/dom_encodings.xml    2005-07-13 14:40:07.000000000 +0700
@@ -0,0 +1,3 @@
+<?xml version="1.0" encoding="iso-8859-5"?>
+<!-- >512 chars inside foo -->
+<foo>ÑÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐ</foo>

Reply via email to