Package: libxml-dom-perl Version: 1.43-4 Severity: normal Tags: patch Hi there!
I've run into trouble with encoding support in XML::DOM. In short, it fails to properly set the utf8 flag on long strings in non-ASCII single-byte encodings. That is, everything works fine on XML like <foo>Long string that is less than 1024 bytes when encoded to UTF-8</foo>. But as soon as the character data exceeds 1024 bytes, perl stops recognising it as utf8 data and requires manual intervention (Encode::decode_utf8) to work. The reason for it is the 'use bytes' pragma in DOM.pm. The patch provided removes it from global scope, adding it where necessary (at least for tests to work). It also includes a test for this situation, t/dom_encodings.t, and an input file for it, t/dom_encodings.xml. I haven't tested it extensively, but it works for my applications. Best regards, Alex. -- System Information: Debian Release: 3.1 APT prefers testing APT policy: (990, 'testing'), (500, 'unstable'), (1, 'experimental') Architecture: i386 (i686) Kernel: Linux 2.6.11-1-686-smp Locale: LANG=ru_RU.UTF-8, LC_CTYPE=ru_RU.UTF-8 (charmap=UTF-8) Versions of packages libxml-dom-perl depends on: ii libwww-perl 5.803-3 WWW client/server library for Perl ii libxml-parser-perl 2.34-4 Perl module for parsing XML files ii libxml-perl 0.08-1 Perl modules for working with XML ii libxml-regexp-perl 0.03-7 Perl module for regular expression ii perl 5.8.4-8 Larry Wall's Practical Extraction -- no debconf information
diff -urN libxml-dom-perl-1.43.orig/lib/XML/DOM.pm libxml-dom-perl-1.43/lib/XML/DOM.pm --- libxml-dom-perl-1.43.orig/lib/XML/DOM.pm 2003-07-29 05:46:43.000000000 +0700 +++ libxml-dom-perl-1.43/lib/XML/DOM.pm 2005-07-13 15:47:49.000000000 +0700 @@ -29,8 +29,6 @@ use strict; -use bytes; - use vars qw( $VERSION @ISA @EXPORT $IgnoreReadOnly $SafeMode $TagStyle %DefaultEntities %DecodeDefaultEntity @@ -405,6 +403,7 @@ # sub forgiving_isValidName { + use bytes; $_[0] =~ /^$XML::RegExp::Name$/o; } @@ -413,6 +412,7 @@ # sub picky_isValidName { + use bytes; $_[0] =~ /^$XML::RegExp::Name$/o and $_[0] !~ /^xml/i; } @@ -1243,6 +1247,7 @@ my ($self, $str) = @_; my $doctype = $self->[_Doc]->getDoctype; + use bytes; $str =~ s/&($XML::RegExp::Name|(#([0-9]+)|#x([0-9a-fA-F]+)));/ defined($2) ? XML::DOM::XmlUtf8Encode ($3 || hex ($4)) : expandEntityRef ($1, $doctype)/ego; diff -urN libxml-dom-perl-1.43.orig/MANIFEST libxml-dom-perl-1.43/MANIFEST --- libxml-dom-perl-1.43.orig/MANIFEST 2002-12-05 21:35:30.000000000 +0600 +++ libxml-dom-perl-1.43/MANIFEST 2005-07-13 14:36:21.000000000 +0700 @@ -42,6 +42,8 @@ t/dom_cdata.t t/dom_documenttype.t t/dom_encode.t +t/dom_encodings.t +t/dom_encodings.xml t/dom_example.t t/dom_extent.dtd t/dom_extent.ent diff -urN libxml-dom-perl-1.43.orig/t/dom_encodings.t libxml-dom-perl-1.43/t/dom_encodings.t --- libxml-dom-perl-1.43.orig/t/dom_encodings.t 1970-01-01 07:00:00.000000000 +0700 +++ libxml-dom-perl-1.43/t/dom_encodings.t 2005-07-13 14:49:03.000000000 +0700 @@ -0,0 +1,78 @@ +BEGIN {print "1..5\n";} +END {print "not ok 1\n" unless $loaded;} +use XML::DOM; +$loaded = 1; +print "ok 1\n"; + +my $test = 1; +sub assert_ok +{ + my $ok = shift; + print "not " unless $ok; + ++$test; + print "ok $test\n"; + $ok; +} + +# Replaces the filepath separator if necessary (i.e for Macs and Windows/DOS) +sub filename +{ + my $name = shift; + + if ((defined $^O and + $^O =~ /MSWin32/i || + $^O =~ /Windows_95/i || + $^O =~ /Windows_NT/i) || + (defined $ENV{OS} and + 
$ENV{OS} =~ /MSWin32/i || + $ENV{OS} =~ /Windows_95/i || + $ENV{OS} =~ /Windows_NT/i)) + { + $name =~ s!/!\\!g; + } + elsif ((defined $^O and $^O =~ /MacOS/i) || + (defined $ENV{OS} and $ENV{OS} =~ /MacOS/i)) + { + $name =~ s!/!:!g; + $name = ":$name"; + } + $name; +} + +######################### End of black magic. + +# Insert your test code below (better if it prints "ok 13" +# (correspondingly "not ok 13") depending on the success of chunk 13 +# of the test code): + +# Test 2 + + +my $parser = new XML::DOM::Parser; +unless (assert_ok ($parser)) +{ + exit; +} + +# Test 3 +my $doc; +eval { + $doc = $parser->parsefile (filename ('t/dom_encodings.xml')); +}; +print $@; +assert_ok (not $@); + +# Test 4 +$elem = $doc->getElementsByTagName('foo')->item(0); +$elem->normalize(); +$data = $elem->getFirstChild()->getData(); +#print STDERR "len: ", length $data, ": ", $data, "\n"; +assert_ok (length($data) == 513); + +# Test 5 +if ($] > 5.007) { + require Encode; + assert_ok(Encode::is_utf8($data)); +} else { + assert_ok(1); +} diff -urN libxml-dom-perl-1.43.orig/t/dom_encodings.xml libxml-dom-perl-1.43/t/dom_encodings.xml --- libxml-dom-perl-1.43.orig/t/dom_encodings.xml 1970-01-01 07:00:00.000000000 +0700 +++ libxml-dom-perl-1.43/t/dom_encodings.xml 2005-07-13 14:40:07.000000000 +0700 @@ -0,0 +1,3 @@ +<?xml version="1.0" encoding="iso-8859-5"?> +<!-- >512 chars inside foo --> +<foo>ÑÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐ</foo>