Bug#545457: Revised patch

2009-09-24 Thread Muharem Hrnjadovic
Joey Hess wrote:
 Muharem Hrnjadovic wrote:
 sorry for not responding earlier. What would be the best way to perform
 the suggested full-archive test?
 
 * Clone pristine-tar git repo.
 * git checkout origin/testsuite
 * Use the testexternal target in the Makefile, which expects you to
   provide it with `tars` file listing filenames of tar files to test.
   (I use a local mirror to get those.)
 
 I'd run a test now, but my machine with the local mirror has had a disk
 fail.

I have run the extension against appox. 1200 packages by now and fixed
another few bugs (incremental diff enclosed). It will take some time to
go through all of them .. will keep you posted.

Best regards

-- 
Muharem Hrnjadovic muha...@ubuntu.com
Public key id   : B2BBFCFC
Key fingerprint : A5A3 CC67 2B87 D641 103F  5602 219F 6B60 B2BB FCFC
=== modified file 'Ptutils.pm'
--- Ptutils.pm	2009-09-23 22:33:13 +
+++ Ptutils.pm	2009-09-24 13:16:55 +
@@ -44,12 +44,11 @@
 use strict;
 use warnings;
 use Cwd qw(realpath);
-use Encode qw(encode decode);
 use File::Find;
 use File::Spec::Functions qw(abs2rel canonpath catdir splitdir splitpath);
 use List::MoreUtils qw(zip);
 
-my $debug=1;
+my $debug=0;
 
 sub debug {
 	message(@_) if $debug;
@@ -76,6 +75,8 @@
 chomp $_;
 # Strip off trailing slashes.
 s,/*$,,;
+# Condense multiple slashes to one.
+s,//+,/,;
 # Unicode code points (e.g. \201) are read as text i.e. as four
 # characters and *not* as a single byte. We convert them to bytes
 # here.
@@ -84,6 +85,8 @@
 }
 close IN;
 
+# debug(\n\n\n-- TAR : $#manifest_entries, join(, , @manifest_entries));
+# debug(\n\n\n-- DIR : $#paths_in_tree, join(, , @paths_in_tree));
 my @path_prefixes = normalizepaths(\...@paths_in_tree, \...@manifest_entries);
 
 if ($#path_prefixes  0) {
@@ -93,12 +96,14 @@
 
 # Compare the paths found in the manifest with the ones in the local
 # local tree.
+# debug(\n\n\n-- TAR : $#manifest_entries, join(, , @manifest_entries));
+# debug(\n\n\n-- DIR : $#paths_in_tree, join(, , @paths_in_tree));
 my %seen;
 @seen {...@manifest_entries} = ();
 delete @seen {...@paths_in_tree};
 @missing_in_tree = sort(keys %seen);
 
-debug(Missing : $#missing_in_tree, join(, , @missing_in_tree));
+# debug(Missing : $#missing_in_tree, join(, , @missing_in_tree));
 return \...@missing_in_tree;
 }
 
@@ -135,8 +140,10 @@
 my $paths_in_tree = shift;
 my $manifest_entries = shift;
 
-# A list of all paths in local tree that are files.
-my @local_files = grep { -f $_ } @$paths_in_tree;
+# A list of all paths in local tree that are either files or links. The
+# latter category is needed when packages don't contain a single proper
+# file (example: binutils-z80_2.19.51.20090827-2).
+my @local_files = grep { -f $_ || -l $_ } @$paths_in_tree;
 # Sort by base name frequency.
 sortbybasenamefrequency(\...@local_files);
 # The local path selected eventually.
@@ -200,14 +207,16 @@
 while(my ($prefix, $paths) = each(%pdata)) {
 # Do we need to strip off prefixes for this set of paths?
 if (length($prefix)  0) {
+# Escape regex meta chars in path prefixes (example:
+# aptoncd-0.1.98+bzr112)
+$prefix =~ s![+?*]!\\$!g;
+debug(\n\n\n## , $prefix);
 map { $_ = canonpath($_); s,^$prefix/?,,; $_ } @$paths;
-# Discard all paths that are substrings of the prefix.
-@$paths = grep { index($prefix/, $_)  0 } @$paths;
 }
 }
 }
 
-debug(\n** PPS: $#$prefixes, join(, , @$prefixes));
+debug(\n\n\n** PPS: $#$prefixes, join(, , @$prefixes));
 return $prefixes;
 }
 

=== modified file 'pristine-tar'
--- pristine-tar	2009-09-23 22:33:13 +
+++ pristine-tar	2009-09-23 23:06:08 +
@@ -205,17 +205,17 @@
 	my $source=shift;
 	my %optio...@_;
 	
-	my @manifest;
-	open (IN, $tempdir/manifest) || die $tempdir/manifest: $!;
-	while (IN) {
-		chomp;
+my @manifest;
+open (IN, $tempdir/manifest) || die $tempdir/manifest: $!;
+while (IN) {
+chomp;
 # Unicode code points (e.g. \201) are read as text i.e. as four
 # characters and *not* as a single byte. We convert them to bytes
 # here.
 s/\\(\d{3})/chr(0$1)/eeg;
-		push @manifest, $_;
-	}
-	close IN;
+push @manifest, $_;
+}
+close IN;
 
 	# The manifest and source should have the same filenames,
 	# but the manifest probably has all the files under a common

=== modified file 't/gendelta.t'
--- t/gendelta.t	2009-09-09 16:02:36 +
+++ t/gendelta.t	2009-09-24 12:43:41 +
@@ -98,8 +98,6 @@
 # !! TEST SETUP !!
 # This mimics what would be found in the (pristine) tar file.
 my @manifest_entries = qw(
-a/
-a/b/
 a/b/apg-2.2.3.dfsg.1/
 a/b/apg-2.2.3.dfsg.1/cast/
 

Bug#545457: Revised patch

2009-09-23 Thread Muharem Hrnjadovic
Joey Hess wrote:
 Muharem Hrnjadovic wrote:
 Also, the
 tar-generated manifest sometimes contains filenames encoded in ways that
 pristine-tar does not understand, but tar does, and your patch would
 certianly fail then.
 Could you please provide more detail on this. This is something I'd like
 to address as well.
 
 Well, I can't remember examples of such tarballs, but I remember them
 showing up in testing of pristine-tar against the whole Debian archive,
 and having to fix it to deal with that. That's why the $full_sweep code
 exists.
 
 The best thing to do will be to try your code in such a full-archive
 test and see what breaks.
 

OK .. I am downloading the 5 DVDs from
http://cdimage.debian.org/debian-cd/5.0.3/amd64/bt-dvd/ and will test
the extension against the tar balls found on these.

Best regards

-- 
Muharem Hrnjadovic muha...@ubuntu.com
Public key id   : B2BBFCFC
Key fingerprint : A5A3 CC67 2B87 D641 103F  5602 219F 6B60 B2BB FCFC



signature.asc
Description: OpenPGP digital signature


Bug#545457: Revised patch

2009-09-23 Thread Muharem Hrnjadovic
Muharem Hrnjadovic wrote:
 Joey Hess wrote:
 Muharem Hrnjadovic wrote:
 Also, the
 tar-generated manifest sometimes contains filenames encoded in ways that
 pristine-tar does not understand, but tar does, and your patch would
 certianly fail then.
 Could you please provide more detail on this. This is something I'd like
 to address as well.
 Well, I can't remember examples of such tarballs, but I remember them
 showing up in testing of pristine-tar against the whole Debian archive,
 and having to fix it to deal with that. That's why the $full_sweep code
 exists.

 The best thing to do will be to try your code in such a full-archive
 test and see what breaks.

 
 OK .. I am downloading the 5 DVDs from
 http://cdimage.debian.org/debian-cd/5.0.3/amd64/bt-dvd/ and will test
 the extension against the tar balls found on these.
 
 Best regards
 

hmm .. just learned that these DVDs do not carry the sources. So
cancelling that.

Best regards

-- 
Muharem Hrnjadovic muha...@ubuntu.com
Public key id   : B2BBFCFC
Key fingerprint : A5A3 CC67 2B87 D641 103F  5602 219F 6B60 B2BB FCFC



signature.asc
Description: OpenPGP digital signature


Bug#545457: Revised patch

2009-09-23 Thread Joey Hess
Muharem Hrnjadovic wrote:
 sorry for not responding earlier. What would be the best way to perform
 the suggested full-archive test?

* Clone pristine-tar git repo.
* git checkout origin/testsuite
* Use the testexternal target in the Makefile, which expects you to
  provide it with `tars` file listing filenames of tar files to test.
  (I use a local mirror to get those.)

I'd run a test now, but my machine with the local mirror has had a disk
fail.

-- 
see shy jo


signature.asc
Description: Digital signature


Bug#545457: Revised patch

2009-09-23 Thread Muharem Hrnjadovic
Joey Hess wrote:
 Muharem Hrnjadovic wrote:
 sorry for not responding earlier. What would be the best way to perform
 the suggested full-archive test?
 
 * Clone pristine-tar git repo.
 * git checkout origin/testsuite
 * Use the testexternal target in the Makefile, which expects you to
   provide it with `tars` file listing filenames of tar files to test.
   (I use a local mirror to get those.)
 
 I'd run a test now, but my machine with the local mirror has had a disk
 fail.
 

Hello Joey,

thanks for the advice, I already found an alternative way to test.
Currently, the extension is reporting a false positive for
acidbase_1.3.9.orig.tar.gz

AFAICS it's a unicode issue. The tar ball is using a 'UTF-8' encoding.
I am presently trying to revise the code to handle paths containing
unicode characters ..

Best regards

-- 
Muharem Hrnjadovic muha...@ubuntu.com
Public key id   : B2BBFCFC
Key fingerprint : A5A3 CC67 2B87 D641 103F  5602 219F 6B60 B2BB FCFC



signature.asc
Description: OpenPGP digital signature


Bug#545457: Revised patch

2009-09-23 Thread Muharem Hrnjadovic
Joey Hess wrote:
 Muharem Hrnjadovic wrote:
 sorry for not responding earlier. What would be the best way to perform
 the suggested full-archive test?
 
 * Clone pristine-tar git repo.
 * git checkout origin/testsuite
 * Use the testexternal target in the Makefile, which expects you to
   provide it with `tars` file listing filenames of tar files to test.
   (I use a local mirror to get those.)
 
 I'd run a test now, but my machine with the local mirror has had a disk
 fail.

Alright. I fixed the unicode issue that surfaced while processing the
acidbase_1.3.9.orig.tar.gz tar ball (please see enclosed, incremental diff).

I am seeing another problem with the acpidump_20071116.orig.tar.gz tar
ball but it's bed time for me, so, will tackle that tomorrow.

Best regards

-- 
Muharem Hrnjadovic muha...@ubuntu.com
Public key id   : B2BBFCFC
Key fingerprint : A5A3 CC67 2B87 D641 103F  5602 219F 6B60 B2BB FCFC
=== modified file 'Ptutils.pm'
--- Ptutils.pm	2009-09-09 16:55:42 +
+++ Ptutils.pm	2009-09-23 22:48:14 +
@@ -41,9 +41,10 @@
 @ISA = (Exporter);
 @EXPORT = qw(checkmanifest);
 
+use strict;
 use warnings;
-use strict;
 use Cwd qw(realpath);
+use Encode qw(encode decode);
 use File::Find;
 use File::Spec::Functions qw(abs2rel canonpath catdir splitdir splitpath);
 use List::MoreUtils qw(zip);
@@ -75,6 +76,10 @@
 chomp $_;
 # Strip off trailing slashes.
 s,/*$,,;
+# Unicode code points (e.g. \201) are read as text i.e. as four
+# characters and *not* as a single byte. We convert them to bytes
+# here.
+s/\\(\d{3})/chr(0$1)/eeg;
 push(@manifest_entries, $_);
 }
 close IN;

=== modified file 'pristine-tar'
--- pristine-tar	2009-09-09 16:02:36 +
+++ pristine-tar	2009-09-23 23:06:08 +
@@ -205,13 +205,17 @@
 	my $source=shift;
 	my %optio...@_;
 	
-	my @manifest;
-	open (IN, $tempdir/manifest) || die $tempdir/manifest: $!;
-	while (IN) {
-		chomp;
-		push @manifest, $_;
-	}
-	close IN;
+my @manifest;
+open (IN, $tempdir/manifest) || die $tempdir/manifest: $!;
+while (IN) {
+chomp;
+# Unicode code points (e.g. \201) are read as text i.e. as four
+# characters and *not* as a single byte. We convert them to bytes
+# here.
+s/\\(\d{3})/chr(0$1)/eeg;
+push @manifest, $_;
+}
+close IN;
 
 	# The manifest and source should have the same filenames,
 	# but the manifest probably has all the files under a common



signature.asc
Description: OpenPGP digital signature


Bug#545457: Revised patch

2009-09-16 Thread Muharem Hrnjadovic
Joey Hess wrote:
 Muharem Hrnjadovic wrote:
 Also, the
 tar-generated manifest sometimes contains filenames encoded in ways that
 pristine-tar does not understand, but tar does, and your patch would
 certianly fail then.
 Could you please provide more detail on this. This is something I'd like
 to address as well.
 
 Well, I can't remember examples of such tarballs, but I remember them
 showing up in testing of pristine-tar against the whole Debian archive,
 and having to fix it to deal with that. That's why the $full_sweep code
 exists.
 
 The best thing to do will be to try your code in such a full-archive
 test and see what breaks.

Hello Joey,

sorry for not responding earlier. What would be the best way to perform
the suggested full-archive test?

Best regards

-- 
Muharem Hrnjadovic muha...@ubuntu.com
Public key id   : B2BBFCFC
Key fingerprint : A5A3 CC67 2B87 D641 103F  5602 219F 6B60 B2BB FCFC



signature.asc
Description: OpenPGP digital signature


Bug#545457: Revised patch

2009-09-11 Thread Joey Hess
Muharem Hrnjadovic wrote:
  Also, the
  tar-generated manifest sometimes contains filenames encoded in ways that
  pristine-tar does not understand, but tar does, and your patch would
  certianly fail then.
 
 Could you please provide more detail on this. This is something I'd like
 to address as well.

Well, I can't remember examples of such tarballs, but I remember them
showing up in testing of pristine-tar against the whole Debian archive,
and having to fix it to deal with that. That's why the $full_sweep code
exists.

The best thing to do will be to try your code in such a full-archive
test and see what breaks.

-- 
see shy jo


signature.asc
Description: Digital signature


Bug#545457: Revised patch

2009-09-09 Thread Muharem Hrnjadovic
Joey Hess wrote:
 Muharem Hrnjadovic wrote:
  pristine-tar's gendelta command does not check whether all paths in
  the manifest are also present in the local/unpacked source tree.

 Nor should it; pristine-tar gendelta can be run from anywhere; it has
 no knowledge about the source tree that will in the future be used by
 pristine-tar gentar.

 Which is why pristine-tar gentar has a documented requirement that the
 source tree it is run in have identical content to the original tarball
 contents. Including having all the files that are in it.

Hello Joey,

thank you very much for your email. I understand how the documented
requirement you mention above makes sense. On the other hand there are
situations where a check the local tree *option* for gendelta would be
handy:

- for one I'd rather have it fail early as opposed to obscuring a
  gentar failure at some later point in time.

- there is also usage of pristine-tar by other tools like
  bzr-builddeb [1]. For a variety of reasons these cannot or do not
  necessarily observe the requirement above.
  Being able to run gendelta in a fail early mode would allow such
  tools to provide a better user experience on their part.

  It thus generates a broken delta when something is missing from the
  unpacked source tree (since not all the files required for generating
  a pristine tar *later* are actually available).

 If you want this level of assurance, I think you should be using
 pristine-tar commit/checkout, rather than manually using
 gendelta/gentar.

 The difference is that pristine-tar commit *does* know exactly what
 tree will later be used be pristine-tar checkout. And so it can enable
 a hack that creates a dummy entry for missing files in that tree, which
 handles this case fine.

I understand the commit/checkout commands only support the git VCS
whereas the gendelta option proposed facilitates integration with
any kind of tool.

Anyway, the revised patch enclosed in this email

- addresses the shortcomings you listed below and
- provides the check the local tree functionality as a gendelta
  *option* i.e. something the user has to turn on explicitly by
  using the '-l' command line argument
- puts the additional functionality in a separate Perl module
  and changes the pristine-tar code only minimally.

 Besides making pristine-tar gendelta fail if run outside a source tree,

Fixed.

 your patch has some other problems. It assumes that the manifest
 generated by tar always puts files in a subdirectory. A tarball that
 does not put all files in a subdirectory probably breaks it.

Fixed.

 Also, the
 tar-generated manifest sometimes contains filenames encoded in ways that
 pristine-tar does not understand, but tar does, and your patch would
 certianly fail then.

Could you please provide more detail on this. This is something I'd like
to address as well.

[1] https://launchpad.net/bzr-builddeb

Best regards

-- 
Muharem Hrnjadovic muha...@ubuntu.com
Public key id   : B2BBFCFC
Key fingerprint : A5A3 CC67 2B87 D641 103F  5602 219F 6B60 B2BB FCFC
=== added file 'Ptutils.pm'
--- Ptutils.pm	1970-01-01 00:00:00 +
+++ Ptutils.pm	2009-09-09 16:55:16 +
@@ -0,0 +1,299 @@
+#!/usr/bin/perl
+=head1 NAME
+
+ptutils - pristine-tar utilities
+
+=head1 DESCRIPTION
+
+Various utility functions used by the pristine-tar tool.
+
+=head1 FUNCTIONS
+
+=over 4
+
+=item checkmanifest
+
+Find all paths that are listed in the gendelta manifest but do *not* exist
+in the local tree.
+
+When the optional localtree argument is passed to gendelta it will check
+whether the files required are all present in the local tree and abort if this
+is not the case.
+
+=back
+
+=head1 LIMITATIONS
+
+The tar-generated manifest sometimes contains filenames encoded in ways that
+pristine-tar does not understand, but tar does.
+
+=head1 AUTHOR
+
+Muharem Hrnjadovic muha...@ubuntu.com
+
+Licensed under the GPL, version 2 or above.
+
+=cut
+
+package Ptutils;
+
+use Exporter;
+...@isa = (Exporter);
+...@export = qw(checkmanifest);
+
+use warnings;
+use strict;
+use Cwd qw(realpath);
+use File::Find;
+use File::Spec::Functions qw(abs2rel canonpath catdir splitdir splitpath);
+use List::MoreUtils qw(zip);
+
+my $debug=0;
+
+sub debug {
+	message(@_) if $debug;
+}
+
+sub message {
+	print STDERR pristine-tar utils: @_\n;
+}
+
+sub checkmanifest {
+# Find all paths that are listed in the manifest but do *not* exist in
+# the local tree.
+my $manifest=shift;
+my $localtree=shift;
+
+my @manifest_entries = ();
+my @paths_in_tree = ();
+my @missing_in_tree;
+
+find sub { push(@paths_in_tree, canonpath($File::Find::name)) }, abs2rel(realpath($localtree));
+
+open(IN, , $manifest) || die $!;
+while (IN) {
+chomp $_;
+# Strip off trailing slashes.
+s,/*$,,;
+push(@manifest_entries, $_);
+}
+close IN;
+
+my @path_prefixes = normalizepaths(\...@paths_in_tree,