[PATCH 1/2] Add duplicate message removal for notmuch-mutt.

2012-08-02 Thread David Bremner
Stefano Zacchiroli  writes:

> From: Kevin McCarthy 
>
> Add a --remove-dups flag which removes duplicate files from search and
> thread results.  Uses fdupes if installed.  Otherwise it runs a size and
> Digest::SHA scan on each file to detect duplicates.

Pushed,

d


Re: [PATCH 1/2] Add duplicate message removal for notmuch-mutt.

2012-08-02 Thread David Bremner
Stefano Zacchiroli z...@upsilon.cc writes:

 From: Kevin McCarthy ke...@8t8.us

 Add a --remove-dups flag which removes duplicate files from search and
 thread results.  Uses fdupes if installed.  Otherwise it runs a size and
 Digest::SHA scan on each file to detect duplicates.

Pushed,

d
___
notmuch mailing list
notmuch@notmuchmail.org
http://notmuchmail.org/mailman/listinfo/notmuch


[PATCH 1/2] Add duplicate message removal for notmuch-mutt.

2012-08-01 Thread Stefano Zacchiroli
From: Kevin McCarthy 

Add a --remove-dups flag which removes duplicate files from search and
thread results.  Uses fdupes if installed.  Otherwise it runs a size and
Digest::SHA scan on each file to detect duplicates.

Signed-off-by: Stefano Zacchiroli 
---
 contrib/notmuch-mutt/notmuch-mutt|   89 --
 contrib/notmuch-mutt/notmuch-mutt.rc |4 +-
 2 files changed, 76 insertions(+), 17 deletions(-)

diff --git a/contrib/notmuch-mutt/notmuch-mutt b/contrib/notmuch-mutt/notmuch-mutt
index 7c125e6..d14709d 100755
--- a/contrib/notmuch-mutt/notmuch-mutt
+++ b/contrib/notmuch-mutt/notmuch-mutt
@@ -18,6 +18,8 @@ use Mail::Box::Maildir;
 use Pod::Usage;
 use String::ShellQuote;
 use Term::ReadLine;
+use Digest::SHA;
+use File::Which;


 my $xdg_cache_dir = "$ENV{HOME}/.cache";
@@ -34,16 +36,65 @@ sub empty_maildir($) {
 $folder->close();
 }

-# search($maildir, $query)
+# Match files by size and SHA-256; then delete duplicates
+sub builtin_remove_dups($) {
+my ($maildir) = @_;
+my (%size_to_files, %sha_to_files);
+
+# Group files by matching sizes
+foreach my $file (glob("$maildir/cur/*")) {
+my $size = -s $file;
+push(@{$size_to_files{$size}}, $file) if $size;
+}
+
+foreach my $same_size_files (values %size_to_files) {
+# Don't run sha unless there is another file of the same size
+next if scalar(@$same_size_files) < 2;
+%sha_to_files = ();
+
+# Group files with matching sizes by SHA-256
+foreach my $file (@$same_size_files) {
+open(my $fh, '<', $file) or next;
+binmode($fh);
+my $sha256hash = Digest::SHA->new(256)->addfile($fh)->hexdigest;
+close($fh);
+
+push(@{$sha_to_files{$sha256hash}}, $file);
+}
+
+# Remove duplicates
+foreach my $same_sha_files (values %sha_to_files) {
+next if scalar(@$same_sha_files) < 2;
+unlink(@{$same_sha_files}[1..$#$same_sha_files]);
+}
+}
+}
+
+# Use either fdupes or the built-in scanner to detect and remove duplicate
+# search results in the maildir
+sub remove_duplicates($) {
+my ($maildir) = @_;
+
+my $fdupes = which("fdupes");
+if ($fdupes) {
+  system("$fdupes --hardlinks --symlinks --delete --noprompt"
+ . " --quiet $maildir/cur/ > /dev/null");
+} else {
+builtin_remove_dups($maildir);
+}
+}
+
+# search($maildir, $remove_dups, $query)
 # search mails according to $query with notmuch; store results in $maildir
-sub search($$) {
-my ($maildir, $query) = @_;
+sub search($$$) {
+my ($maildir, $remove_dups, $query) = @_;
 $query = shell_quote($query);

 empty_maildir($maildir);
 system("notmuch search --output=files $query"
   . " | sed -e 's: :\\\\ :g'"
   . " | xargs --no-run-if-empty ln -s -t $maildir/cur/");
+remove_duplicates($maildir) if ($remove_dups);
 }

 sub prompt($$) {
@@ -74,28 +125,28 @@ sub get_message_id() {
 return $1;
 }

-sub search_action($$@) {
-my ($interactive, $results_dir, @params) = @_;
+sub search_action($$$@) {
+my ($interactive, $results_dir, $remove_dups, @params) = @_;

 if (! $interactive) {
-   search($results_dir, join(' ', @params));
+   search($results_dir, $remove_dups, join(' ', @params));
 } else {
my $query = prompt("search ('?' for man): ", join(' ', @params));
if ($query ne "") {
-   search($results_dir,$query);
+   search($results_dir, $remove_dups, $query);
}
 }
 }

-sub thread_action(@) {
-my ($results_dir, @params) = @_;
+sub thread_action($$@) {
+my ($results_dir, $remove_dups, @params) = @_;

 my $mid = get_message_id();
 my $search_cmd = 'notmuch search --output=threads ' . shell_quote("id:$mid");
 my $tid = `$search_cmd`;   # get thread id
 chomp($tid);

-search($results_dir, $tid);
+search($results_dir, $remove_dups, $tid);
 }

 sub tag_action(@) {
@@ -118,11 +169,13 @@ sub main() {
 my $results_dir = "$cache_dir/results";
 my $interactive = 0;
 my $help_needed = 0;
+my $remove_dups = 0;

 my $getopt = GetOptions(
"h|help" => \$help_needed,
"o|output-dir=s" => \$results_dir,
-   "p|prompt" => \$interactive);
+   "p|prompt" => \$interactive,
+   "r|remove-dups" => \$remove_dups);
 if (! $getopt || $#ARGV < 0) { die_usage() };
 my ($action, @params) = ($ARGV[0], @ARGV[1..$#ARGV]);

@@ -136,9 +189,9 @@ sub main() {
print STDERR "Error: no search term provided\n\n";
die_usage();
 } elsif ($action eq "search") {
-   search_action($interactive, $results_dir, @params);
+   search_action($interactive, $results_dir, $remove_dups, @params);
 } elsif ($action eq "thread") {
-   thread_action($results_dir, @params);
+   thread_action($results_dir, $remove_dups, @params);
 } elsif ($action eq "tag") {
 

[PATCH 1/2] Add duplicate message removal for notmuch-mutt.

2012-08-01 Thread Stefano Zacchiroli
From: Kevin McCarthy ke...@8t8.us

Add a --remove-dups flag which removes duplicate files from search and
thread results.  Uses fdupes if installed.  Otherwise it runs a size and
Digest::SHA scan on each file to detect duplicates.

Signed-off-by: Stefano Zacchiroli z...@upsilon.cc
---
 contrib/notmuch-mutt/notmuch-mutt|   89 --
 contrib/notmuch-mutt/notmuch-mutt.rc |4 +-
 2 files changed, 76 insertions(+), 17 deletions(-)

diff --git a/contrib/notmuch-mutt/notmuch-mutt b/contrib/notmuch-mutt/notmuch-mutt
index 7c125e6..d14709d 100755
--- a/contrib/notmuch-mutt/notmuch-mutt
+++ b/contrib/notmuch-mutt/notmuch-mutt
@@ -18,6 +18,8 @@ use Mail::Box::Maildir;
 use Pod::Usage;
 use String::ShellQuote;
 use Term::ReadLine;
+use Digest::SHA;
+use File::Which;
 
 
 my $xdg_cache_dir = "$ENV{HOME}/.cache";
@@ -34,16 +36,65 @@ sub empty_maildir($) {
 $folder->close();
 }
 
-# search($maildir, $query)
+# Match files by size and SHA-256; then delete duplicates
+sub builtin_remove_dups($) {
+my ($maildir) = @_;
+my (%size_to_files, %sha_to_files);
+
+# Group files by matching sizes
+foreach my $file (glob("$maildir/cur/*")) {
+my $size = -s $file;
+push(@{$size_to_files{$size}}, $file) if $size;
+}
+
+foreach my $same_size_files (values %size_to_files) {
+# Don't run sha unless there is another file of the same size
+next if scalar(@$same_size_files) < 2;
+%sha_to_files = ();
+
+# Group files with matching sizes by SHA-256
+foreach my $file (@$same_size_files) {
+open(my $fh, '<', $file) or next;
+binmode($fh);
+my $sha256hash = Digest::SHA->new(256)->addfile($fh)->hexdigest;
+close($fh);
+
+push(@{$sha_to_files{$sha256hash}}, $file);
+}
+
+# Remove duplicates
+foreach my $same_sha_files (values %sha_to_files) {
+next if scalar(@$same_sha_files) < 2;
+unlink(@{$same_sha_files}[1..$#$same_sha_files]);
+}
+}
+}
+
+# Use either fdupes or the built-in scanner to detect and remove duplicate
+# search results in the maildir
+sub remove_duplicates($) {
+my ($maildir) = @_;
+
+my $fdupes = which("fdupes");
+if ($fdupes) {
+  system("$fdupes --hardlinks --symlinks --delete --noprompt"
+ . " --quiet $maildir/cur/ > /dev/null");
+} else {
+builtin_remove_dups($maildir);
+}
+}
+
+# search($maildir, $remove_dups, $query)
 # search mails according to $query with notmuch; store results in $maildir
-sub search($$) {
-my ($maildir, $query) = @_;
+sub search($$$) {
+my ($maildir, $remove_dups, $query) = @_;
 $query = shell_quote($query);
 
 empty_maildir($maildir);
 system("notmuch search --output=files $query"
   . " | sed -e 's: :\\\\ :g'"
   . " | xargs --no-run-if-empty ln -s -t $maildir/cur/");
+remove_duplicates($maildir) if ($remove_dups);
 }
 
 sub prompt($$) {
@@ -74,28 +125,28 @@ sub get_message_id() {
 return $1;
 }
 
-sub search_action($$@) {
-my ($interactive, $results_dir, @params) = @_;
+sub search_action($$$@) {
+my ($interactive, $results_dir, $remove_dups, @params) = @_;
 
 if (! $interactive) {
-   search($results_dir, join(' ', @params));
+   search($results_dir, $remove_dups, join(' ', @params));
 } else {
my $query = prompt("search ('?' for man): ", join(' ', @params));
if ($query ne "") {
-   search($results_dir,$query);
+   search($results_dir, $remove_dups, $query);
}
 }
 }
 
-sub thread_action(@) {
-my ($results_dir, @params) = @_;
+sub thread_action($$@) {
+my ($results_dir, $remove_dups, @params) = @_;
 
 my $mid = get_message_id();
 my $search_cmd = 'notmuch search --output=threads ' . shell_quote("id:$mid");
 my $tid = `$search_cmd`;   # get thread id
 chomp($tid);
 
-search($results_dir, $tid);
+search($results_dir, $remove_dups, $tid);
 }
 
 sub tag_action(@) {
@@ -118,11 +169,13 @@ sub main() {
 my $results_dir = "$cache_dir/results";
 my $interactive = 0;
 my $help_needed = 0;
+my $remove_dups = 0;
 
 my $getopt = GetOptions(
"h|help" => \$help_needed,
"o|output-dir=s" => \$results_dir,
-   "p|prompt" => \$interactive);
+   "p|prompt" => \$interactive,
+   "r|remove-dups" => \$remove_dups);
 if (! $getopt || $#ARGV < 0) { die_usage() };
 my ($action, @params) = ($ARGV[0], @ARGV[1..$#ARGV]);
 
@@ -136,9 +189,9 @@ sub main() {
print STDERR "Error: no search term provided\n\n";
die_usage();
 } elsif ($action eq "search") {
-   search_action($interactive, $results_dir, @params);
+   search_action($interactive, $results_dir, $remove_dups, @params);
 } elsif ($action eq "thread") {
-   thread_action($results_dir, @params);
+   thread_action($results_dir, $remove_dups, @params);
 } elsif ($action eq "tag") {
tag_action(@params);