It's cheaper to check for duplicates than to run `spamc'
repeatedly when rechecking.  We already do this for
v1 by using the "ls" command with fast-import,
but v2 requires checking against over.sqlite3.
---
 lib/PublicInbox/Import.pm       |  2 +-
 lib/PublicInbox/V2Writable.pm   |  2 +-
 lib/PublicInbox/WatchMaildir.pm | 21 ++++++++++++++++++++-
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index ae508cd8013..fb813159ef7 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -387,7 +387,7 @@ sub add {
 
        # spam check:
        if ($check_cb) {
-               $mime = $check_cb->($mime) or return;
+               $mime = $check_cb->($mime, $self->{-inbox}) or return;
        }
 
        my $blob = $self->{mark}++;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 8b31b69a62f..528f5e9a565 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -171,7 +171,7 @@ sub _add {
 
        # spam check:
        if ($check_cb) {
-               $mime = $check_cb->($mime) or return;
+               $mime = $check_cb->($mime, $self->{-inbox}) or return;
        }
 
        # All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set,
diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index efc9849a6ef..ec28a3034ff 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -12,6 +12,8 @@ use PublicInbox::Filter::Base qw(REJECT);
 use PublicInbox::Spamcheck;
 use PublicInbox::Sigfd;
 use PublicInbox::DS qw(now);
+use PublicInbox::MID qw(mids);
+use PublicInbox::ContentHash qw(content_hash);
 use POSIX qw(_exit);
 *mime_from_path = \&PublicInbox::InboxWritable::mime_from_path;
 
@@ -988,10 +990,27 @@ sub _importer_for {
        $importers->{"$ibx"} = $im;
 }
 
+# XXX consider sharing with V2Writable, this only requires read-only access
+sub content_exists ($$) {
+       my ($ibx, $eml) = @_;
+       my $over = $ibx->over or return;
+       my $mids = mids($eml);
+       my $chash = content_hash($eml);
+       my ($id, $prev);
+       for my $mid (@$mids) {
+               while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
+                       my $cmp = $ibx->smsg_eml($smsg) or return;
+                       return 1 if $chash eq content_hash($cmp);
+               }
+       }
+       undef;
+}
+
 sub _spamcheck_cb {
        my ($sc) = @_;
        sub {
-               my ($mime) = @_;
+               my ($mime, $ibx) = @_;
+               return if content_exists($ibx, $mime);
                my $tmp = '';
                if ($sc->spamcheck($mime, \$tmp)) {
                        return PublicInbox::Eml->new(\$tmp);
--
unsubscribe: one-click, see List-Unsubscribe header
archive: https://public-inbox.org/meta/

Reply via email to