It's cheaper to check for duplicates than to run `spamc'
repeatedly when rechecking. We already do this for
v1 by using the "ls" command with fast-import,
but v2 requires checking against over.sqlite3.
---
lib/PublicInbox/Import.pm | 2 +-
lib/PublicInbox/V2Writable.pm | 2 +-
lib/PublicInbox/WatchMaildir.pm | 21 ++++++++++++++++++++-
3 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index ae508cd8013..fb813159ef7 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -387,7 +387,7 @@ sub add {
# spam check:
if ($check_cb) {
- $mime = $check_cb->($mime) or return;
+ $mime = $check_cb->($mime, $self->{-inbox}) or return;
}
my $blob = $self->{mark}++;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 8b31b69a62f..528f5e9a565 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -171,7 +171,7 @@ sub _add {
# spam check:
if ($check_cb) {
- $mime = $check_cb->($mime) or return;
+ $mime = $check_cb->($mime, $self->{-inbox}) or return;
}
# All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set,
diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index efc9849a6ef..ec28a3034ff 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -12,6 +12,8 @@ use PublicInbox::Filter::Base qw(REJECT);
use PublicInbox::Spamcheck;
use PublicInbox::Sigfd;
use PublicInbox::DS qw(now);
+use PublicInbox::MID qw(mids);
+use PublicInbox::ContentHash qw(content_hash);
use POSIX qw(_exit);
*mime_from_path = \&PublicInbox::InboxWritable::mime_from_path;
@@ -988,10 +990,27 @@ sub _importer_for {
$importers->{"$ibx"} = $im;
}
+# XXX consider sharing with V2Writable, this only requires read-only access
+sub content_exists ($$) { # ($ibx, $eml): 1 if $ibx already has $eml's content
+	my ($ibx, $eml) = @_;
+	my $over = $ibx->over or return; # no ->over: nothing to check against
+	my $mids = mids($eml);
+	my $chash = content_hash($eml);
+	for my $mid (@$mids) {
+		my ($id, $prev); # handed to next_by_mid by ref; start clean per $mid
+		while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
+			my $cmp = $ibx->smsg_eml($smsg) or return; # no eml: report no match
+			return 1 if $chash eq content_hash($cmp);
+		}
+	}
+	undef; # nothing found under any $mid had the same content hash
+}
+
sub _spamcheck_cb {
my ($sc) = @_;
sub {
- my ($mime) = @_;
+ my ($mime, $ibx) = @_;
+ return if content_exists($ibx, $mime);
my $tmp = '';
if ($sc->spamcheck($mime, \$tmp)) {
return PublicInbox::Eml->new(\$tmp);
--
unsubscribe: one-click, see List-Unsubscribe header
archive: https://public-inbox.org/meta/