Some emails in LKML archives are identical with the only
difference being s/References:/In-Reply-To:/ in the headers.
Since this difference doesn't affect how we handle message
threading, we will treat them the same way for the purposes
of deduplication.

There may be more changes to how we do content_id along these
lines (e.g. using msg_iter to walk the message).
---
 lib/PublicInbox/ContentId.pm | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm
index 65d5a76..7ec638c 100644
--- a/lib/PublicInbox/ContentId.pm
+++ b/lib/PublicInbox/ContentId.pm
@@ -11,7 +11,7 @@ our @EXPORT_OK = qw/content_id/;
 use Digest::SHA;
 
 # Content-* headers are often no-ops, so maybe we don't need them
-my @ID_HEADERS = qw(Subject From Date Message-ID References To Cc In-Reply-To);
+my @ID_HEADERS = qw(Subject From Date To Cc);
 
 sub content_id ($;$) {
        my ($mime, $alg) = @_;
@@ -19,6 +19,20 @@ sub content_id ($;$) {
        my $dig = Digest::SHA->new($alg);
        my $hdr = $mime->header_obj;
 
+       # References: and In-Reply-To: get used interchangeably
+       # in some "duplicates" in LKML.  We treat them the same
+       # in SearchIdx, so treat them the same for this:
+       my @mid = $hdr->header_raw('Message-ID');
+       @mid = (join(' ', @mid) =~ /<([^>]+)>/g);
+       my $refs = join(' ', $hdr->header_raw('References'),
+                       $hdr->header_raw('In-Reply-To'));
+       my @refs = ($refs =~ /<([^>]+)>/g);
+       my %seen;
+       foreach my $mid (@mid, @refs) {
+               next if $seen{$mid};
+               $dig->add($mid);
+               $seen{$mid} = 1;
+       }
        foreach my $h (@ID_HEADERS) {
                my @v = $hdr->header_raw($h);
                $dig->add($_) foreach @v;
-- 
EW

--
unsubscribe: meta+unsubscr...@public-inbox.org
archive: https://public-inbox.org/meta/

Reply via email to