Since we only query the SQLite over DB for OVER/XOVER, we do not
need to waste space storing the To/Cc/:bytes/:lines fields or the
XNUM term.  We only use From/Subject/References/Message-ID/:blob
in various places of the PSGI code.

For reindexing, we will take advantage of docid stability
in "xapian-compact --no-renumber" to ensure duplicates do not
show up in search results.  Since the PSGI interface is the
only consumer of Xapian at the moment, it has no need to
search based on NNTP article number.
---
 lib/PublicInbox/NNTP.pm       |  2 +-
 lib/PublicInbox/OverIdx.pm    |  6 +++---
 lib/PublicInbox/SearchIdx.pm  | 37 ++++---------------------------------
 lib/PublicInbox/SearchMsg.pm  |  6 ++----
 lib/PublicInbox/V2Writable.pm |  2 +-
 script/public-inbox-compact   |  6 +++---
 t/search.t                    | 24 +++++++++++++-----------
 t/v2writable.t                |  7 ++++---
 8 files changed, 31 insertions(+), 59 deletions(-)

diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index fa890cb..ace56e7 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -725,7 +725,7 @@ sub hdr_searchmsg ($$$$) {
                        my $nr = scalar @$msgs or return;
                        my $tmp = '';
                        foreach my $s (@$msgs) {
-                               $tmp .= $s->num . ' ' . $s->$field . "\r\n";
+                               $tmp .= $s->{num} . ' ' . $s->$field . "\r\n";
                        }
                        utf8::encode($tmp);
                        do_more($self, $tmp);
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 08f8744..62fec0d 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -207,8 +207,8 @@ sub link_refs {
        $tid;
 }
 
-sub parse_references ($$$$) {
-       my ($self, $smsg, $mid0, $mids) = @_;
+sub parse_references ($$$) {
+       my ($smsg, $mid0, $mids) = @_;
        my $mime = $smsg->{mime};
        my $hdr = $mime->header_obj;
        my $refs = references($hdr);
@@ -241,7 +241,7 @@ sub add_overview {
                blob => $oid,
        }, 'PublicInbox::SearchMsg';
        my $mids = mids($mime->header_obj);
-       my $refs = $self->parse_references($smsg, $mid0, $mids);
+       my $refs = parse_references($smsg, $mid0, $mids);
        my $subj = $smsg->subject;
        my $xpath;
        if ($subj ne '') {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 7cfa745..f9b40b0 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -273,18 +273,12 @@ sub add_message {
                my $smsg = PublicInbox::SearchMsg->new($mime);
                my $doc = $smsg->{doc};
                my $subj = $smsg->subject;
-
-               $smsg->{lines} = $mime->body_raw =~ tr!\n!\n!;
-               defined $bytes or $bytes = length($mime->as_string);
-               $smsg->{bytes} = $bytes;
-
                add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
                my @ds = gmtime($smsg->ds);
                my $yyyymmdd = strftime('%Y%m%d', @ds);
                add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
                my $dt = strftime('%Y%m%d%H%M%S', @ds);
                add_val($doc, PublicInbox::Search::DT(), $dt);
-               my @vals = ($smsg->{ts}, $smsg->{ds});
 
                my $tg = $self->term_generator;
 
@@ -333,11 +327,11 @@ sub add_message {
                        index_body($tg, \@orig, $doc) if @orig;
                });
 
-               # populates smsg->references for smsg->to_doc_data
-               my $data = $smsg->to_doc_data($oid, $mid0);
                foreach my $mid (@$mids) {
                        $tg->index_text($mid, 1, 'XM');
                }
+               $smsg->{to} = $smsg->{cc} = '';
+               my $data = $smsg->to_doc_data($oid, $mid0);
                $doc->set_data($data);
                if (my $altid = $self->{-altid}) {
                        foreach my $alt (@$altid) {
@@ -350,24 +344,11 @@ sub add_message {
                        }
                }
 
-               $self->delete_article($num) if defined $num; # for reindexing
-
                if (my $over = $self->{over}) {
-                       utf8::encode($data);
-                       $data = compress($data);
-                       my $refs = $over->parse_references($smsg, $mid0, $mids);
-                       my $xpath;
-                       if ($subj ne '') {
-                               $xpath = $self->subject_path($subj);
-                               $xpath = id_compress($xpath);
-                       }
-
-                       push @vals, $num, $mids, $refs, $xpath, $data;
-                       $over->add_over(\@vals);
+                       $over->add_overview($mime, $bytes, $num, $oid, $mid0);
                }
                $doc->add_boolean_term('Q' . $_) foreach @$mids;
-               $doc->add_boolean_term('XNUM' . $num) if defined $num;
-               $doc_id = $self->{xdb}->add_document($doc);
+               $self->{xdb}->replace_document($doc_id = $num, $doc);
        };
 
        if ($@) {
@@ -419,16 +400,6 @@ sub remove_message {
        }
 }
 
-sub delete_article {
-       my ($self, $num) = @_;
-       my $ndel = 0;
-       batch_do($self, 'XNUM' . $num, sub {
-               my ($ids) = @_;
-               $ndel += scalar @$ids;
-               $self->{xdb}->delete_document($_) for @$ids;
-       });
-}
-
 # MID is a hint in V2
 sub remove_by_oid {
        my ($self, $oid, $mid) = @_;
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 3278802..ab971e0 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -45,12 +45,11 @@ sub to_doc_data {
                $self->cc,
                $oid,
                $mid0,
-               $self->{bytes},
-               $self->{lines}
+               $self->{bytes} || '',
+               $self->{lines} || ''
        );
 }
 
-
 sub load_from_data ($$) {
        my ($self) = $_[0]; # data = $_[1]
        (
@@ -92,7 +91,6 @@ sub load_doc {
 # :bytes and :lines metadata in RFC 3977
 sub bytes ($) { $_[0]->{bytes} }
 sub lines ($) { $_[0]->{lines} }
-sub num ($) { $_[0]->{num} ||= _get_term_val($_[0], 'XNUM', qr/\AXNUM/) }
 
 sub __hdr ($$) {
        my ($self, $field) = @_;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 53fdb73..1cc4b00 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -800,7 +800,7 @@ sub unindex_oid {
                my %gone;
                my ($id, $prev);
                while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
-                       $gone{$smsg->num} = 1 if $oid eq $smsg->{blob};
+                       $gone{$smsg->{num}} = 1 if $oid eq $smsg->{blob};
                        1; # continue
                }
                my $n = scalar keys %gone;
diff --git a/script/public-inbox-compact b/script/public-inbox-compact
index d855b9e..9f33265 100755
--- a/script/public-inbox-compact
+++ b/script/public-inbox-compact
@@ -48,7 +48,7 @@ sub commit_changes ($$$) {
        $im->lock_release;
        remove_tree("$old/old") or die "failed to remove $old/old: $!\n";
 }
-
+my @compact = qw(xapian-compact --no-renumber);
 if ($v == 2) {
        require PublicInbox::V2Writable;
        my $v2w = PublicInbox::V2Writable->new($ibx);
@@ -70,7 +70,7 @@ if ($v == 2) {
                }
                close $dh;
                die "No Xapian parts found in $old\n" unless @parts;
-               my $cmd = ['xapian-compact', @parts, "$new/0" ];
+               my $cmd = [@compact, @parts, "$new/0" ];
                PublicInbox::Import::run_die($cmd);
                commit_changes($v2w, $old, $new);
        });
@@ -84,7 +84,7 @@ if ($v == 2) {
        my $new = tempdir('compact-XXXXXXXX', CLEANUP => 1, DIR => $v1_root);
        $ibx->with_umask(sub {
                $im->lock_acquire;
-               PublicInbox::Import::run_die(['xapian-compact', $old, $new]);
+               PublicInbox::Import::run_die([@compact, $old, $new]);
                commit_changes($im, $old, $new);
        });
 } else {
diff --git a/t/search.t b/t/search.t
index fda32d3..516f567 100644
--- a/t/search.t
+++ b/t/search.t
@@ -306,31 +306,33 @@ sub filter_mids {
 
 # names and addresses
 {
-       my $res = $ro->query('t:list@example.com');
-       is(scalar @$res, 6, 'searched To: successfully');
-       foreach my $smsg (@$res) {
+       my $mset = $ro->query('t:list@example.com', {mset => 1});
+       is($mset->size, 6, 'searched To: successfully');
+       foreach my $m ($mset->items) {
+               my $smsg = $ro->lookup_article($m->get_docid);
                like($smsg->to, qr/\blist\@example\.com\b/, 'to appears');
        }
 
-       $res = $ro->query('tc:list@example.com');
-       is(scalar @$res, 6, 'searched To+Cc: successfully');
-       foreach my $smsg (@$res) {
+       $mset = $ro->query('tc:list@example.com', {mset => 1});
+       is($mset->size, 6, 'searched To+Cc: successfully');
+       foreach my $m ($mset->items) {
+               my $smsg = $ro->lookup_article($m->get_docid);
                my $tocc = join("\n", $smsg->to, $smsg->cc);
                like($tocc, qr/\blist\@example\.com\b/, 'tocc appears');
        }
 
        foreach my $pfx ('tcf:', 'c:') {
-               $res = $ro->query($pfx . 'foo@example.com');
-               is(scalar @$res, 1,
-                       "searched $pfx successfully for Cc:");
-               foreach my $smsg (@$res) {
+               my $mset = $ro->query($pfx . 'foo@example.com', { mset => 1 });
+               is($mset->items, 1, "searched $pfx successfully for Cc:");
+               foreach my $m ($mset->items) {
+                       my $smsg = $ro->lookup_article($m->get_docid);
                        like($smsg->cc, qr/\bfoo\@example\.com\b/,
                                'cc appears');
                }
        }
 
        foreach my $pfx ('', 'tcf:', 'f:') {
-               $res = $ro->query($pfx . 'Laggy');
+               my $res = $ro->query($pfx . 'Laggy');
                is(scalar(@$res), 1,
                        "searched $pfx successfully for From:");
                foreach my $smsg (@$res) {
diff --git a/t/v2writable.t b/t/v2writable.t
index b543c53..85fb6a6 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -220,13 +220,14 @@ EOF
                'commit message propagated to git');
        is_deeply(\@after, \@before, 'only one commit written to git');
        is($ibx->mm->num_for($smsg->mid), undef, 'no longer in Msgmap by mid');
-       like($smsg->num, qr/\A\d+\z/, 'numeric number in return message');
-       is($ibx->mm->mid_for($smsg->num), undef, 'no longer in Msgmap by num');
+       my $num = $smsg->{num};
+       like($num, qr/\A\d+\z/, 'numeric number in return message');
+       is($ibx->mm->mid_for($num), undef, 'no longer in Msgmap by num');
        my $srch = $ibx->search->reopen;
        my $mset = $srch->query('m:'.$smsg->mid, { mset => 1});
        is($mset->size, 0, 'no longer found in Xapian');
        my @log1 = qw(log -1 --pretty=raw --raw -r --no-abbrev --no-renames);
-       is($srch->{over_ro}->get_art($smsg->num), undef,
+       is($srch->{over_ro}->get_art($num), undef,
                'removal propagated to Over DB');
 
        my $after = $git0->qx(@log1);
-- 
EW

--
unsubscribe: meta+unsubscr...@public-inbox.org
archive: https://public-inbox.org/meta/

Reply via email to