[PATCH 4/4] ensure Xapian and SQLite are still optional for v1 tests

2018-04-06  Eric Wong (Contractor, The Linux Foundation)
Xapian is size-intensive and SQLite is not strictly necessary for v1.
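
Not part of the patch, but for context: the test hunks below rely on probing
optional modules at run time and then loading V2Writable via use_ok, so a
missing DBD::SQLite or Search::Xapian leads to a skip rather than a
compile-time failure.  A minimal sketch of that pattern (the module list is
an assumption; the real @mods is outside the quoted hunks):

  use strict;
  use warnings;
  use Test::More;

  # probe optional dependencies at run time; skip instead of dying
  foreach my $mod (qw(DBD::SQLite Search::Xapian)) {
          eval "require $mod";
          plan skip_all => "$mod missing" if $@;
  }

  # run-time load; a compile-time `use PublicInbox::V2Writable' would
  # presumably pull in the optional deps before the skip check could run
  use_ok 'PublicInbox::V2Writable';
  done_testing;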
---
 script/public-inbox-compact   | 2 +-
 scripts/import_vger_from_mbox | 2 +-
 t/convert-compact.t   | 2 +-
 t/v2mirror.t  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/script/public-inbox-compact b/script/public-inbox-compact
index 43e9460..d855b9e 100755
--- a/script/public-inbox-compact
+++ b/script/public-inbox-compact
@@ -4,9 +4,9 @@
 use strict;
 use warnings;
 use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
-use PublicInbox::V2Writable;
 use PublicInbox::Search;
 use PublicInbox::Config;
+use PublicInbox::InboxWritable;
 use Cwd 'abs_path';
 use File::Temp qw(tempdir);
 use File::Path qw(remove_tree);
diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox
index 191f75d..ca5a408 100644
--- a/scripts/import_vger_from_mbox
+++ b/scripts/import_vger_from_mbox
@@ -6,7 +6,6 @@ use warnings;
 use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/;
 use PublicInbox::MIME;
 use PublicInbox::InboxWritable;
-use PublicInbox::V2Writable;
 use PublicInbox::Import;
 use PublicInbox::MDA;
 my $usage = "usage: $0 NAME EMAIL DIR new($ibx);
 unless ($dry_run) {
if ($version >= 2) {
+   require PublicInbox::V2Writable;
PublicInbox::V2Writable->new($ibx, 1)->init_inbox(0);
} else {
system(qw(git init --bare -q), $mainrepo) == 0 or die;
diff --git a/t/convert-compact.t b/t/convert-compact.t
index e2ba40a..5caa0ac 100644
--- a/t/convert-compact.t
+++ b/t/convert-compact.t
@@ -10,7 +10,7 @@ foreach my $mod (@mods) {
eval "require $mod";
plan skip_all => "$mod missing for convert-compact.t" if $@;
 }
-use PublicInbox::V2Writable;
+use_ok 'PublicInbox::V2Writable';
 use PublicInbox::Import;
 my $tmpdir = tempdir('convert-compact-XX', TMPDIR => 1, CLEANUP => 1);
 my $ibx = {
diff --git a/t/v2mirror.t b/t/v2mirror.t
index 0c66aef..9e0c9e1 100644
--- a/t/v2mirror.t
+++ b/t/v2mirror.t
@@ -13,7 +13,7 @@ foreach my $mod (qw(Plack::Util Plack::Builder Danga::Socket
 use File::Temp qw/tempdir/;
 use IO::Socket;
 use POSIX qw(dup2);
-use PublicInbox::V2Writable;
+use_ok 'PublicInbox::V2Writable';
 use PublicInbox::MIME;
 use PublicInbox::Config;
 use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD);
-- 
EW


[PATCH 2/4] nntp: set Xref across multiple inboxes

2018-04-06  Eric Wong (Contractor, The Linux Foundation)
Noted by Jonathan Corbet in https://lwn.net/Articles/748184/
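
Not part of the patch: a worked illustration, with invented server and group
names, of the Xref value the new xref() builds and how set_nntp_headers()
derives the Newsgroups header from it (the real values come from
$ng->{domain} and each inbox's newsgroup attribute):

  use strict;
  use warnings;

  # one "group:article-number" pair per inbox carrying the Message-ID
  my $xref = 'news.example.org inbox.test.a:123 inbox.test.b:456';

  # Xref: gets the full string; Newsgroups: drops the ":<num>" suffixes
  # and the leading server name, mirroring the new code below
  (my $groups = $xref) =~ s/:\d+//g;
  my $newsgroups = (split(/ /, $groups, 2))[1];
  print "Xref: $xref\n";
  print "Newsgroups: $newsgroups\n";   # "inbox.test.a inbox.test.b"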
---
 lib/PublicInbox/NNTP.pm | 43 ---
 t/nntp.t|  6 --
 2 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index e517935..fa890cb 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -407,12 +407,30 @@ sub header_append ($$$) {
$hdr->header_set($k, @v, $v);
 }
 
-sub set_nntp_headers {
-   my ($hdr, $ng, $n, $mid) = @_;
+sub xref ($$$$) {
+   my ($self, $ng, $n, $mid) = @_;
+   my $ret = "$ng->{domain} $ng->{newsgroup}:$n";
+
+   # num_for is pretty cheap and sometimes we'll lookup the existence
+   # of an article without getting even the OVER info.  In other words,
+   # I'm not sure if its worth optimizing by scanning To:/Cc: and
+   # PublicInbox::ExtMsg on the PSGI end is just as expensive
+   foreach my $other (@{$self->{nntpd}->{grouplist}}) {
+   next if $ng eq $other;
+   my $num = eval { $other->mm->num_for($mid) } or next;
+   $ret .= " $other->{newsgroup}:$num";
+   }
+   $ret;
+}
+
+sub set_nntp_headers ($$$$$) {
+   my ($self, $hdr, $ng, $n, $mid) = @_;
 
# clobber some
-   $hdr->header_set('Newsgroups', $ng->{newsgroup});
-   $hdr->header_set('Xref', xref($ng, $n));
+   my $xref = xref($self, $ng, $n, $mid);
+   $hdr->header_set('Xref', $xref);
+   $xref =~ s/:\d+//g;
+   $hdr->header_set('Newsgroups', (split(/ /, $xref, 2))[1]);
header_append($hdr, 'List-Post', "<mailto:$ng->{-primary_address}>");
if (my $url = $ng->base_url) {
$mid = mid_escape($mid);
@@ -461,7 +479,7 @@ found:
my $msg = $ng->msg_by_smsg($smsg) or return $err;
my $s = Email::Simple->new($msg);
if ($set_headers) {
-   set_nntp_headers($s->header_obj, $ng, $n, $mid);
+   set_nntp_headers($self, $s->header_obj, $ng, $n, $mid);
 
# must be last
$s->body_set('') if ($set_headers == 2);
@@ -635,11 +653,6 @@ sub hdr_message_id ($$$) { # optimize XHDR Message-ID [range] for slrnpull.
}
 }
 
-sub xref ($$) {
-   my ($ng, $n) = @_;
-   "$ng->{domain} $ng->{newsgroup}:$n"
-}
-
 sub mid_lookup ($$) {
my ($self, $mid) = @_;
my $self_ng = $self->{ng};
@@ -659,9 +672,11 @@ sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin
my ($self, $xhdr, $range) = @_;
 
if (defined $range && $range =~ /\A<(.+)>\z/) { # Message-ID
-   my ($ng, $n) = mid_lookup($self, $1);
+   my $mid = $1;
+   my ($ng, $n) = mid_lookup($self, $mid);
return r430 unless $n;
-   hdr_mid_response($self, $xhdr, $ng, $n, $range, xref($ng, $n));
+   hdr_mid_response($self, $xhdr, $ng, $n, $range,
+   xref($self, $ng, $n, $mid));
} else { # numeric range
$range = $self->{article} unless defined $range;
my $r = get_range($self, $range);
@@ -674,10 +689,8 @@ sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin
my $r = $mm->msg_range(\$beg, $end);
@$r or return;
more($self, join("\r\n", map {
-   # TODO: use $_->[1] (mid) to fill
-   # Xref: from other inboxes
my $num = $_->[0];
-   "$num ".xref($ng, $num);
+   "$num ".xref($self, $ng, $num, $_->[1]);
} @$r));
1;
});
diff --git a/t/nntp.t b/t/nntp.t
index 03c7f08..57fef48 100644
--- a/t/nntp.t
+++ b/t/nntp.t
@@ -109,7 +109,9 @@ use_ok 'PublicInbox::Inbox';
is($ng->base_url, $u, 'URL expanded');
my $mid = 'a@b';
my $mime = Email::MIME->new("Message-ID: <$mid>\r\n\r\n");
-   PublicInbox::NNTP::set_nntp_headers($mime->header_obj, $ng, 1, $mid);
+   my $hdr = $mime->header_obj;
+   my $mock_self = { nntpd => { grouplist => [] } };
+   PublicInbox::NNTP::set_nntp_headers($mock_self, $hdr, $ng, 1, $mid);
is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],
'Message-ID unchanged');
is_deeply([ $mime->header('Archived-At') ], [ "<${u}a\@b/>" ],
@@ -124,7 +126,7 @@ use_ok 'PublicInbox::Inbox';
'Xref: set');
 
$ng->{-base_url} = 'http://mirror.example.com/m/';
-   PublicInbox::NNTP::set_nntp_headers($mime->header_obj, $ng, 2, $mid);
+   PublicInbox::NNTP::set_nntp_headers($mock_self, $hdr, $ng, 2, $mid);
is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],
'Message-ID unchanged');
is_deeply([ $mime->header('Archived-At') ],
-- 
EW


[PATCH 8/8] msgmap: speed up minmax with separate queries

2018-04-06  Eric Wong (Contractor, The Linux Foundation)
This significantly improves the performance of the NNTP GROUP
command with 2.7 million messages from over 250ms to 700us.
SQLite is weird about this, but at least there's a way to
optimize it.
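
For context (not in the patch): SQLite's documented MIN/MAX optimization
answers a lone SELECT MIN(num) or SELECT MAX(num) on an indexed or rowid
column with a single b-tree seek, but it only applies when the statement
contains a single such aggregate, so the combined form falls back to a scan.
A small DBI sketch that makes the difference visible with EXPLAIN QUERY PLAN
(the database path is just an example):

  use strict;
  use warnings;
  use DBI;

  # point this at an existing msgmap database
  my $dbh = DBI->connect('dbi:SQLite:dbname=msgmap.sqlite3', '', '',
                         { RaiseError => 1, PrintError => 0 });

  for my $sql ('SELECT MIN(num),MAX(num) FROM msgmap',   # scans
               'SELECT MIN(num) FROM msgmap',            # single seek
               'SELECT MAX(num) FROM msgmap') {          # single seek
          my $plan = $dbh->selectall_arrayref("EXPLAIN QUERY PLAN $sql");
          print "$sql\n";
          print "\t$_->[-1]\n" for @$plan;
  }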
---
 lib/PublicInbox/Msgmap.pm | 10 +++---
 t/perf-nntpd.t| 13 +
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index f5f8843..feef8ba 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -138,10 +138,14 @@ sub num_for {
 sub minmax {
my ($self) = @_;
my $dbh = $self->{dbh};
-   my $sth = $self->{num_minmax} ||=
-   $dbh->prepare('SELECT MIN(num),MAX(num) FROM msgmap');
+   # breaking MIN and MAX into separate queries speeds up from 250ms
+   # to around 700us with 2.7million messages.
+   my $sth = $dbh->prepare_cached('SELECT MIN(num) FROM msgmap', undef, 1);
$sth->execute;
-$sth->fetchrow_array;
+   my $min = $sth->fetchrow_array;
+   $sth = $dbh->prepare_cached('SELECT MAX(num) FROM msgmap', undef, 1);
+   $sth->execute;
+   ($min, $sth->fetchrow_array);
 }
 
 sub mid_prefixes {
diff --git a/t/perf-nntpd.t b/t/perf-nntpd.t
index 4987f98..e502153 100644
--- a/t/perf-nntpd.t
+++ b/t/perf-nntpd.t
@@ -3,7 +3,7 @@
 use strict;
 use warnings;
 use Test::More;
-use Benchmark qw(:all);
+use Benchmark qw(:all :hireswallclock);
 use PublicInbox::Inbox;
 use File::Temp qw/tempdir/;
 use POSIX qw(dup2);
@@ -79,8 +79,13 @@ $s = IO::Socket::INET->new(%opts);
 $s->autoflush(1);
 my $buf = $s->getline;
 is($buf, "201 server ready - post via email\r\n", 'got greeting');
-ok($s->print("GROUP $group\r\n"), 'changed group');
-$buf = $s->getline;
+
+my $t = timeit(10, sub {
+   ok($s->print("GROUP $group\r\n"), 'changed group');
+   $buf = $s->getline;
+});
+diag 'GROUP took: ' . timestr($t);
+
 my ($tot, $min, $max) = ($buf =~ /\A211 (\d+) (\d+) (\d+) /);
 ok($tot && $min && $max, 'got GROUP response');
 my $nr = $max - $min;
@@ -100,7 +105,7 @@ sub read_until_dot ($) {
$n;
 }
 
-my $t = timeit(1, sub {
+$t = timeit(1, sub {
$s->print("XOVER $spec\r\n");
$n = read_until_dot($s);
 });
-- 
EW


[PATCH 3/8] over: remove forked subprocess

2018-04-06  Eric Wong (Contractor, The Linux Foundation)
Since the overview stuff is a synchronization point anyways,
move it into the main V2Writable process and allow us to
drop a bunch of code.  This is another step towards making
Xapian optional for v2.

In other words, the fan-out point is moved and the Xapian
partitions no longer need to synchronize against each other:

Before:
                /-->\
               /---->\
 v2writable -->+parts+--> over
               \---->/
                \-->/

After:

                         /-->
                        /---->
 v2writable --> over -->+parts+--->
                        \---->
                         \-->

Since the overview/threading logic needs to run on the same core
that feeds git-fast-import, it's slower for small repos but is
not noticeable in large imports where I/O wait in the partitions
dominates.
---
 MANIFEST |   1 -
 lib/PublicInbox/OverIdx.pm   |  57 -
 lib/PublicInbox/OverIdxFork.pm   | 180 ---
 lib/PublicInbox/SearchIdx.pm |  62 +-
 lib/PublicInbox/SearchIdxPart.pm |  14 +--
 lib/PublicInbox/V2Writable.pm|  89 +--
 6 files changed, 144 insertions(+), 259 deletions(-)
 delete mode 100644 lib/PublicInbox/OverIdxFork.pm

diff --git a/MANIFEST b/MANIFEST
index 82cc67d..58b3634 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -84,7 +84,6 @@ lib/PublicInbox/NNTPD.pm
 lib/PublicInbox/NewsWWW.pm
 lib/PublicInbox/Over.pm
 lib/PublicInbox/OverIdx.pm
-lib/PublicInbox/OverIdxFork.pm
 lib/PublicInbox/ParentPipe.pm
 lib/PublicInbox/ProcessPipe.pm
 lib/PublicInbox/Qspawn.pm
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 28e4aa9..08f8744 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -2,14 +2,21 @@
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 
 # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI
-# Unlike Msgmap, this is an _UNSTABLE_ database which can be
+# Unlike Msgmap, this is an _UNSTABLE_ cache which can be
 # tweaked/updated over time and rebuilt.
+#
+# Ghost messages (messages which are only referenced in References/In-Reply-To)
+# are denoted by a negative NNTP article number.
 package PublicInbox::OverIdx;
 use strict;
 use warnings;
 use base qw(PublicInbox::Over);
 use IO::Handle;
 use DBI qw(:sql_types); # SQL_BLOB
+use PublicInbox::MID qw/id_compress mids references/;
+use PublicInbox::SearchMsg;
+use Compress::Zlib qw(compress);
+use PublicInbox::Search;
 
 sub dbh_new {
my ($self) = @_;
@@ -200,6 +207,54 @@ sub link_refs {
$tid;
 }
 
+sub parse_references ($$$$) {
+   my ($self, $smsg, $mid0, $mids) = @_;
+   my $mime = $smsg->{mime};
+   my $hdr = $mime->header_obj;
+   my $refs = references($hdr);
+   push(@$refs, @$mids) if scalar(@$mids) > 1;
+   return $refs if scalar(@$refs) == 0;
+
+   # prevent circular references here:
+   my %seen = ( $mid0 => 1 );
+   my @keep;
+   foreach my $ref (@$refs) {
+   if (length($ref) > PublicInbox::MID::MAX_MID_SIZE) {
+   warn "References: <$ref> too long, ignoring\n";
+   next;
+   }
+   next if $seen{$ref}++;
+   push @keep, $ref;
+   }
+   $smsg->{references} = '<'.join('> <', @keep).'>' if @keep;
+   \@keep;
+}
+
+sub add_overview {
+   my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
+   my $lines = $mime->body_raw =~ tr!\n!\n!;
+   my $smsg = bless {
+   mime => $mime,
+   mid => $mid0,
+   bytes => $bytes,
+   lines => $lines,
+   blob => $oid,
+   }, 'PublicInbox::SearchMsg';
+   my $mids = mids($mime->header_obj);
+   my $refs = $self->parse_references($smsg, $mid0, $mids);
+   my $subj = $smsg->subject;
+   my $xpath;
+   if ($subj ne '') {
+   $xpath = PublicInbox::Search::subject_path($subj);
+   $xpath = id_compress($xpath);
+   }
+   my $dd = $smsg->to_doc_data($oid, $mid0);
+   utf8::encode($dd);
+   $dd = compress($dd);
+   my $values = [ $smsg->ts, $smsg->ds, $num, $mids, $refs, $xpath, $dd ];
+   add_over($self, $values);
+}
+
 sub add_over {
my ($self, $values) = @_;
my ($ts, $ds, $num, $mids, $refs, $xpath, $ddd) = @$values;
diff --git a/lib/PublicInbox/OverIdxFork.pm b/lib/PublicInbox/OverIdxFork.pm
deleted file mode 100644
index ec96528..000
--- a/lib/PublicInbox/OverIdxFork.pm
+++ /dev/null
@@ -1,180 +0,0 @@
-# Copyright (C) 2018 all contributors <meta@public-inbox.org>
-# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
-package PublicInbox::OverIdxFork;
-use strict;
-use warnings;
-use base qw(PublicInbox::OverIdx PublicInbox::Lock);
-use Storable qw(freeze thaw);
-use IO::Handle;
-
-sub create {
-   my 

[PATCH 4/8] v2writable: reduce barriers

2018-04-06  Eric Wong (Contractor, The Linux Foundation)
Since we handle the overview info synchronously, we only need
barriers in tests, now.  We will use asynchronous checkpoints
to sync less-important Xapian data.

For data deduplication, this requires us to hoist out the
cat-blob support in ::Import for reading uncommitted data
in git.
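
As background (not from the patch): the reply that the hoisted _cat_blob()
parses is git-fast-import's documented cat-blob response, a line of the form
<sha1> SP "blob" SP <size> LF, followed by <size> bytes of content and a
trailing LF.  A tiny self-contained illustration that parses a canned reply
from an in-memory filehandle instead of a real fast-import pipe:

  use strict;
  use warnings;

  my $content = "From: a\@example.com\n\nhello\n";
  my $fake = sprintf("%040x blob %d\n%s\n",
                     0xdeadbeef, length($content), $content);
  open my $r, '<', \$fake or die "open: $!";

  defined(my $info = <$r>) or die "EOF: $!";
  $info =~ /\A[a-f0-9]{40} blob (\d+)\n\z/ or die "unexpected: $info";
  my $buf;
  read($r, $buf, $1) == $1 or die "short read";
  my $lf;
  read($r, $lf, 1) == 1 && $lf eq "\n" or die "missing trailing LF";
  print 'parsed ', length($buf), " bytes from the canned cat-blob reply\n";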
---
 lib/PublicInbox/Import.pm |  34 -
 lib/PublicInbox/V2Writable.pm | 111 --
 t/v2writable.t|   2 +-
 3 files changed, 75 insertions(+), 72 deletions(-)

diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 2529798..9e8900f 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -95,19 +95,13 @@ sub _check_path ($$$$) {
$info =~ /\Amissing / ? undef : $info;
 }
 
-sub check_remove_v1 {
-   my ($r, $w, $tip, $path, $mime) = @_;
-
-   my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef);
-   $info =~ m!\A100644 blob ([a-f0-9]{40})\t!s or die "not blob: $info";
-   my $blob = $1;
-
-   print $w "cat-blob $blob\n" or wfail;
+sub _cat_blob ($$$) {
+   my ($r, $w, $oid) = @_;
+   print $w "cat-blob $oid\n" or wfail;
local $/ = "\n";
-   $info = <$r>;
+   my $info = <$r>;
defined $info or die "EOF from fast-import / cat-blob: $!";
-   $info =~ /\A[a-f0-9]{40} blob (\d+)\n\z/ or
-   die "unexpected cat-blob response: $info";
+   $info =~ /\A[a-f0-9]{40} blob (\d+)\n\z/ or return;
my $left = $1;
my $offset = 0;
my $buf = '';
@@ -122,7 +116,23 @@ sub check_remove_v1 {
$n = read($r, my $lf, 1);
defined($n) or die "read final byte of cat-blob failed: $!";
die "bad read on final byte: <$lf>" if $lf ne "\n";
-   my $cur = PublicInbox::MIME->new(\$buf);
+   \$buf;
+}
+
+sub cat_blob {
+   my ($self, $oid) = @_;
+   my ($r, $w) = $self->gfi_start;
+   _cat_blob($r, $w, $oid);
+}
+
+sub check_remove_v1 {
+   my ($r, $w, $tip, $path, $mime) = @_;
+
+   my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef);
+   $info =~ m!\A100644 blob ([a-f0-9]{40})\t!s or die "not blob: $info";
+   my $oid = $1;
+   my $msg = _cat_blob($r, $w, $oid) or die "BUG: cat-blob $1 failed";
+   my $cur = PublicInbox::MIME->new($msg);
my $cur_s = $cur->header('Subject');
$cur_s = '' unless defined $cur_s;
my $cur_m = $mime->header('Subject');
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 8361d09..53fdb73 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -139,7 +139,6 @@ sub num_for {
};
 
# crap, Message-ID is already known, hope somebody just resent:
-   $self->barrier;
foreach my $m (@$mids) {
# read-only lookup now safe to do after above barrier
my $existing = $self->lookup_content($mime, $m);
@@ -259,10 +258,8 @@ sub purge_oids {
 
 sub remove_internal {
my ($self, $mime, $cmt_msg, $purge) = @_;
-   $self->barrier;
$self->idx_init;
my $im = $self->importer unless $purge;
-   my $ibx = $self->{-inbox};
my $over = $self->{over};
my $cid = content_id($mime);
my $parts = $self->{idx_parts};
@@ -280,7 +277,7 @@ sub remove_internal {
my %gone;
my ($id, $prev);
while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
-   my $msg = $ibx->msg_by_smsg($smsg);
+   my $msg = get_blob($self, $smsg);
if (!defined($msg)) {
warn "broken smsg for $mid\n";
next; # continue
@@ -313,7 +310,6 @@ sub remove_internal {
$orig = undef;
$self->unindex_oid_remote($oid, $mid);
}
-   $self->barrier;
}
 
if (defined $mark) {
@@ -359,45 +355,6 @@ sub set_last_commits ($) {
}
 }
 
-sub done {
-   my ($self) = @_;
-   my $im = delete $self->{im};
-   $im->done if $im; # PublicInbox::Import::done
-
-   my $mm = $self->{mm};
-   $mm->{dbh}->commit if $mm;
-
-   # order matters, we can only close {over} after all partitions
-   # are done because the partitions also write to {over}
-   my $parts = delete $self->{idx_parts};
-   if ($parts) {
-   $_->remote_commit for @$parts;
-   $_->remote_close for @$parts;
-   }
-
-   my $over = $self->{over};
-   $over->commit_lazy;
-   $over->disconnect;
-
-   if ($mm) {
-   $mm->{dbh}->begin_work;
-   set_last_commits($self);
-   $mm->{dbh}->commit;
-   delete $self->{mm};
-   }
-
-   delete $self->{bnote};
-   $self->{transact_bytes} = 0;
-   $self->lock_release 

[PATCH 7/8] store less data in the Xapian document

2018-04-06  Eric Wong (Contractor, The Linux Foundation)
Since we only query the SQLite over DB for OVER/XOVER, we do not
need to waste space storing the To/Cc/:bytes/:lines fields or the
XNUM term.  We only use From/Subject/References/Message-ID/:blob
in various places of the PSGI code.

For reindexing, we will take advantage of docid stability
in "xapian-compact --no-renumber" to ensure duplicates do not
show up in search results.  Since the PSGI interface is the
only consumer of Xapian at the moment, it has no need to
search based on NNTP article number.
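
Not part of this patch: the reindexing plan leans on xapian-compact(1)'s
--no-renumber option, which preserves document ids in the compacted output so
a later reindex can replace documents in place rather than duplicating them.
A sketch of invoking it (the source/destination paths are examples, not
public-inbox's real on-disk layout):

  use strict;
  use warnings;

  my ($src, $dst) = ('xapian-old', 'xapian-compacted');  # example paths
  my @cmd = ('xapian-compact', '--no-renumber', $src, $dst);
  system(@cmd) == 0 or die "@cmd failed: exit=$?\n";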
---
 lib/PublicInbox/NNTP.pm   |  2 +-
 lib/PublicInbox/OverIdx.pm|  6 +++---
 lib/PublicInbox/SearchIdx.pm  | 37 -
 lib/PublicInbox/SearchMsg.pm  |  6 ++
 lib/PublicInbox/V2Writable.pm |  2 +-
 script/public-inbox-compact   |  6 +++---
 t/search.t| 24 +---
 t/v2writable.t|  7 ---
 8 files changed, 31 insertions(+), 59 deletions(-)

diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index fa890cb..ace56e7 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -725,7 +725,7 @@ sub hdr_searchmsg ($$$$) {
my $nr = scalar @$msgs or return;
my $tmp = '';
foreach my $s (@$msgs) {
-   $tmp .= $s->num . ' ' . $s->$field . "\r\n";
+   $tmp .= $s->{num} . ' ' . $s->$field . "\r\n";
}
utf8::encode($tmp);
do_more($self, $tmp);
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 08f8744..62fec0d 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -207,8 +207,8 @@ sub link_refs {
$tid;
 }
 
-sub parse_references ($$$$) {
-   my ($self, $smsg, $mid0, $mids) = @_;
+sub parse_references ($$$) {
+   my ($smsg, $mid0, $mids) = @_;
my $mime = $smsg->{mime};
my $hdr = $mime->header_obj;
my $refs = references($hdr);
@@ -241,7 +241,7 @@ sub add_overview {
blob => $oid,
}, 'PublicInbox::SearchMsg';
my $mids = mids($mime->header_obj);
-   my $refs = $self->parse_references($smsg, $mid0, $mids);
+   my $refs = parse_references($smsg, $mid0, $mids);
my $subj = $smsg->subject;
my $xpath;
if ($subj ne '') {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 7cfa745..f9b40b0 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -273,18 +273,12 @@ sub add_message {
my $smsg = PublicInbox::SearchMsg->new($mime);
my $doc = $smsg->{doc};
my $subj = $smsg->subject;
-
-   $smsg->{lines} = $mime->body_raw =~ tr!\n!\n!;
-   defined $bytes or $bytes = length($mime->as_string);
-   $smsg->{bytes} = $bytes;
-
add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
my @ds = gmtime($smsg->ds);
my $mmdd = strftime('%Y%m%d', @ds);
add_val($doc, PublicInbox::Search::MMDD(), $mmdd);
my $dt = strftime('%Y%m%d%H%M%S', @ds);
add_val($doc, PublicInbox::Search::DT(), $dt);
-   my @vals = ($smsg->{ts}, $smsg->{ds});
 
my $tg = $self->term_generator;
 
@@ -333,11 +327,11 @@ sub add_message {
index_body($tg, \@orig, $doc) if @orig;
});
 
-   # populates smsg->references for smsg->to_doc_data
-   my $data = $smsg->to_doc_data($oid, $mid0);
foreach my $mid (@$mids) {
$tg->index_text($mid, 1, 'XM');
}
+   $smsg->{to} = $smsg->{cc} = '';
+   my $data = $smsg->to_doc_data($oid, $mid0);
$doc->set_data($data);
if (my $altid = $self->{-altid}) {
foreach my $alt (@$altid) {
@@ -350,24 +344,11 @@ sub add_message {
}
}
 
-   $self->delete_article($num) if defined $num; # for reindexing
-
if (my $over = $self->{over}) {
-   utf8::encode($data);
-   $data = compress($data);
-   my $refs = $over->parse_references($smsg, $mid0, $mids);
-   my $xpath;
-   if ($subj ne '') {
-   $xpath = $self->subject_path($subj);
-   $xpath = id_compress($xpath);
-   }
-
-   push @vals, $num, $mids, $refs, $xpath, $data;
-   $over->add_over(\@vals);
+   $over->add_overview($mime, $bytes, $num, $oid, $mid0);
}
$doc->add_boolean_term('Q' . $_) foreach @$mids;
-   $doc->add_boolean_term('XNUM' . $num) if defined $num;
- 

[PATCH 1/8] psgi: ensure /$INBOX/$MESSAGE_ID/T/ endpoint is chronological

2018-04-06  Eric Wong (Contractor, The Linux Foundation)
We only need get_thread to go beyond the first 1000 messages
when fetching entire mboxes; more than that is probably too much
for the HTML display anyway.
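
Not from the patch itself: a usage sketch of the paging convention these
hunks establish ($srch and $mid are assumed to already exist, $srch being the
search object the PSGI code holds, as in the Mbox.pm hunk).  Presumably the
HTML /T/ view calls get_thread without a $prev argument and so gets date (ds)
ordering, which is what keeps the endpoint chronological, while the mbox
generator passes a hashref from the start so Over.pm pages by article number:

  # mbox download: first page with an empty hashref, then keep passing the
  # last row back so the next page resumes at num > $prev->{num}
  my $msgs = $srch->get_thread($mid, {});
  while (@$msgs) {
          for my $smsg (@$msgs) {
                  # ... append $smsg to the mbox being generated ...
          }
          my $prev = $msgs->[-1];
          $msgs = $srch->get_thread($mid, $prev);
  }

  # HTML /T/ view (not shown in this patch): no $prev, so rows come back
  # ordered by ds and the thread renders chronologically
  # my $thread = $srch->get_thread($mid);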
---
 lib/PublicInbox/Mbox.pm |  6 +++---
 lib/PublicInbox/Over.pm | 18 +++---
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index 4427ae5..11b2302 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -130,9 +130,9 @@ sub thread_mbox {
eval { require IO::Compress::Gzip };
return sub { need_gzip(@_) } if $@;
my $mid = $ctx->{mid};
-   my $msgs = $srch->get_thread($mid, 0);
+   my $msgs = $srch->get_thread($mid, {});
return [404, [qw(Content-Type text/plain)], []] if !@$msgs;
-   my $prev = $msgs->[-1]->{num};
+   my $prev = $msgs->[-1];
my $i = 0;
my $cb = sub {
while (1) {
@@ -142,7 +142,7 @@ sub thread_mbox {
# refill result set
$msgs = $srch->get_thread($mid, $prev);
return unless @$msgs;
-   $prev = $msgs->[-1]->{num};
+   $prev = $msgs->[-1];
$i = 0;
}
};
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index 309e044..da0f11e 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -93,16 +93,20 @@ ORDER BY num ASC LIMIT 1
 SELECT tid,sid FROM over WHERE num = ? LIMIT 1
 
defined $tid or return nothing; # $sid may be undef
-
-   $prev ||= 0;
-   my $cond = 'FROM over WHERE (tid = ? OR sid = ?) AND num > ?';
-   my $msgs = do_get($self, <<"", {}, $tid, $sid, $prev);
-SELECT * $cond ORDER BY num ASC
+   my $sort_col = 'ds';
+   $num = 0;
+   if ($prev) {
+   $num = $prev->{num} || 0;
+   $sort_col = 'num';
+   }
+   my $cond = '(tid = ? OR sid = ?) AND num > ?';
+   my $msgs = do_get($self, <<"", {}, $tid, $sid, $num);
+SELECT num,ts,ds,ddd FROM over WHERE $cond ORDER BY $sort_col ASC
 
return $msgs unless wantarray;
 
-   my $nr = $dbh->selectrow_array(<<"", undef, $tid, $sid, $prev);
-SELECT COUNT(num) $cond
+   my $nr = $dbh->selectrow_array(<<"", undef, $tid, $sid, $num);
+SELECT COUNT(num) FROM over WHERE $cond
 
($nr, $msgs);
 }
-- 
EW
