Xapian has always sorted termlist iterators, so we now:

1) break out of the iterator loop early on non-matches
2) avoid doing sorting ourselves

As a result, we'll also favor the wantarray forms of xap_terms
and all_terms to preserve sort order in most cases.

Confirmed by the Xapian maintainer: <20231201184844.go4...@survex.com>

Link: https://lists.xapian.org/pipermail/xapian-discuss/2023-December/010013.html
---
 lib/PublicInbox/LeiInspect.pm |  1 -
 lib/PublicInbox/Search.pm     | 19 ++++++++++---------
 lib/PublicInbox/SearchIdx.pm  | 13 ++++++-------
 lib/PublicInbox/xh_cidx.h     | 15 +++++----------
 lib/PublicInbox/xh_mset.h     |  2 +-
 5 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm
index d4ad03eb..88d7949c 100644
--- a/lib/PublicInbox/LeiInspect.pm
+++ b/lib/PublicInbox/LeiInspect.pm
@@ -97,7 +97,6 @@ sub _inspect_doc ($$) {
                my $term = ($1 // '');
                push @{$ent->{terms}->{$term}}, $tn;
        }
-       @$_ = sort(@$_) for values %{$ent->{terms} // {}};
        $cur = $doc->values_begin;
        $end = $doc->values_end;
        for (; $cur != $end; $cur++) {
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 8ef17d58..678c8c5d 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -614,16 +614,16 @@ sub get_pct ($) { # mset item
 
 sub xap_terms ($$;@) {
        my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty ()
-       my %ret;
        my $end = $xdb_or_doc->termlist_end(@docid);
        my $cur = $xdb_or_doc->termlist_begin(@docid);
+       $cur->skip_to($pfx);
+       my (@ret, $tn);
+       my $pfxlen = length($pfx);
        for (; $cur != $end; $cur++) {
-               $cur->skip_to($pfx);
-               last if $cur == $end;
-               my $tn = $cur->get_termname;
-               $ret{substr($tn, length($pfx))} = undef if !index($tn, $pfx);
+               $tn = $cur->get_termname;
+               index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen));
        }
-       wantarray ? sort(keys(%ret)) : \%ret;
+       wantarray ? @ret : +{ map { $_ => undef } @ret };
 }
 
 # get combined docid from over.num:
@@ -638,11 +638,12 @@ sub all_terms {
        my ($self, $pfx) = @_;
        my $cur = xdb($self)->allterms_begin($pfx);
        my $end = $self->{xdb}->allterms_end($pfx);
-       my %ret;
+       my $pfxlen = length($pfx);
+       my @ret;
        for (; $cur != $end; $cur++) {
-               $ret{substr($cur->get_termname, length($pfx))} = undef;
+               push @ret, substr($cur->get_termname, $pfxlen);
        }
-       wantarray ? (sort keys %ret) : \%ret;
+       wantarray ? @ret : +{ map { $_ => undef } @ret };
 }
 
 sub xh_args { # prep getopt args to feed to xap_helper.h socket
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 1bf471fc..1ac8e33e 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -42,7 +42,7 @@ my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/;
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
-my @VMD_MAP = (kw => 'K', L => 'L');
+my @VMD_MAP = (kw => 'K', L => 'L'); # value order matters
 our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/;
 
 sub new {
@@ -608,17 +608,16 @@ sub set_vmd {
        my ($self, $docid, $vmd) = @_;
        begin_txn_lazy($self);
        my $doc = _get_doc($self, $docid) or return;
-       my ($end, @rm, @add);
+       my ($v, @rm, @add);
        my @x = @VMD_MAP;
+       my ($cur, $end) = ($doc->termlist_begin, $doc->termlist_end);
        while (my ($field, $pfx) = splice(@x, 0, 2)) {
                my $set = $vmd->{$field} // next;
                my %keep = map { $_ => 1 } @$set;
                my %add = %keep;
-               $end //= $doc->termlist_end;
-               for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) {
-                       $cur->skip_to($pfx);
-                       last if $cur == $end;
-                       my $v = $cur->get_termname;
+               $cur->skip_to($pfx); # works due to @VMD_MAP order
+               for (; $cur != $end; $cur++) {
+                       $v = $cur->get_termname;
                        $v =~ s/\A$pfx//s or next;
                        $keep{$v} ? delete($add{$v}) : push(@rm, $pfx.$v);
                }
diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h
index 1980f9f6..2803b3a4 100644
--- a/lib/PublicInbox/xh_cidx.h
+++ b/lib/PublicInbox/xh_cidx.h
@@ -12,12 +12,9 @@ static void dump_ibx_term(struct req *req, const char *pfx,
 
        for (cur.skip_to(pfx); cur != end; cur++) {
                std::string tn = *cur;
-
-               if (starts_with(&tn, pfx, pfx_len)) {
-                       fprintf(req->fp[0], "%s %s\n",
-                               tn.c_str() + pfx_len, ibx_id);
-                       ++req->nr_out;
-               }
+               if (!starts_with(&tn, pfx, pfx_len)) break;
+               fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id);
+               ++req->nr_out;
        }
 }
 
@@ -95,8 +92,7 @@ static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc)
        fbuf_init(root_offs);
        for (cur.skip_to("G"); cur != end; cur++) {
                std::string tn = *cur;
-               if (!starts_with(&tn, "G", 1))
-                       continue;
+               if (!starts_with(&tn, "G", 1)) break;
                union { const char *in; char *out; } u;
                u.in = tn.c_str() + 1;
                e.key = u.out;
@@ -125,8 +121,7 @@ static void dump_roots_term(struct req *req, const char *pfx,
 
        for (cur.skip_to(pfx); cur != end; cur++) {
                std::string tn = *cur;
-               if (!starts_with(&tn, pfx, pfx_len))
-                       continue;
+               if (!starts_with(&tn, pfx, pfx_len)) break;
                fputs(tn.c_str() + pfx_len, drt->wbuf.fp);
                fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp);
                ++req->nr_out;
diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h
index 056fe22b..4e97a284 100644
--- a/lib/PublicInbox/xh_mset.h
+++ b/lib/PublicInbox/xh_mset.h
@@ -11,7 +11,7 @@ static void emit_doc_term(FILE *fp, const char *pfx, Xapian::Document *doc)
 
        for (cur.skip_to(pfx); cur != end; cur++) {
                std::string tn = *cur;
-               if (!starts_with(&tn, pfx, pfx_len)) continue;
+               if (!starts_with(&tn, pfx, pfx_len)) break;
                fputc(0, fp);
                fwrite(tn.data(), tn.size(), 1, fp);
        }

Reply via email to