The termlist is already sorted, so this is the patch trying to minimize
the modification of database as suggested in the comment and Carl's
TODO file.

My poor profiling shows not much, but some improvement.

*Before*
 
% time notmuch tag +test id:hfntnu+g...@egroups.com   
MOD_PLISTS: 368
notmuch tag +test id:hfntnu+g...@egroups.com  0.05s user 0.03s system 11% cpu 
0.673 total

% time notmuch tag -test id:hfntnu+g...@egroups.com   
MOD_PLISTS: 368
notmuch tag -test id:hfntnu+g...@egroups.com  0.06s user 0.01s system 10% cpu 
0.681 total
 
*After*
 
% time notmuch tag +test id:hfntnu+g...@egroups.com                
MOD_PLIST: 1
notmuch tag +test id:hfntnu+g...@egroups.com  0.01s user 0.02s system 6% cpu 
0.436 total

% time notmuch tag -test id:hfntnu+g...@egroups.com                
MOD_PLIST: 1
notmuch tag -test id:hfntnu+g...@egroups.com  0.01s user 0.01s system 5% cpu 
0.383 total


% time notmuch tag +test tag:notmuch
notmuch tag +test tag:notmuch  1.71s user 0.03s system 65% cpu 2.632 total

% time notmuch tag -test tag:notmuch
notmuch tag -test tag:notmuch  1.61s user 0.02s system 73% cpu 2.204 total

% notmuch count tag:notmuch
682

--- flint_database.cc   2009-12-08 13:34:24.790284881 +0800
+++ flint_database.cc   2009-12-10 14:22:14.493653956 +0800
@@ -1188,7 +1188,7 @@
 
        termlist.next();
        while (!termlist.at_end()) {
-           string tname = termlist.get_termname();
+            string tname = termlist.get_termname();
            position_table.delete_positionlist(did, tname);
            termcount wdf = termlist.get_wdf();
 
@@ -1278,20 +1278,50 @@
        }
   
        if (!modifying || document.internal->terms_modified()) {
-           // FIXME - in the case where there is overlap between the new
-           // termlist and the old termlist, it would be better to compare the
-           // two lists, and make the minimum set of modifications required.
-           // This would lead to smaller changesets for replication, and
-           // probably be faster overall.
-
-           // First, add entries to remove the postings in the underlying 
record.
            Xapian::Internal::RefCntPtr<const FlintWritableDatabase> 
ptrtothis(this);
            FlintTermList termlist(ptrtothis, did);
+            Xapian::TermIterator term = document.termlist_begin();
+           Xapian::TermIterator term_end = document.termlist_end();
+            flint_doclen_t new_doclen = termlist.get_doclength();
+            string old_tname, new_tname;
+            
+            total_length -= new_doclen;
+            
+            termlist.next();
+            while (true) {
+              bool identical = false;
+              int cmp;
+              if (termlist.at_end() && term == term_end)
+                break;
+              if (!termlist.at_end() && term != term_end) {
+                old_tname = termlist.get_termname();
+                new_tname = *term;
+                cmp = old_tname.compare(new_tname);
+
+                // Check postlist to see whether they are identical
+                if (cmp == 0) {
+                  int new_count = term.positionlist_count();
+                  int old_count = termlist.positionlist_count();
+                  if (old_count == new_count) {
+                    PositionIterator it = term.positionlist_begin();
+                    PositionIterator it_end = term.positionlist_end();
+                    PositionIterator old = termlist.positionlist_begin();
+                    if (equal(it, it_end, old))
+                      identical = true;
+                  }
+                }
+              } else if (termlist.at_end()) {
+                cmp = 2;
+                new_tname = *term;
+              } else {
+                cmp = -2;
+                old_tname = termlist.get_termname();
+              }
 
-           termlist.next();
-           while (!termlist.at_end()) {
-               string tname = termlist.get_termname();
+              if (cmp < 0) {
+                const string& tname = old_tname;
                termcount wdf = termlist.get_wdf();
+                new_doclen -= wdf;
 
                map<string, pair<termcount_diff, termcount_diff> >::iterator i;
                i = freq_deltas.find(tname);
@@ -1318,58 +1348,62 @@
                    // Modifying a document we added/modified since the last 
flush.
                    k->second = make_pair('D', 0u);
                }
-
-               termlist.next();
-           }
-
-           total_length -= termlist.get_doclength();
-
-           flint_doclen_t new_doclen = 0;
-           Xapian::TermIterator term = document.termlist_begin();
-           Xapian::TermIterator term_end = document.termlist_end();
-           for ( ; term != term_end; ++term) {
-               // Calculate the new document length
+              } else if (!identical) {
+                const string& tname = new_tname;
                termcount wdf = term.get_wdf();
-               new_doclen += wdf;
-
-               string tname = *term;
-               if (tname.size() > MAX_SAFE_TERM_LENGTH)
-                   throw Xapian::InvalidArgumentError("Term too long (> 
"STRINGIZE(MAX_SAFE_TERM_LENGTH)"): " + tname);
-               map<string, pair<termcount_diff, termcount_diff> >::iterator i;
-               i = freq_deltas.find(tname);
-               if (i == freq_deltas.end()) {
-                   freq_deltas.insert(make_pair(tname, make_pair(1, 
termcount_diff(wdf))));
-               } else {
-                   ++i->second.first;
-                   i->second.second += wdf;
-               }
+                new_doclen += wdf;
+                
+                if (cmp > 0) {
+                  if (tname.size() > MAX_SAFE_TERM_LENGTH)
+                    throw Xapian::InvalidArgumentError("Term too long (> 
"STRINGIZE(MAX_SAFE_TERM_LENGTH)"): " + tname);
+                  map<string, pair<termcount_diff, termcount_diff> >::iterator 
i;
+                  i = freq_deltas.find(tname);
+                  if (i == freq_deltas.end()) {
+                    freq_deltas.insert(make_pair(tname, make_pair(1, 
termcount_diff(wdf))));
+                  } else {
+                    ++i->second.first;
+                    i->second.second += wdf;
+                  }
+
+                  // Add did to tname's postlist
+                  map<string, map<docid, pair<char, termcount> > >::iterator j;
+                  j = mod_plists.find(tname);
+                  if (j == mod_plists.end()) {
+                    map<docid, pair<char, termcount> > m;
+                    j = mod_plists.insert(make_pair(tname, m)).first;
+                  }
+                  map<docid, pair<char, termcount> >::iterator k;
+                  k = j->second.find(did);
+                  if (k != j->second.end()) {
+                    Assert(k->second.first == 'D');
+                    k->second.first = 'M';
+                    k->second.second = wdf;
+                  } else {
+                    j->second.insert(make_pair(did, make_pair('A', wdf)));
+                  }
+                }
+
+                PositionIterator it = term.positionlist_begin();
+                PositionIterator it_end = term.positionlist_end();
+                if (it != it_end) {
+                  position_table.set_positionlist(
+                                                  did, tname, it, it_end);
+                } else {
+                  position_table.delete_positionlist(did, tname);
+                }
+              }
+              if (termlist.at_end())
+                ++term;
+              else if (term == term_end)
+                termlist.next();
+              else {
+                if (cmp >= 0)
+                  ++term;
+                if (cmp <= 0)
+                  termlist.next();
+              }
+            }
 
-               // Add did to tname's postlist
-               map<string, map<docid, pair<char, termcount> > >::iterator j;
-               j = mod_plists.find(tname);
-               if (j == mod_plists.end()) {
-                   map<docid, pair<char, termcount> > m;
-                   j = mod_plists.insert(make_pair(tname, m)).first;
-               }
-               map<docid, pair<char, termcount> >::iterator k;
-               k = j->second.find(did);
-               if (k != j->second.end()) {
-                   Assert(k->second.first == 'D');
-                   k->second.first = 'M';
-                   k->second.second = wdf;
-               } else {
-                   j->second.insert(make_pair(did, make_pair('A', wdf)));
-               }
-
-               PositionIterator it = term.positionlist_begin();
-               PositionIterator it_end = term.positionlist_end();
-               if (it != it_end) {
-                   position_table.set_positionlist(
-                       did, tname, it, it_end);
-               } else {
-                   position_table.delete_positionlist(did, tname);
-               }
-           }
            LOGLINE(DB, "Calculated doclen for replacement document " << did << 
" as " << new_doclen);
 
            // Set the termlist


-- 
Kan-Ru Chen | http://kanru.info

Q: Why are my replies five sentences or less?
A: http://five.sentenc.es/

Attachment: pgpQudZ7dJHFR.pgp
Description: PGP signature

_______________________________________________
notmuch mailing list
notmuch@notmuchmail.org
http://notmuchmail.org/mailman/listinfo/notmuch

Reply via email to