I have been looking at the I/O patterns of "notmuch search" with the
default output format and noticed that it has to parse the maildir file
of every matched message to get the From and Subject headers. I figured
that this must be slowing things down, especially when the files are not
in the filesystem cache.

So I wanted to see how much difference it would make to have the From
and Subject stored in Xapian to avoid this parsing.

With the attached patch I get a speedup of 2x with cached files and almost
10x with uncached files for searches with many matches.

The attached patch is only intended as a proof of concept. I am not
familiar with Xapian, so I wasn't sure whether this kind of data should be
stored as terms, values or document data. I went with values simply because
I saw that message-id and timestamp were already stored that way. Perhaps
document data would be more appropriate, since the fields are not used for
searching or sorting. Oh, and for some reason I get a blank Subject for
about 1% of the matches.


Is there a downside to this approach? The only one I see is that the
Xapian db size increases by about 1%, but to me the speed increase would
be well worth it.


diff --git a/lib/database.cc b/lib/database.cc
index 7f79cf4..5f7f197 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -1654,7 +1654,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
 		goto DONE;
 
 	    date = notmuch_message_file_get_header (message_file, "date");
-	    _notmuch_message_set_date (message, date);
+	    _notmuch_message_set_header_values (message, date, from, subject);
 
 	    _notmuch_message_index_file (message, filename);
 	} else {
diff --git a/lib/message.cc b/lib/message.cc
index ecda75a..8c85c40 100644
--- a/lib/message.cc
+++ b/lib/message.cc
@@ -726,6 +726,14 @@ notmuch_message_get_date (notmuch_message_t *message)
     return Xapian::sortable_unserialise (value);
 }
 
+const char *
+_notmuch_message_get_header_value (notmuch_message_t *message,int valuetag)
+{
+    std::string value;
+    value = message->doc.get_value (valuetag);
+    return value.c_str();
+}
+
 notmuch_tags_t *
 notmuch_message_get_tags (notmuch_message_t *message)
 {
@@ -762,8 +770,10 @@ notmuch_message_set_author (notmuch_message_t *message,
 }
 
 void
-_notmuch_message_set_date (notmuch_message_t *message,
-			   const char *date)
+_notmuch_message_set_header_values (notmuch_message_t *message,
+				    const char *date,
+				    const char *from,
+				    const char *subject)
 {
     time_t time_value;
 
@@ -776,6 +786,8 @@ _notmuch_message_set_date (notmuch_message_t *message,
 
     message->doc.add_value (NOTMUCH_VALUE_TIMESTAMP,
 			    Xapian::sortable_serialise (time_value));
+    message->doc.add_value (NOTMUCH_VALUE_FROM, from);
+    message->doc.add_value (NOTMUCH_VALUE_SUBJECT, subject);
 }
 
 /* Synchronize changes made to message->doc out into the database. */
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 0856751..ef6348a 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -105,7 +105,9 @@ _internal_error (const char *format, ...) PRINTF_ATTRIBUTE (1, 2);
 
 typedef enum {
     NOTMUCH_VALUE_TIMESTAMP = 0,
-    NOTMUCH_VALUE_MESSAGE_ID
+    NOTMUCH_VALUE_MESSAGE_ID,
+    NOTMUCH_VALUE_FROM,
+    NOTMUCH_VALUE_SUBJECT
 } notmuch_value_t;
 
 /* Xapian (with flint backend) complains if we provide a term longer
@@ -281,8 +283,14 @@ void
 _notmuch_message_ensure_thread_id (notmuch_message_t *message);
 
 void
-_notmuch_message_set_date (notmuch_message_t *message,
-			   const char *date);
+_notmuch_message_set_header_values (notmuch_message_t *message,
+				    const char *date,
+				    const char *from,
+				    const char *subject);
+const char *
+_notmuch_message_get_header_value (notmuch_message_t *message,
+				   int valuetag);
+
 
 void
 _notmuch_message_sync (notmuch_message_t *message);
diff --git a/lib/thread.cc b/lib/thread.cc
index ace5ce7..636a3dc 100644
--- a/lib/thread.cc
+++ b/lib/thread.cc
@@ -231,7 +231,8 @@ _thread_add_message (notmuch_thread_t *thread,
 			 xstrdup (notmuch_message_get_message_id (message)),
 			 message);
 
-    from = notmuch_message_get_header (message, "from");
+    from = _notmuch_message_get_header_value(message,NOTMUCH_VALUE_FROM);
+      //notmuch_message_get_header (message, "from");
     if (from)
 	list = internet_address_list_parse_string (from);
 
@@ -253,7 +254,8 @@ _thread_add_message (notmuch_thread_t *thread,
 
     if (! thread->subject) {
 	const char *subject;
-	subject = notmuch_message_get_header (message, "subject");
+	subject = _notmuch_message_get_header_value(message,NOTMUCH_VALUE_SUBJECT);
+	// subject = notmuch_message_get_header (message, "subject");
 	thread->subject = talloc_strdup (thread, subject ? subject : "");
     }
 
@@ -273,7 +275,8 @@ _thread_set_subject_from_message (notmuch_thread_t *thread,
     const char *subject;
     const char *cleaned_subject;
 
-    subject = notmuch_message_get_header (message, "subject");
+    subject = _notmuch_message_get_header_value(message,NOTMUCH_VALUE_SUBJECT);
+    // subject = notmuch_message_get_header (message, "subject");
     if (! subject)
 	return;
 
-- 
        Istvan
_______________________________________________
notmuch mailing list
notmuch@notmuchmail.org
http://notmuchmail.org/mailman/listinfo/notmuch

Reply via email to