I have been looking at the I/O patterns of "notmuch search" with the default output format and noticed that it has to parse the maildir file of every matched message to get the From and Subject headers. I figured that this must be slowing things down, especially when the files are not in the filesystem cache.
So I wanted to see how much difference it would make to have the From and Subject stored in Xapian to avoid this parsing. With the attached patch I get a speedup of 2x with cached files and almost 10x with uncached files for searches with many matches. The attached patch is only intended as a proof of concept. I am not familiar with Xapian, so I wasn't sure whether this kind of data should be stored as terms, values or data. I went with values simply because I saw that the message-id and timestamp were already stored that way. Perhaps storing the fields as data would be more appropriate, since they are not used for searching or sorting. Oh, and for some reason I get a blank Subject for about 1% of the matches. Is there a downside to this approach? The only one I see is that the Xapian db size increases by about 1%, but to me the speed increase would be well worth it.
diff --git a/lib/database.cc b/lib/database.cc index 7f79cf4..5f7f197 100644 --- a/lib/database.cc +++ b/lib/database.cc @@ -1654,7 +1654,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch, goto DONE; date = notmuch_message_file_get_header (message_file, "date"); - _notmuch_message_set_date (message, date); + _notmuch_message_set_header_values (message, date, from, subject); _notmuch_message_index_file (message, filename); } else { diff --git a/lib/message.cc b/lib/message.cc index ecda75a..8c85c40 100644 --- a/lib/message.cc +++ b/lib/message.cc @@ -726,6 +726,14 @@ notmuch_message_get_date (notmuch_message_t *message) return Xapian::sortable_unserialise (value); } +const char * +_notmuch_message_get_header_value (notmuch_message_t *message,int valuetag) +{ + std::string value; + value = message->doc.get_value (valuetag); + return value.c_str(); +} + notmuch_tags_t * notmuch_message_get_tags (notmuch_message_t *message) { @@ -762,8 +770,10 @@ notmuch_message_set_author (notmuch_message_t *message, } void -_notmuch_message_set_date (notmuch_message_t *message, - const char *date) +_notmuch_message_set_header_values (notmuch_message_t *message, + const char *date, + const char *from, + const char *subject) { time_t time_value; @@ -776,6 +786,8 @@ _notmuch_message_set_date (notmuch_message_t *message, message->doc.add_value (NOTMUCH_VALUE_TIMESTAMP, Xapian::sortable_serialise (time_value)); + message->doc.add_value (NOTMUCH_VALUE_FROM, from); + message->doc.add_value (NOTMUCH_VALUE_SUBJECT, subject); } /* Synchronize changes made to message->doc out into the database. */ diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h index 0856751..ef6348a 100644 --- a/lib/notmuch-private.h +++ b/lib/notmuch-private.h @@ -105,7 +105,9 @@ _internal_error (const char *format, ...) 
PRINTF_ATTRIBUTE (1, 2); typedef enum { NOTMUCH_VALUE_TIMESTAMP = 0, - NOTMUCH_VALUE_MESSAGE_ID + NOTMUCH_VALUE_MESSAGE_ID, + NOTMUCH_VALUE_FROM, + NOTMUCH_VALUE_SUBJECT } notmuch_value_t; /* Xapian (with flint backend) complains if we provide a term longer @@ -281,8 +283,14 @@ void _notmuch_message_ensure_thread_id (notmuch_message_t *message); void -_notmuch_message_set_date (notmuch_message_t *message, - const char *date); +_notmuch_message_set_header_values (notmuch_message_t *message, + const char *date, + const char *from, + const char *subject); +const char * +_notmuch_message_get_header_value (notmuch_message_t *message, + int valuetag); + void _notmuch_message_sync (notmuch_message_t *message); diff --git a/lib/thread.cc b/lib/thread.cc index ace5ce7..636a3dc 100644 --- a/lib/thread.cc +++ b/lib/thread.cc @@ -231,7 +231,8 @@ _thread_add_message (notmuch_thread_t *thread, xstrdup (notmuch_message_get_message_id (message)), message); - from = notmuch_message_get_header (message, "from"); + from = _notmuch_message_get_header_value(message,NOTMUCH_VALUE_FROM); + //notmuch_message_get_header (message, "from"); if (from) list = internet_address_list_parse_string (from); @@ -253,7 +254,8 @@ _thread_add_message (notmuch_thread_t *thread, if (! thread->subject) { const char *subject; - subject = notmuch_message_get_header (message, "subject"); + subject = _notmuch_message_get_header_value(message,NOTMUCH_VALUE_SUBJECT); + // subject = notmuch_message_get_header (message, "subject"); thread->subject = talloc_strdup (thread, subject ? subject : ""); } @@ -273,7 +275,8 @@ _thread_set_subject_from_message (notmuch_thread_t *thread, const char *subject; const char *cleaned_subject; - subject = notmuch_message_get_header (message, "subject"); + subject = _notmuch_message_get_header_value(message,NOTMUCH_VALUE_SUBJECT); + // subject = notmuch_message_get_header (message, "subject"); if (! subject) return;
-- Istvan
_______________________________________________ notmuch mailing list notmuch@notmuchmail.org http://notmuchmail.org/mailman/listinfo/notmuch