I have been looking at the I/O patterns of "notmuch search" with the
default output format and noticed that it has to parse the maildir file
of every matched message to get the From and Subject headers. I figured
that this must be slowing things down, especially when the files are not
in the filesystem cache.
So I wanted to see how much difference it would make to have the From
and Subject stored in xapian to avoid this parsing.
With the attached patch I get a speedup of 2x with cached and almost 10x
with uncached files for searches with many matches.
The attached patch is only intended as proof of concept. I am not
familiar with xapian so I wasn't sure if this kind of data should be
stored as terms, values or data. I went with values simply because I saw
that message-id and timestamp were already stored that way. Perhaps
document data would be more appropriate since the fields are not used for
searching or sorting. Oh, and for some reason I get a blank Subject for
about 1% of the matches.
Is there a downside to this approach? The only one I see is that the
xapian db size increases by about 1% but to me the speed increase would
be well worth it.
diff --git a/lib/database.cc b/lib/database.cc
index 7f79cf4..5f7f197 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -1654,7 +1654,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
goto DONE;
date = notmuch_message_file_get_header (message_file, "date");
- _notmuch_message_set_date (message, date);
+ _notmuch_message_set_header_values (message, date, from, subject);
_notmuch_message_index_file (message, filename);
} else {
diff --git a/lib/message.cc b/lib/message.cc
index ecda75a..8c85c40 100644
--- a/lib/message.cc
+++ b/lib/message.cc
@@ -726,6 +726,14 @@ notmuch_message_get_date (notmuch_message_t *message)
return Xapian::sortable_unserialise (value);
}
+const char *
+_notmuch_message_get_header_value (notmuch_message_t *message, int valuetag)
+{
+    /* Return the Xapian value stored in slot 'valuetag' as a talloc copy
+     * parented on 'message'; c_str() of a local std::string would dangle. */
+    return talloc_strdup (message, message->doc.get_value (valuetag).c_str ());
+}
+
notmuch_tags_t *
notmuch_message_get_tags (notmuch_message_t *message)
{
@@ -762,8 +770,10 @@ notmuch_message_set_author (notmuch_message_t *message,
}
void
-_notmuch_message_set_date (notmuch_message_t *message,
- const char *date)
+_notmuch_message_set_header_values (notmuch_message_t *message,
+ const char *date,
+ const char *from,
+ const char *subject)
{
time_t time_value;
@@ -776,6 +786,8 @@ _notmuch_message_set_date (notmuch_message_t *message,
message->doc.add_value (NOTMUCH_VALUE_TIMESTAMP,
Xapian::sortable_serialise (time_value));
+ message->doc.add_value (NOTMUCH_VALUE_FROM, from);
+ message->doc.add_value (NOTMUCH_VALUE_SUBJECT, subject);
}
/* Synchronize changes made to message->doc out into the database. */
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 0856751..ef6348a 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -105,7 +105,9 @@ _internal_error (const char *format, ...) PRINTF_ATTRIBUTE (1, 2);
typedef enum {
NOTMUCH_VALUE_TIMESTAMP = 0,
- NOTMUCH_VALUE_MESSAGE_ID
+ NOTMUCH_VALUE_MESSAGE_ID,
+ NOTMUCH_VALUE_FROM,
+ NOTMUCH_VALUE_SUBJECT
} notmuch_value_t;
/* Xapian (with flint backend) complains if we provide a term longer
@@ -281,8 +283,14 @@ void
_notmuch_message_ensure_thread_id (notmuch_message_t *message);
void
-_notmuch_message_set_date (notmuch_message_t *message,
- const char *date);
+_notmuch_message_set_header_values (notmuch_message_t *message,
+ const char *date,
+ const char *from,
+ const char *subject);
+const char *
+_notmuch_message_get_header_value (notmuch_message_t *message,
+ int valuetag);
+
void
_notmuch_message_sync (notmuch_message_t *message);
diff --git a/lib/thread.cc b/lib/thread.cc
index ace5ce7..636a3dc 100644
--- a/lib/thread.cc
+++ b/lib/thread.cc
@@ -231,7 +231,8 @@ _thread_add_message (notmuch_thread_t *thread,
xstrdup (notmuch_message_get_message_id (message)),
message);
- from = notmuch_message_get_header (message, "from");
+ from = _notmuch_message_get_header_value(message,NOTMUCH_VALUE_FROM);
+ //notmuch_message_get_header (message, "from");
if (from)
list = internet_address_list_parse_string (from);
@@ -253,7 +254,8 @@ _thread_add_message (notmuch_thread_t *thread,
if (! thread->subject) {
const char *subject;
- subject = notmuch_message_get_header (message, "subject");
+ subject = _notmuch_message_get_header_value(message,NOTMUCH_VALUE_SUBJECT);
+ // subject = notmuch_message_get_header (message, "subject");
thread->subject = talloc_strdup (thread, subject ? subject : "");
}
@@ -273,7 +275,8 @@ _thread_set_subject_from_message (notmuch_thread_t *thread,
const char *subject;
const char *cleaned_subject;
- subject = notmuch_message_get_header (message, "subject");
+ subject = _notmuch_message_get_header_value(message,NOTMUCH_VALUE_SUBJECT);
+ // subject = notmuch_message_get_header (message, "subject");
if (! subject)
return;
--
Istvan
_______________________________________________
notmuch mailing list
[email protected]
http://notmuchmail.org/mailman/listinfo/notmuch