[Patch v4 12/12] add "notmuch reindex" subcommand

2017-07-21 Thread David Bremner
From: Daniel Kahn Gillmor 

This new subcommand takes a set of search terms, and re-indexes the
list of matching messages.
---
 Makefile.local|   1 +
 doc/conf.py   |   4 ++
 doc/index.rst |   1 +
 doc/man1/notmuch-reindex.rst  |  29 +
 doc/man1/notmuch.rst  |   4 +-
 doc/man7/notmuch-search-terms.rst |   7 +-
 notmuch-client.h  |   3 +
 notmuch-reindex.c | 134 ++
 notmuch.c |   2 +
 performance-test/M04-reindex.sh   |  11 
 performance-test/T03-reindex.sh   |  13 
 test/T670-duplicate-mid.sh|   7 ++
 test/T700-reindex.sh  |  79 ++
 13 files changed, 291 insertions(+), 4 deletions(-)
 create mode 100644 doc/man1/notmuch-reindex.rst
 create mode 100644 notmuch-reindex.c
 create mode 100755 performance-test/M04-reindex.sh
 create mode 100755 performance-test/T03-reindex.sh
 create mode 100755 test/T700-reindex.sh

diff --git a/Makefile.local b/Makefile.local
index 6bc78ef8..af12ca7f 100644
--- a/Makefile.local
+++ b/Makefile.local
@@ -225,6 +225,7 @@ notmuch_client_srcs =   \
notmuch-dump.c  \
notmuch-insert.c\
notmuch-new.c   \
+   notmuch-reindex.c   \
notmuch-reply.c \
notmuch-restore.c   \
notmuch-search.c\
diff --git a/doc/conf.py b/doc/conf.py
index a3d82696..aa864b3c 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -95,6 +95,10 @@ man_pages = [
  u'incorporate new mail into the notmuch database',
  [notmuch_authors], 1),
 
+('man1/notmuch-reindex', 'notmuch-reindex',
+ u're-index matching messages',
+ [notmuch_authors], 1),
+
 ('man1/notmuch-reply', 'notmuch-reply',
  u'constructs a reply template for a set of messages',
  [notmuch_authors], 1),
diff --git a/doc/index.rst b/doc/index.rst
index 344606d9..aa6c9f40 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -18,6 +18,7 @@ Contents:
man5/notmuch-hooks
man1/notmuch-insert
man1/notmuch-new
+   man1/notmuch-reindex
man1/notmuch-reply
man1/notmuch-restore
man1/notmuch-search
diff --git a/doc/man1/notmuch-reindex.rst b/doc/man1/notmuch-reindex.rst
new file mode 100644
index ..e39cc4ee
--- /dev/null
+++ b/doc/man1/notmuch-reindex.rst
@@ -0,0 +1,29 @@
+===
+notmuch-reindex
+===
+
+SYNOPSIS
+
+
+**notmuch** **reindex** [*option* ...] <*search-term*> ...
+
+DESCRIPTION
+===
+
+Re-index all messages matching the search terms.
+
+See **notmuch-search-terms(7)** for details of the supported syntax for
+<*search-term*\ >.
+
+The **reindex** command searches for all messages matching the
+supplied search terms, and re-creates the full-text index on these
+messages using the supplied options.
+
+SEE ALSO
+
+
+**notmuch(1)**, **notmuch-config(1)**, **notmuch-count(1)**,
+**notmuch-dump(1)**, **notmuch-hooks(5)**, **notmuch-insert(1)**,
+**notmuch-new(1)**,
+**notmuch-reply(1)**, **notmuch-restore(1)**, **notmuch-search(1)**,
+**notmuch-search-terms(7)**, **notmuch-show(1)**, **notmuch-tag(1)**
diff --git a/doc/man1/notmuch.rst b/doc/man1/notmuch.rst
index cb350d1a..40fd335b 100644
--- a/doc/man1/notmuch.rst
+++ b/doc/man1/notmuch.rst
@@ -163,8 +163,8 @@ SEE ALSO
 
 **notmuch-address(1)**, **notmuch-compact(1)**, **notmuch-config(1)**,
 **notmuch-count(1)**, **notmuch-dump(1)**, **notmuch-hooks(5)**,
-**notmuch-insert(1)**, **notmuch-new(1)**, **notmuch-reply(1)**,
-**notmuch-restore(1)**, **notmuch-search(1)**,
+**notmuch-insert(1)**, **notmuch-new(1)**, **notmuch-reindex(1)**,
+**notmuch-reply(1)**, **notmuch-restore(1)**, **notmuch-search(1)**,
 **notmuch-search-terms(7)**, **notmuch-show(1)**, **notmuch-tag(1)**
 
 The notmuch website: **https://notmuchmail.org**
diff --git a/doc/man7/notmuch-search-terms.rst 
b/doc/man7/notmuch-search-terms.rst
index 47cab48d..dd76972e 100644
--- a/doc/man7/notmuch-search-terms.rst
+++ b/doc/man7/notmuch-search-terms.rst
@@ -9,6 +9,8 @@ SYNOPSIS
 
 **notmuch** **dump** [--format=(batch-tag|sup)] [--] [--output=<*file*>] [--] 
[<*search-term*> ...]
 
+**notmuch** **reindex** [option ...] <*search-term*> ...
+
 **notmuch** **search** [option ...] <*search-term*> ...
 
 **notmuch** **show** [option ...] <*search-term*> ...
@@ -421,5 +423,6 @@ SEE ALSO
 
 **notmuch(1)**, **notmuch-config(1)**, **notmuch-count(1)**,
 **notmuch-dump(1)**, **notmuch-hooks(5)**, **notmuch-insert(1)**,
-**notmuch-new(1)**, **notmuch-reply(1)**, **notmuch-restore(1)**,
-**notmuch-search(1)**, **notmuch-show(1)**, **notmuch-tag(1)**
+**notmuch-new(1)**, **notmuch-reindex(1)**, **notmuch-reply(1)**,
+**notmuch-restore(1)**, **notmuch-search(1)**, **notmuch-show(1)**,
+**notmuch-tag(1)**
diff --git a/notmuch-client.h b/notmuch-client.h
index ae37360b..1d3c0829 100644
--- a/notmuch-client.h
+++ 

[Patch v4 08/12] lib: add notmuch_thread_get_total_files

2017-07-21 Thread David Bremner
This is relatively inexpensive in terms of run time and implementation
cost as we are already traversing the list of messages in a thread.
---
 lib/notmuch.h | 12 
 lib/thread.cc |  9 +
 2 files changed, 21 insertions(+)

diff --git a/lib/notmuch.h b/lib/notmuch.h
index f5018497..4c03a893 100644
--- a/lib/notmuch.h
+++ b/lib/notmuch.h
@@ -1097,6 +1097,18 @@ int
 notmuch_thread_get_total_messages (notmuch_thread_t *thread);
 
 /**
+ * Get the total number of files in 'thread'.
+ *
+ * This sums notmuch_message_count_files over all messages in the
+ * thread
+ * @returns Non-negative integer
+ * @since libnotmuch 5.0 (notmuch 0.25)
+ */
+
+int
+notmuch_thread_get_total_files (notmuch_thread_t *thread);
+
+/**
  * Get a notmuch_messages_t iterator for the top-level messages in
  * 'thread' in oldest-first order.
  *
diff --git a/lib/thread.cc b/lib/thread.cc
index 1a1ecfa5..e17ef63e 100644
--- a/lib/thread.cc
+++ b/lib/thread.cc
@@ -44,6 +44,7 @@ struct _notmuch_thread {
 
 GHashTable *message_hash;
 int total_messages;
+int total_files;
 int matched_messages;
 time_t oldest;
 time_t newest;
@@ -266,6 +267,7 @@ _thread_add_message (notmuch_thread_t *thread,
 _notmuch_message_list_add_message (thread->message_list,
   talloc_steal (thread, message));
 thread->total_messages++;
+thread->total_files += notmuch_message_count_files (message);
 
 g_hash_table_insert (thread->message_hash,
 xstrdup (notmuch_message_get_message_id (message)),
@@ -495,6 +497,7 @@ _notmuch_thread_create (void *ctx,
  free, NULL);
 
 thread->total_messages = 0;
+thread->total_files = 0;
 thread->matched_messages = 0;
 thread->oldest = 0;
 thread->newest = 0;
@@ -567,6 +570,12 @@ notmuch_thread_get_total_messages (notmuch_thread_t 
*thread)
 }
 
 int
+notmuch_thread_get_total_files (notmuch_thread_t *thread)
+{
+return thread->total_files;
+}
+
+int
 notmuch_thread_get_matched_messages (notmuch_thread_t *thread)
 {
 return thread->matched_messages;
-- 
2.13.2

___
notmuch mailing list
notmuch@notmuchmail.org
https://notmuchmail.org/mailman/listinfo/notmuch


[Patch v4 03/12] lib: factor out message-id parsing to separate file.

2017-07-21 Thread David Bremner
This is really pure C string parsing, and doesn't need to be mixed in
with the Xapian/C++ layer. Although not strictly necessary, it also
makes it a bit more natural to call _parse_message_id from multiple
compilation units.
---
 lib/Makefile.local|   1 +
 lib/add-message.cc| 108 +-
 lib/message-id.c  |  96 
 lib/notmuch-private.h |  14 +++
 4 files changed, 113 insertions(+), 106 deletions(-)
 create mode 100644 lib/message-id.c

diff --git a/lib/Makefile.local b/lib/Makefile.local
index 9dd68286..0b5c4b08 100644
--- a/lib/Makefile.local
+++ b/lib/Makefile.local
@@ -38,6 +38,7 @@ libnotmuch_c_srcs =   \
$(dir)/filenames.c  \
$(dir)/string-list.c\
$(dir)/message-file.c   \
+   $(dir)/message-id.c \
$(dir)/messages.c   \
$(dir)/sha1.c   \
$(dir)/built-with.c \
diff --git a/lib/add-message.cc b/lib/add-message.cc
index 0f09415e..f09094af 100644
--- a/lib/add-message.cc
+++ b/lib/add-message.cc
@@ -1,109 +1,5 @@
 #include "database-private.h"
 
-/* Advance 'str' past any whitespace or RFC 822 comments. A comment is
- * a (potentially nested) parenthesized sequence with '\' used to
- * escape any character (including parentheses).
- *
- * If the sequence to be skipped continues to the end of the string,
- * then 'str' will be left pointing at the final terminating '\0'
- * character.
- */
-static void
-skip_space_and_comments (const char **str)
-{
-const char *s;
-
-s = *str;
-while (*s && (isspace (*s) || *s == '(')) {
-   while (*s && isspace (*s))
-   s++;
-   if (*s == '(') {
-   int nesting = 1;
-   s++;
-   while (*s && nesting) {
-   if (*s == '(') {
-   nesting++;
-   } else if (*s == ')') {
-   nesting--;
-   } else if (*s == '\\') {
-   if (*(s+1))
-   s++;
-   }
-   s++;
-   }
-   }
-}
-
-*str = s;
-}
-
-/* Parse an RFC 822 message-id, discarding whitespace, any RFC 822
- * comments, and the '<' and '>' delimiters.
- *
- * If not NULL, then *next will be made to point to the first character
- * not parsed, (possibly pointing to the final '\0' terminator.
- *
- * Returns a newly talloc'ed string belonging to 'ctx'.
- *
- * Returns NULL if there is any error parsing the message-id. */
-static char *
-_parse_message_id (void *ctx, const char *message_id, const char **next)
-{
-const char *s, *end;
-char *result;
-
-if (message_id == NULL || *message_id == '\0')
-   return NULL;
-
-s = message_id;
-
-skip_space_and_comments ();
-
-/* Skip any unstructured text as well. */
-while (*s && *s != '<')
-   s++;
-
-if (*s == '<') {
-   s++;
-} else {
-   if (next)
-   *next = s;
-   return NULL;
-}
-
-skip_space_and_comments ();
-
-end = s;
-while (*end && *end != '>')
-   end++;
-if (next) {
-   if (*end)
-   *next = end + 1;
-   else
-   *next = end;
-}
-
-if (end > s && *end == '>')
-   end--;
-if (end <= s)
-   return NULL;
-
-result = talloc_strndup (ctx, s, end - s + 1);
-
-/* Finally, collapse any whitespace that is within the message-id
- * itself. */
-{
-   char *r;
-   int len;
-
-   for (r = result, len = strlen (r); *r; r++, len--)
-   if (*r == ' ' || *r == '\t')
-   memmove (r, r+1, len);
-}
-
-return result;
-}
-
 /* Parse a References header value, putting a (talloc'ed under 'ctx')
  * copy of each referenced message-id into 'hash'.
  *
@@ -126,7 +22,7 @@ parse_references (void *ctx,
return NULL;
 
 while (*refs) {
-   ref = _parse_message_id (ctx, refs, );
+   ref = _notmuch_message_id_parse (ctx, refs, );
 
if (ref && strcmp (ref, message_id)) {
g_hash_table_add (hash, ref);
@@ -619,7 +515,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
  */
 header = _notmuch_message_file_get_header (message_file, "message-id");
 if (header && *header != '\0') {
-   message_id = _parse_message_id (message_file, header, NULL);
+   message_id = _notmuch_message_id_parse (message_file, header, NULL);
 
/* So the header value isn't RFC-compliant, but it's
 * better than no message-id at all.
diff --git a/lib/message-id.c b/lib/message-id.c
new file mode 100644
index ..d7541d50
--- /dev/null
+++ b/lib/message-id.c
@@ -0,0 +1,96 @@
+#include "notmuch-private.h"
+
+/* Advance 'str' past any whitespace or RFC 822 comments. A comment is
+ * a (potentially nested) parenthesized sequence with '\' used to
+ * escape any character (including parentheses).
+ *
+ * If the sequence to be skipped continues to the end of the string,
+ * then 'str' will 

[Patch v4 04/12] lib: refactor notmuch_database_add_message header parsing

2017-07-21 Thread David Bremner
This function is large and hard to understand and modify. Start to
break it down into meaningful pieces.
---
 lib/add-message.cc| 54 +++-
 lib/message-file.c| 86 +++
 lib/notmuch-private.h | 11 +++
 3 files changed, 101 insertions(+), 50 deletions(-)

diff --git a/lib/add-message.cc b/lib/add-message.cc
index f09094af..2922eaa9 100644
--- a/lib/add-message.cc
+++ b/lib/add-message.cc
@@ -468,7 +468,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
 notmuch_private_status_t private_status;
 notmuch_bool_t is_ghost = FALSE, is_new = FALSE;
 
-const char *date, *header;
+const char *date;
 const char *from, *to, *subject;
 char *message_id = NULL;
 
@@ -489,57 +489,12 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
 if (ret)
goto DONE;
 
-/* Parse message up front to get better error status. */
-ret = _notmuch_message_file_parse (message_file);
+ret = _notmuch_message_file_get_headers (message_file,
+, , , ,
+_id);
 if (ret)
goto DONE;
 
-/* Before we do any real work, (especially before doing a
- * potential SHA-1 computation on the entire file's contents),
- * let's make sure that what we're looking at looks like an
- * actual email message.
- */
-from = _notmuch_message_file_get_header (message_file, "from");
-subject = _notmuch_message_file_get_header (message_file, "subject");
-to = _notmuch_message_file_get_header (message_file, "to");
-
-if ((from == NULL || *from == '\0') &&
-   (subject == NULL || *subject == '\0') &&
-   (to == NULL || *to == '\0')) {
-   ret = NOTMUCH_STATUS_FILE_NOT_EMAIL;
-   goto DONE;
-}
-
-/* Now that we're sure it's mail, the first order of business
- * is to find a message ID (or else create one ourselves).
- */
-header = _notmuch_message_file_get_header (message_file, "message-id");
-if (header && *header != '\0') {
-   message_id = _notmuch_message_id_parse (message_file, header, NULL);
-
-   /* So the header value isn't RFC-compliant, but it's
-* better than no message-id at all.
-*/
-   if (message_id == NULL)
-   message_id = talloc_strdup (message_file, header);
-}
-
-if (message_id == NULL ) {
-   /* No message-id at all, let's generate one by taking a
-* hash over the file's contents.
-*/
-   char *sha1 = _notmuch_sha1_of_file (filename);
-
-   /* If that failed too, something is really wrong. Give up. */
-   if (sha1 == NULL) {
-   ret = NOTMUCH_STATUS_FILE_ERROR;
-   goto DONE;
-   }
-
-   message_id = talloc_asprintf (message_file, "notmuch-sha1-%s", sha1);
-   free (sha1);
-}
-
 try {
/* Now that we have a message ID, we get a message object,
 * (which may or may not reference an existing document in the
@@ -579,7 +534,6 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
if (ret)
goto DONE;
 
-   date = _notmuch_message_file_get_header (message_file, "date");
_notmuch_message_set_header_values (message, date, from, subject);
 
ret = _notmuch_message_index_file (message, message_file);
diff --git a/lib/message-file.c b/lib/message-file.c
index d7acf0d5..b90df305 100644
--- a/lib/message-file.c
+++ b/lib/message-file.c
@@ -92,6 +92,12 @@ _notmuch_message_file_open (notmuch_database_t *notmuch,
 return _notmuch_message_file_open_ctx (notmuch, NULL, filename);
 }
 
+const char *
+_notmuch_message_file_get_filename (notmuch_message_file_t *message_file)
+{
+return message_file->filename;
+}
+
 void
 _notmuch_message_file_close (notmuch_message_file_t *message)
 {
@@ -345,3 +351,83 @@ _notmuch_message_file_get_header (notmuch_message_file_t 
*message,
 
 return decoded;
 }
+
+notmuch_status_t
+_notmuch_message_file_get_headers (notmuch_message_file_t *message_file,
+  const char **from_out,
+  const char **subject_out,
+  const char **to_out,
+  const char **date_out,
+  char **message_id_out)
+{
+notmuch_status_t ret;
+const char *header;
+const char *from, *to, *subject, *date;
+char *message_id = NULL;
+
+/* Parse message up front to get better error status. */
+ret = _notmuch_message_file_parse (message_file);
+if (ret)
+   goto DONE;
+
+/* Before we do any real work, (especially before doing a
+ * potential SHA-1 computation on the entire file's contents),
+ * let's make sure that what we're looking at looks like an
+ * actual email message.
+ */
+from = _notmuch_message_file_get_header (message_file, "from");

[Patch v4 07/12] lib: add notmuch_message_count_files

2017-07-21 Thread David Bremner
This operation is relatively inexpensive, as the needed metadata is
already computed by our lazy metadata fetching. The goal is to support
better UI for messages with multipile files.
---
 lib/message.cc| 8 
 lib/notmuch-private.h | 6 ++
 lib/notmuch.h | 8 
 lib/string-list.c | 6 ++
 4 files changed, 28 insertions(+)

diff --git a/lib/message.cc b/lib/message.cc
index f78e5a9d..68c02001 100644
--- a/lib/message.cc
+++ b/lib/message.cc
@@ -946,6 +946,14 @@ notmuch_message_get_filenames (notmuch_message_t *message)
 return _notmuch_filenames_create (message, message->filename_list);
 }
 
+int
+notmuch_message_count_files (notmuch_message_t *message)
+{
+_notmuch_message_ensure_filename_list (message);
+
+return _notmuch_string_list_length (message->filename_list);
+}
+
 notmuch_bool_t
 notmuch_message_get_flag (notmuch_message_t *message,
  notmuch_message_flag_t flag)
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 57c61639..544788bc 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -550,6 +550,12 @@ typedef struct _notmuch_string_list {
 notmuch_string_list_t *
 _notmuch_string_list_create (const void *ctx);
 
+/*
+ * return the number of strings in 'list'
+ */
+int
+_notmuch_string_list_length (notmuch_string_list_t *list);
+
 /* Add 'string' to 'list'.
  *
  * The list will create its own talloced copy of 'string'.
diff --git a/lib/notmuch.h b/lib/notmuch.h
index 17f0872e..f5018497 100644
--- a/lib/notmuch.h
+++ b/lib/notmuch.h
@@ -1342,6 +1342,14 @@ notmuch_messages_t *
 notmuch_message_get_replies (notmuch_message_t *message);
 
 /**
+ * Get the total number of files associated with a message.
+ * @returns Non-negative integer
+ * @since libnotmuch 5.0 (notmuch 0.25)
+ */
+int
+notmuch_message_count_files (notmuch_message_t *message);
+
+/**
  * Get a filename for the email corresponding to 'message'.
  *
  * The returned filename is an absolute filename, (the initial
diff --git a/lib/string-list.c b/lib/string-list.c
index 43ebe499..9c3ae7ef 100644
--- a/lib/string-list.c
+++ b/lib/string-list.c
@@ -42,6 +42,12 @@ _notmuch_string_list_create (const void *ctx)
 return list;
 }
 
+int
+_notmuch_string_list_length (notmuch_string_list_t *list)
+{
+return list->length;
+}
+
 void
 _notmuch_string_list_append (notmuch_string_list_t *list,
 const char *string)
-- 
2.13.2

___
notmuch mailing list
notmuch@notmuchmail.org
https://notmuchmail.org/mailman/listinfo/notmuch


[Patch v4 06/12] lib: index message files with duplicate message-ids

2017-07-21 Thread David Bremner
The corresponding xapian document just gets more terms added to it,
but this doesn't seem to break anything. Values on the other hand get
overwritten, which is a bit annoying, but arguably it is not worse to
take the values (from, subject, date) from the last file indexed
rather than the first.
---
 lib/add-message.cc | 19 +++
 test/T160-json.sh  |  4 ++--
 test/T670-duplicate-mid.sh |  9 +++--
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/lib/add-message.cc b/lib/add-message.cc
index 2922eaa9..f0a80c4f 100644
--- a/lib/add-message.cc
+++ b/lib/add-message.cc
@@ -529,19 +529,22 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
if (is_ghost)
/* Convert ghost message to a regular message */
_notmuch_message_remove_term (message, "type", "ghost");
-   ret = _notmuch_database_link_message (notmuch, message,
+   }
+
+   ret = _notmuch_database_link_message (notmuch, message,
  message_file, is_ghost);
-   if (ret)
-   goto DONE;
+   if (ret)
+   goto DONE;
 
+   if (is_new || is_ghost)
_notmuch_message_set_header_values (message, date, from, subject);
 
-   ret = _notmuch_message_index_file (message, message_file);
-   if (ret)
-   goto DONE;
-   } else {
+   ret = _notmuch_message_index_file (message, message_file);
+   if (ret)
+   goto DONE;
+
+   if (! is_new && !is_ghost)
ret = NOTMUCH_STATUS_DUPLICATE_MESSAGE_ID;
-   }
 
_notmuch_message_sync (message);
 } catch (const Xapian::Error ) {
diff --git a/test/T160-json.sh b/test/T160-json.sh
index ac51895e..07955a2b 100755
--- a/test/T160-json.sh
+++ b/test/T160-json.sh
@@ -71,8 +71,8 @@ test_begin_subtest "Format version: too high"
 test_expect_code 21 "notmuch search --format-version=999 \\*"
 
 test_begin_subtest "Show message: multiple filenames"
-add_message "[id]=message...@example.com [filename]=copy1"
-add_message "[id]=message...@example.com [filename]=copy2"
+add_message '[id]=message...@example.com [filename]=copy1 [date]="Fri, 05 Jan 
2001 15:43:52 +"'
+add_message '[id]=message...@example.com [filename]=copy2 [date]="Fri, 05 Jan 
2001 15:43:52 +"'
 cat < EXPECTED
 [
 [
diff --git a/test/T670-duplicate-mid.sh b/test/T670-duplicate-mid.sh
index ced28a21..137cb6a5 100755
--- a/test/T670-duplicate-mid.sh
+++ b/test/T670-duplicate-mid.sh
@@ -5,8 +5,14 @@ test_description="duplicate message ids"
 add_message '[id]="duplicate"' '[subject]="message 1" [filename]=copy1'
 add_message '[id]="duplicate"' '[subject]="message 2" [filename]=copy2'
 
+test_begin_subtest 'First subject preserved'
+cat < EXPECTED
+thread:XXX   2001-01-05 [1/1] Notmuch Test Suite; message 1 (inbox unread)
+EOF
+notmuch search id:duplicate | notmuch_search_sanitize > OUTPUT
+test_expect_equal_file EXPECTED OUTPUT
+
 test_begin_subtest 'Search for second subject'
-test_subtest_known_broken
 cat 

[Patch v4 10/12] lib: add _notmuch_message_remove_indexed_terms

2017-07-21 Thread David Bremner
Testing will be provided via use in notmuch_message_reindex
---
 lib/message.cc| 54 +++
 lib/notmuch-private.h |  2 ++
 2 files changed, 56 insertions(+)

diff --git a/lib/message.cc b/lib/message.cc
index 68c02001..a1c3cd78 100644
--- a/lib/message.cc
+++ b/lib/message.cc
@@ -599,6 +599,59 @@ _notmuch_message_remove_terms (notmuch_message_t *message, 
const char *prefix)
 }
 }
 
+
+/* Remove all terms generated by indexing, i.e. not tags or
+ * properties, along with any automatic tags*/
+notmuch_private_status_t
+_notmuch_message_remove_indexed_terms (notmuch_message_t *message)
+{
+Xapian::TermIterator i;
+
+const std::string
+   id_prefix = _find_prefix ("id"),
+   property_prefix = _find_prefix ("property"),
+   tag_prefix = _find_prefix ("tag"),
+   type_prefix = _find_prefix ("type");
+
+for (i = message->doc.termlist_begin ();
+i != message->doc.termlist_end (); i++) {
+
+   const std::string term = *i;
+
+   if (term.compare (0, type_prefix.size (), type_prefix) == 0)
+   continue;
+
+   if (term.compare (0, id_prefix.size (), id_prefix) == 0)
+   continue;
+
+   if (term.compare (0, property_prefix.size (), property_prefix) == 0)
+   continue;
+
+   if (term.compare (0, tag_prefix.size (), tag_prefix) == 0 &&
+   term.compare (1, strlen("encrypted"), "encrypted") != 0 &&
+   term.compare (1, strlen("signed"), "signed") != 0 &&
+   term.compare (1, strlen("attachment"), "attachment") != 0)
+   continue;
+
+   try {
+   message->doc.remove_term ((*i));
+   message->modified = TRUE;
+   } catch (const Xapian::InvalidArgumentError) {
+   /* Ignore failure to remove non-existent term. */
+   } catch (const Xapian::Error ) {
+   notmuch_database_t *notmuch = message->notmuch;
+
+   if (!notmuch->exception_reported) {
+   _notmuch_database_log(_notmuch_message_database (message), "A 
Xapian exception occurred creating message: %s\n",
+ error.get_msg().c_str());
+   notmuch->exception_reported = TRUE;
+   }
+   return NOTMUCH_PRIVATE_STATUS_XAPIAN_EXCEPTION;
+   }
+}
+return NOTMUCH_PRIVATE_STATUS_SUCCESS;
+}
+
 /* Return true if p points at "new" or "cur". */
 static bool is_maildir (const char *p)
 {
@@ -646,6 +699,7 @@ _notmuch_message_add_folder_terms (notmuch_message_t 
*message,
 
 talloc_free (folder);
 
+message->modified = TRUE;
 return NOTMUCH_STATUS_SUCCESS;
 }
 
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 544788bc..c8934b8b 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -526,6 +526,8 @@ _notmuch_message_add_reply (notmuch_message_t *message,
 notmuch_database_t *
 _notmuch_message_database (notmuch_message_t *message);
 
+void
+_notmuch_message_remove_unprefixed_terms (notmuch_message_t *message);
 /* sha1.c */
 
 char *
-- 
2.13.2

___
notmuch mailing list
notmuch@notmuchmail.org
https://notmuchmail.org/mailman/listinfo/notmuch


[Patch v4 02/12] lib/n_d_add_message: refactor test for new/ghost messages

2017-07-21 Thread David Bremner
The switch is easier to understand than the side effects in the if
test. It also potentially allows us more flexibility in breaking up
this function into smaller pieces, since passing private_status around
is icky.
---
 lib/add-message.cc | 23 +--
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/lib/add-message.cc b/lib/add-message.cc
index 5fe2c45b..0f09415e 100644
--- a/lib/add-message.cc
+++ b/lib/add-message.cc
@@ -570,7 +570,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
 notmuch_message_t *message = NULL;
 notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS, ret2;
 notmuch_private_status_t private_status;
-notmuch_bool_t is_ghost = false;
+notmuch_bool_t is_ghost = FALSE, is_new = FALSE;
 
 const char *date, *header;
 const char *from, *to, *subject;
@@ -655,7 +655,17 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
 
talloc_free (message_id);
 
-   if (message == NULL) {
+   /* We cannot call notmuch_message_get_flag for a new message */
+   switch (private_status) {
+   case NOTMUCH_PRIVATE_STATUS_NO_DOCUMENT_FOUND:
+   is_ghost = FALSE;
+   is_new = TRUE;
+   break;
+   case NOTMUCH_PRIVATE_STATUS_SUCCESS:
+   is_ghost = notmuch_message_get_flag (message, 
NOTMUCH_MESSAGE_FLAG_GHOST);
+   is_new = FALSE;
+   break;
+   default:
ret = COERCE_STATUS (private_status,
 "Unexpected status value from 
_notmuch_message_create_for_message_id");
goto DONE;
@@ -663,18 +673,11 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
 
_notmuch_message_add_filename (message, filename);
 
-   /* Is this a newly created message object or a ghost
-* message?  We have to be slightly careful: if this is a
-* blank message, it's not safe to call
-* notmuch_message_get_flag yet. */
-   if (private_status == NOTMUCH_PRIVATE_STATUS_NO_DOCUMENT_FOUND ||
-   (is_ghost = notmuch_message_get_flag (
-   message, NOTMUCH_MESSAGE_FLAG_GHOST))) {
+   if (is_new || is_ghost) {
_notmuch_message_add_term (message, "type", "mail");
if (is_ghost)
/* Convert ghost message to a regular message */
_notmuch_message_remove_term (message, "type", "ghost");
-
ret = _notmuch_database_link_message (notmuch, message,
  message_file, is_ghost);
if (ret)
-- 
2.13.2

___
notmuch mailing list
notmuch@notmuchmail.org
https://notmuchmail.org/mailman/listinfo/notmuch


[Patch v4 01/12] lib: isolate n_d_add_message and helper functions into own file

2017-07-21 Thread David Bremner
'database.cc' is becoming a monster, and it's hard to follow what the
various static functions are used for. It turns out that about 1/3 of
this file notmuch_database_add_message and helper functions not used
by any other function. This commit isolates this code into it's own
file.

Some side effects of this refactoring:

- find_doc_ids becomes the non-static (but still private)
  _notmuch_database_find_doc_ids
- a few instances of 'string' have 'std::' prepended, avoiding the
  need for 'using namespace std;' in the new file.
---
 lib/Makefile.local |   1 +
 lib/add-message.cc | 721 
 lib/database-private.h |   6 +
 lib/database.cc| 732 +
 4 files changed, 735 insertions(+), 725 deletions(-)
 create mode 100644 lib/add-message.cc

diff --git a/lib/Makefile.local b/lib/Makefile.local
index bf6e0649..9dd68286 100644
--- a/lib/Makefile.local
+++ b/lib/Makefile.local
@@ -50,6 +50,7 @@ libnotmuch_cxx_srcs = \
$(dir)/directory.cc \
$(dir)/index.cc \
$(dir)/message.cc   \
+   $(dir)/add-message.cc   \
$(dir)/message-property.cc \
$(dir)/query.cc \
$(dir)/query-fp.cc  \
diff --git a/lib/add-message.cc b/lib/add-message.cc
new file mode 100644
index ..5fe2c45b
--- /dev/null
+++ b/lib/add-message.cc
@@ -0,0 +1,721 @@
+#include "database-private.h"
+
+/* Advance 'str' past any whitespace or RFC 822 comments. A comment is
+ * a (potentially nested) parenthesized sequence with '\' used to
+ * escape any character (including parentheses).
+ *
+ * If the sequence to be skipped continues to the end of the string,
+ * then 'str' will be left pointing at the final terminating '\0'
+ * character.
+ */
+static void
+skip_space_and_comments (const char **str)
+{
+const char *s;
+
+s = *str;
+while (*s && (isspace (*s) || *s == '(')) {
+   while (*s && isspace (*s))
+   s++;
+   if (*s == '(') {
+   int nesting = 1;
+   s++;
+   while (*s && nesting) {
+   if (*s == '(') {
+   nesting++;
+   } else if (*s == ')') {
+   nesting--;
+   } else if (*s == '\\') {
+   if (*(s+1))
+   s++;
+   }
+   s++;
+   }
+   }
+}
+
+*str = s;
+}
+
+/* Parse an RFC 822 message-id, discarding whitespace, any RFC 822
+ * comments, and the '<' and '>' delimiters.
+ *
+ * If not NULL, then *next will be made to point to the first character
+ * not parsed, (possibly pointing to the final '\0' terminator.
+ *
+ * Returns a newly talloc'ed string belonging to 'ctx'.
+ *
+ * Returns NULL if there is any error parsing the message-id. */
+static char *
+_parse_message_id (void *ctx, const char *message_id, const char **next)
+{
+const char *s, *end;
+char *result;
+
+if (message_id == NULL || *message_id == '\0')
+   return NULL;
+
+s = message_id;
+
+skip_space_and_comments ();
+
+/* Skip any unstructured text as well. */
+while (*s && *s != '<')
+   s++;
+
+if (*s == '<') {
+   s++;
+} else {
+   if (next)
+   *next = s;
+   return NULL;
+}
+
+skip_space_and_comments ();
+
+end = s;
+while (*end && *end != '>')
+   end++;
+if (next) {
+   if (*end)
+   *next = end + 1;
+   else
+   *next = end;
+}
+
+if (end > s && *end == '>')
+   end--;
+if (end <= s)
+   return NULL;
+
+result = talloc_strndup (ctx, s, end - s + 1);
+
+/* Finally, collapse any whitespace that is within the message-id
+ * itself. */
+{
+   char *r;
+   int len;
+
+   for (r = result, len = strlen (r); *r; r++, len--)
+   if (*r == ' ' || *r == '\t')
+   memmove (r, r+1, len);
+}
+
+return result;
+}
+
+/* Parse a References header value, putting a (talloc'ed under 'ctx')
+ * copy of each referenced message-id into 'hash'.
+ *
+ * We explicitly avoid including any reference identical to
+ * 'message_id' in the result (to avoid mass confusion when a single
+ * message references itself cyclically---and yes, mail messages are
+ * not infrequent in the wild that do this---don't ask me why).
+ *
+ * Return the last reference parsed, if it is not equal to message_id.
+ */
+static char *
+parse_references (void *ctx,
+ const char *message_id,
+ GHashTable *hash,
+ const char *refs)
+{
+char *ref, *last_ref = NULL;
+
+if (refs == NULL || *refs == '\0')
+   return NULL;
+
+while (*refs) {
+   ref = _parse_message_id (ctx, refs, );
+
+   if (ref && strcmp (ref, message_id)) {
+   g_hash_table_add (hash, ref);
+   last_ref = ref;
+   }
+}
+
+/* The return value of this function is used to add a parent
+ * 

[Patch v4 05/12] test: add known broken tests for duplicate message id

2017-07-21 Thread David Bremner
There are many other problems that could be tested, but these ones we
have some hope of fixing because it doesn't require UI changes, just
indexing changes.
---
 test/T670-duplicate-mid.sh | 28 
 1 file changed, 28 insertions(+)
 create mode 100755 test/T670-duplicate-mid.sh

diff --git a/test/T670-duplicate-mid.sh b/test/T670-duplicate-mid.sh
new file mode 100755
index ..ced28a21
--- /dev/null
+++ b/test/T670-duplicate-mid.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+test_description="duplicate message ids"
+. ./test-lib.sh || exit 1
+
+add_message '[id]="duplicate"' '[subject]="message 1" [filename]=copy1'
+add_message '[id]="duplicate"' '[subject]="message 2" [filename]=copy2'
+
+test_begin_subtest 'Search for second subject'
+test_subtest_known_broken
+cat  
OUTPUT
+test_expect_equal_file EXPECTED OUTPUT
+
+add_message '[id]="duplicate"' '[body]="sekrit" [filename]=copy3'
+test_begin_subtest 'search for body in duplicate file'
+test_subtest_known_broken
+cat  OUTPUT
+test_expect_equal_file EXPECTED OUTPUT
+
+test_done
-- 
2.13.2

___
notmuch mailing list
notmuch@notmuchmail.org
https://notmuchmail.org/mailman/listinfo/notmuch


v4 of index multiple files per msg-id, add reindex command

2017-07-21 Thread David Bremner
This is mainly a rebase of

 id:20170604123235.24466-1-da...@tethera.net

against current master (roughly 0.25~rc1)

There is a bit of movement between patches to make sure tests pass
after every patch, but the resulting interdiff is basically trivial

diff --git a/lib/database-private.h b/lib/database-private.h
index 504618b9..554b 100644
--- a/lib/database-private.h
+++ b/lib/database-private.h
@@ -252,8 +252,4 @@ _notmuch_database_find_doc_ids (notmuch_database_t *notmuch,
const char *value,
Xapian::PostingIterator *begin,
Xapian::PostingIterator *end);
-
-
-#pragma GCC visibility pop
-

AFAIK, that has to do with Jani's changes to visibility in the library.

Unless I hear otherwise, I'll probably merge this series to master
shortly after the 0.25 release.
___
notmuch mailing list
notmuch@notmuchmail.org
https://notmuchmail.org/mailman/listinfo/notmuch


[Patch v4 09/12] cli/search: print total number of files matched in summary output.

2017-07-21 Thread David Bremner
The structured output formats already have all of the filenames. This
is an easy bit of UI change to make the multiple files visible.
---
 doc/man1/notmuch-search.rst  | 17 -
 notmuch-search.c | 15 +--
 test/T080-search.sh  |  2 +-
 test/T100-search-by-folder.sh|  4 ++--
 test/T340-maildir-sync.sh|  4 ++--
 test/T370-search-folder-coherence.sh |  2 +-
 test/T500-search-date.sh |  2 +-
 test/T650-regexp-query.sh|  4 ++--
 test/T670-duplicate-mid.sh   |  2 +-
 9 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/doc/man1/notmuch-search.rst b/doc/man1/notmuch-search.rst
index 4e660a6f..fd6dcadd 100644
--- a/doc/man1/notmuch-search.rst
+++ b/doc/man1/notmuch-search.rst
@@ -42,7 +42,9 @@ Supported options for **search** include
 the search terms. The summary includes the thread ID, date,
 the number of messages in the thread (both the number
 matched and the total number), the authors of the thread and
-the subject.
+   the subject. In the case where a thread contains multiple files for
+   some messages, the total number of files is printed in parentheses
+   (see below for an example).
 
 **threads**
 Output the thread IDs of all threads with any message
@@ -135,6 +137,19 @@ Supported options for **search** include
 prefix. The prefix matches messages based on filenames. This
 option filters filenames of the matching messages.
 
+EXAMPLE
+===
+
+The following shows an example of the summary output format, with one
+message having multiple filenames.
+
+::
+
+  % notmuch search date:today.. and tag:bad-news
+  thread:00063c10 Today [1/1] Some Persun; To the bone (inbox unread)
+  thread:00063c25 Today [1/1(2)] Ann Other; Bears (inbox unread)
+  thread:00063c00 Today [1/1] A Thurd; Bites, stings, sad feelings 
(inbox unread)
+
 EXIT STATUS
 ===
 
diff --git a/notmuch-search.c b/notmuch-search.c
index 019e14ee..380e9d8f 100644
--- a/notmuch-search.c
+++ b/notmuch-search.c
@@ -160,6 +160,7 @@ do_search_threads (search_context_t *ctx)
const char *subject = notmuch_thread_get_subject (thread);
const char *thread_id = notmuch_thread_get_thread_id (thread);
int matched = notmuch_thread_get_matched_messages (thread);
+   int files = notmuch_thread_get_total_files (thread);
int total = notmuch_thread_get_total_messages (thread);
const char *relative_date = NULL;
notmuch_bool_t first_tag = TRUE;
@@ -175,13 +176,23 @@ do_search_threads (search_context_t *ctx)
 
if (format->is_text_printer) {
 /* Special case for the text formatter */
-   printf ("thread:%s %12s [%d/%d] %s; %s (",
+   printf ("thread:%s %12s ",
thread_id,
-   relative_date,
+   relative_date);
+   if (total == files)
+   printf ("[%d/%d] %s; %s (",
matched,
total,
sanitize_string (ctx_quote, authors),
sanitize_string (ctx_quote, subject));
+   else
+   printf ("[%d/%d(%d)] %s; %s (",
+   matched,
+   total,
+   files,
+   sanitize_string (ctx_quote, authors),
+   sanitize_string (ctx_quote, subject));
+
} else { /* Structured Output */
format->map_key (format, "thread");
format->string (format, thread_id);
diff --git a/test/T080-search.sh b/test/T080-search.sh
index d2d71ca9..3bb3dced 100755
--- a/test/T080-search.sh
+++ b/test/T080-search.sh
@@ -111,7 +111,7 @@ thread:XXX   2009-11-18 [3/3] Adrian Perez de Castro, Keith 
Packard, Carl Worth;
 thread:XXX   2009-11-18 [3/3] Israel Herraiz, Keith Packard, Carl Worth; 
[notmuch] New to the list (inbox unread)
 thread:XXX   2009-11-18 [3/3] Jan Janak, Carl Worth; [notmuch] What a great 
idea! (inbox unread)
 thread:XXX   2009-11-18 [2/2] Jan Janak, Carl Worth; [notmuch] [PATCH] Older 
versions of install do not support -C. (inbox unread)
-thread:XXX   2009-11-18 [3/3] Aron Griffis, Keith Packard, Carl Worth; 
[notmuch] archive (inbox unread)
+thread:XXX   2009-11-18 [3/3(4)] Aron Griffis, Keith Packard, Carl Worth; 
[notmuch] archive (inbox unread)
 thread:XXX   2009-11-18 [2/2] Keith Packard, Carl Worth; [notmuch] [PATCH] 
Make notmuch-show 'X' (and 'x') commands remove inbox (and unread) tags (inbox 
unread)
 thread:XXX   2009-11-18 [7/7] Lars Kellogg-Stedman, Mikhail Gusarov, Keith 
Packard, Carl Worth; [notmuch] Working with Maildir storage? (inbox signed 
unread)
 thread:XXX   2009-11-18 [5/5] Mikhail Gusarov, Carl Worth, Keith Packard; 

[Patch v4 11/12] lib: add notmuch_message_reindex

2017-07-21 Thread David Bremner
From: Daniel Kahn Gillmor 

This new function asks the database to reindex a given message.
The parameter `indexopts` is currently ignored, but is intended to
provide an extensible API to support e.g. changing the encryption or
filtering status (e.g. whether and how certain non-plaintext parts are
indexed).
---
 lib/add-message.cc|   2 +-
 lib/message.cc| 108 +-
 lib/notmuch-private.h |   6 +++
 lib/notmuch.h |  15 +++
 4 files changed, 129 insertions(+), 2 deletions(-)

diff --git a/lib/add-message.cc b/lib/add-message.cc
index f0a80c4f..711ed9fa 100644
--- a/lib/add-message.cc
+++ b/lib/add-message.cc
@@ -220,7 +220,7 @@ _my_talloc_free_for_g_hash (void *ptr)
 talloc_free (ptr);
 }
 
-static notmuch_status_t
+notmuch_status_t
 _notmuch_database_link_message_to_parents (notmuch_database_t *notmuch,
   notmuch_message_t *message,
   notmuch_message_file_t *message_file,
diff --git a/lib/message.cc b/lib/message.cc
index a1c3cd78..539d3320 100644
--- a/lib/message.cc
+++ b/lib/message.cc
@@ -579,7 +579,9 @@ void
 _notmuch_message_remove_terms (notmuch_message_t *message, const char *prefix)
 {
 Xapian::TermIterator i;
-size_t prefix_len = strlen (prefix);
+size_t prefix_len = 0;
+
+prefix_len = strlen (prefix);
 
 while (1) {
i = message->doc.termlist_begin ();
@@ -1934,3 +1936,107 @@ _notmuch_message_frozen (notmuch_message_t *message)
 {
 return message->frozen;
 }
+
+notmuch_status_t
+notmuch_message_reindex (notmuch_message_t *message,
+notmuch_param_t unused (*indexopts))
+{
+notmuch_database_t *notmuch = NULL;
+notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS;
+notmuch_private_status_t private_status;
+notmuch_filenames_t *orig_filenames = NULL;
+const char *orig_thread_id = NULL;
+notmuch_message_file_t *message_file = NULL;
+
+int found = 0;
+
+if (message == NULL)
+   return NOTMUCH_STATUS_NULL_POINTER;
+
+/* Save in case we need to delete message */
+orig_thread_id = notmuch_message_get_thread_id (message);
+if (!orig_thread_id) {
+   /* XXX TODO: make up new error return? */
+   INTERNAL_ERROR ("message without thread-id");
+}
+
+/* strdup it because the metadata may be invalidated */
+orig_thread_id = talloc_strdup (message, orig_thread_id);
+
+notmuch = _notmuch_message_database (message);
+
+ret = _notmuch_database_ensure_writable (notmuch);
+if (ret)
+   return ret;
+
+orig_filenames = notmuch_message_get_filenames (message);
+
+private_status = _notmuch_message_remove_indexed_terms (message);
+if (private_status) {
+   ret = COERCE_STATUS(private_status, "error removing terms");
+   goto DONE;
+}
+
+/* re-add the filenames with the associated indexopts */
+for (; notmuch_filenames_valid (orig_filenames);
+notmuch_filenames_move_to_next (orig_filenames)) {
+
+   const char *date;
+   const char *from, *to, *subject;
+   char *message_id = NULL;
+   const char *thread_id = NULL;
+
+   const char *filename = notmuch_filenames_get (orig_filenames);
+
+   message_file = _notmuch_message_file_open (notmuch, filename);
+   if (message_file == NULL)
+   continue;
+
+   ret = _notmuch_message_file_get_headers (message_file,
+, , , ,
+_id);
+   if (ret)
+   goto DONE;
+
+   /* XXX TODO: deal with changing message id? */
+
+   _notmuch_message_add_filename (message, filename);
+
+   ret = _notmuch_database_link_message_to_parents (notmuch, message,
+message_file,
+_id);
+   if (ret)
+   goto DONE;
+
+   if (thread_id == NULL)
+   thread_id = orig_thread_id;
+
+   _notmuch_message_add_term (message, "thread", thread_id);
+   _notmuch_message_set_header_values (message, date, from, subject);
+
+   ret = _notmuch_message_index_file (message, message_file);
+
+   if (ret == NOTMUCH_STATUS_FILE_ERROR)
+   continue;
+   if (ret)
+   goto DONE;
+
+   found++;
+   _notmuch_message_file_close (message_file);
+   message_file = NULL;
+}
+if (found == 0) {
+   /* put back thread id to help cleanup */
+   _notmuch_message_add_term (message, "thread", orig_thread_id);
+   ret = _notmuch_message_delete (message);
+} else {
+   _notmuch_message_sync (message);
+}
+
+ DONE:
+if (message_file)
+   _notmuch_message_file_close (message_file);
+
+/* XXX TODO destroy orig_filenames? */
+return ret;
+}
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index c8934b8b..b187a80f