This obsoletes

  id:1356415076-5692-1-git-send-email-amdragon@mit.edu

In addition to incorporating all of David's suggestions, this reworks
the boolean term parsing so it only handles the subset of quoting
syntax used by make_boolean_term (which also happens to be all that we
described in the man page for the format).  The diff from v1 is below.

diff --git a/man/man1/notmuch-restore.1 b/man/man1/notmuch-restore.1
index 6bba628..78fef52 100644
--- a/man/man1/notmuch-restore.1
+++ b/man/man1/notmuch-restore.1
@@ -57,10 +57,8 @@ sup calls them).
 The
 .B batch-tag
 dump format is intended to more robust against malformed message-ids
-and tags containing whitespace or non-\fBascii\fR(7) characters.  This
-format hex-escapes all characters those outside of a small character
-set, intended to be suitable for e.g. pathnames in most UNIX-like
-systems.
+and tags containing whitespace or non-\fBascii\fR(7) characters.  See
+\fBnotmuch-dump\fR(1) for details on this format.

 .B "notmuch restore"
 updates the maildir flags according to tag changes if the
diff --git a/test/dump-restore b/test/dump-restore
index aecc393..f9ae5b3 100755
--- a/test/dump-restore
+++ b/test/dump-restore
@@ -200,6 +200,8 @@ a
 # the next non-comment line should report an an empty tag error for
 # batch tagging, but not for restore
 + +e -- id:20091117232137.GA7669@griffis1.net
+# valid id, but warning about missing message
++e id:missing_message_id
 EOF

 cat <<EOF > EXPECTED
@@ -211,6 +213,7 @@ Warning: no query string after -- [+c +d --]
 Warning: hex decoding of tag %zz failed [+%zz -- id:whatever]
 Warning: cannot parse query: id:"
 Warning: not an id query: tag:abc
+Warning: cannot apply tags to missing message: missing_message_id
 EOF

 test_expect_equal_file EXPECTED OUTPUT
diff --git a/test/random-corpus.c b/test/random-corpus.c
index d0e3e8f..8b7748e 100644
--- a/test/random-corpus.c
+++ b/test/random-corpus.c
@@ -96,9 +96,9 @@ random_utf8_string (void *ctx, size_t char_count)
            buf = talloc_realloc (ctx, buf, gchar, buf_size);
        }

-       randomchar = random_unichar ();
-       if (randomchar == '\n')
-           randomchar = 'x';
+       do {
+           randomchar = random_unichar ();
+       } while (randomchar == '\n');

        written = g_unichar_to_utf8 (randomchar, buf + offset);

diff --git a/util/string-util.c b/util/string-util.c
index eaa6c99..db01b4b 100644
--- a/util/string-util.c
+++ b/util/string-util.c
@@ -43,9 +43,11 @@ make_boolean_term (void *ctx, const char *prefix, const char *term,
     size_t needed = 3;
     int need_quoting = 0;

-    /* Do we need quoting? */
+    /* Do we need quoting?  To be paranoid, we quote anything
+     * containing a quote, even though it only matters at the
+     * beginning, and anything containing non-ASCII text. */
     for (in = term; *in && !need_quoting; in++)
-       if (*in <= ' ' || *in == ')' || *in == '"')
+       if (*in <= ' ' || *in == ')' || *in == '"' || (unsigned char)*in > 127)
            need_quoting = 1;

     if (need_quoting)
@@ -95,21 +97,6 @@ make_boolean_term (void *ctx, const char *prefix, const char *term,
     return 0;
 }

-static int
-consume_double_quote (const char **str)
-{
-    if (**str == '"') {
-       ++*str;
-       return 1;
-    } else if (strncmp(*str, "\xe2\x80\x9c", 3) == 0 || /* UTF8 0x201c */
-              strncmp(*str, "\xe2\x80\x9d", 3) == 0) { /* UTF8 0x201d */
-       *str += 3;
-       return 3;
-    } else {
-       return 0;
-    }
-}
-
 int
 parse_boolean_term (void *ctx, const char *str,
                    char **prefix_out, char **term_out)
@@ -123,28 +110,31 @@ parse_boolean_term (void *ctx, const char *str,
     *prefix_out = talloc_strndup (ctx, str, pos - str);
     ++pos;

-    /* Implement Xapian's boolean term de-quoting.  This is a nearly
-     * direct translation of QueryParser::Internal::parse_query. */
-    pos = *term_out = talloc_strdup (ctx, pos);
-    if (consume_double_quote (&pos)) {
-       char *out = talloc_strdup (ctx, pos);
-       pos = *term_out = out;
-       while (1) {
-           if (! *pos) {
-               /* Premature end of string */
-               goto FAIL;
-           } else if (*pos == '"') {
-               if (*++pos != '"')
+    /* Implement de-quoting compatible with make_boolean_term. */
+    if (*pos == '"') {
+       char *out = talloc_strdup (ctx, pos + 1);
+       int closed = 0;
+       /* Find the closing quote and un-double doubled internal
+        * quotes. */
+       for (pos = *term_out = out; *pos; ) {
+           if (*pos == '"') {
+               ++pos;
+               if (*pos != '"') {
+                   /* Found the closing quote. */
+                   closed = 1;
                    break;
-           } else if (consume_double_quote (&pos)) {
-               break;
+               }
            }
            *out++ = *pos++;
        }
-       if (*pos)
+       /* Did the term terminate without a closing quote or is there
+        * trailing text after the closing quote? */
+       if (!closed || *pos)
            goto FAIL;
        *out = '\0';
     } else {
+       *term_out = talloc_strdup (ctx, pos);
+       /* Check for text after the boolean term. */
        while (*pos > ' ' && *pos != ')')
            ++pos;
        if (*pos)
diff --git a/util/string-util.h b/util/string-util.h
index e4e4c42..aff2d65 100644
--- a/util/string-util.h
+++ b/util/string-util.h
@@ -28,9 +28,9 @@ char *strtok_len (char *s, const char *delim, size_t *len);
 int make_boolean_term (void *talloc_ctx, const char *prefix, const char *term,
                       char **buf, size_t *len);

-/* Parse a boolean term query, returning the prefix in *prefix_out and
- * the term in *term_out.  *prefix_out and *term_out will be talloc'd
- * with context ctx.
+/* Parse a boolean term query produced by make_boolean_term, returning
+ * the prefix in *prefix_out and the term in *term_out.  *prefix_out
+ * and *term_out will be talloc'd with context ctx.
  *
  * Return: 0 on success, non-zero on parse error (including trailing
  * data in str).


Reply via email to