[Bug+Patch] error in IMAP mailbox encoding

Albrecht Dreß Sat, 23 Feb 2019 06:57:03 -0800

Hi all,

Balsa has a bug when encoding IMAP mailbox names to modified UTF-7 as defined in
RFC 3501, sect. 5.1.3 [1]: the '\' (0x5c) is encoded as '\\', which is
explicitly prohibited.  To reproduce: activate main() in libbalsa/imap/util.c,
compile it, and run, e.g. (yes, the mailbox name is weird, but it includes all
special cases from the RFC…):


./util 'ϴä ab c&d+e/f\~ßx'
orig='ϴä ab c&d+e/f\~ßx' mbx='&A,QA5A- ab c&-d+e/f\\~&AN8-x' back='ϴä ab c&d+e/f\\~ßx'
WRONG CONVERSION: --------------------------------^^

The attached patch replaces the hand-coded conversion by utilising g_convert(), 
and also fixes the bug.  For testing:

./util 'ϴä ab c&d+e/f\~ßx'
orig='ϴä ab c&d+e/f\~ßx' mbx='&A,QA5A- ab c&-d+e/f\~&AN8-x' back='ϴä ab c&d+e/f\~ßx'
PROPER CONVERSION: -----------------------------^

Opinions?

Cheers,
Albrecht.

[1] <https://tools.ietf.org/html/rfc3501#section-5.1.3>

diff --git a/libbalsa/imap/util.c b/libbalsa/imap/util.c
index 6860ab3da..bb29575f9 100644
--- a/libbalsa/imap/util.c
+++ b/libbalsa/imap/util.c
@@ -94,220 +94,102 @@ imap_next_word(char *s)
 /* ===================================================================
  * UTF-7 conversion routines as in RFC 2192
  * =================================================================== */
-/* UTF7 modified base64 alphabet */
-static char base64chars[] =
-  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
-#define UNDEFINED 64
 
-/* UTF16 definitions */
-#define UTF16MASK       0x03FFUL
-#define UTF16SHIFT      10
-#define UTF16BASE       0x10000UL
-#define UTF16HIGHSTART  0xD800UL
-#define UTF16HIGHEND    0xDBFFUL
-#define UTF16LOSTART    0xDC00UL
-#define UTF16LOEND      0xDFFFUL
+/* see RFC 3501, Section 5.1.3. Mailbox International Naming Convention:
+ * In modified UTF-7, printable US-ASCII characters, except for "&", represent themselves; that is, characters with octet values
+ * 0x20-0x25 and 0x27-0x7e. */
+#define IS_VALID_ASCII(c)	((((c) >= '\x20') && ((c) <= '\x25')) || (((c) >= '\x27') && ((c) <= '\x7e')))
 
-
-/* Convert an IMAP mailbox to a UTF-8 string.
- *  dst needs to have roughly 4 times the storage space of src
- *    Hex encoding can triple the size of the input
- *    UTF-7 can be slightly denser than UTF-8
- *     (worst case: 8 octets UTF-7 becomes 9 octets UTF-8)
- */
-char*
-imap_mailbox_to_utf8(const char *mbox)
+gchar *
+imap_utf8_to_mailbox(const gchar *mbox)
 {
-  unsigned c, i, bitcount;
-  unsigned long ucs4, utf16, bitbuf;
-  unsigned char base64[256];
-  const char *src;
-  char *dst, *res  = malloc(2*strlen(mbox)+1);
-  
-  bitbuf = 0;
-  dst = res;
-  src = mbox;
-  if(!dst) return NULL;
-  /* initialize modified base64 decoding table */
-  memset(base64, UNDEFINED, sizeof (base64));
-  for (i = 0; i < sizeof (base64chars); ++i) {
-    base64[(unsigned)base64chars[i]] = i;
-  }
-  
-  /* loop until end of string */
-  while (*src != '\0') {
-    c = *src++;
-    /* deal with literal characters and &- */
-    if (c != '&' || *src == '-') {
-      /* encode literally */
-      *dst++ = c;
-      /* skip over the '-' if this is an &- sequence */
-      if (c == '&') ++src;
-    } else {
-      /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */
-      bitbuf = 0;
-      bitcount = 0;
-      ucs4 = 0;
-      while ((c = base64[(unsigned char) *src]) != UNDEFINED) {
-        ++src;
-        bitbuf = (bitbuf << 6) | c;
-        bitcount += 6;
-        /* enough bits for a UTF-16 character? */
-        if (bitcount >= 16) {
-          bitcount -= 16;
-          utf16 = (bitcount ? bitbuf >> bitcount
-                   : bitbuf) & 0xffff;
-          /* convert UTF16 to UCS4 */
-          if
-            (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND) {
-            ucs4 = (utf16 - UTF16HIGHSTART) << UTF16SHIFT;
-            continue;
-          } else if
-            (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND) {
-            ucs4 += utf16 - UTF16LOSTART + UTF16BASE;
-          } else {
-            ucs4 = utf16;
-          }
-          
-          /* convert UTF-16 range of UCS4 to UTF-8 */
-          if (ucs4 <= 0x7fUL) {
-            dst[0] = ucs4;
-            dst += 1;
-          } else if (ucs4 <= 0x7ffUL) {
-            dst[0] = 0xc0 | (ucs4 >> 6);
-            dst[1] = 0x80 | (ucs4 & 0x3f);
-            dst += 2;
-          } else if (ucs4 <= 0xffffUL) {
-            dst[0] = 0xe0 | (ucs4 >> 12);
-            dst[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
-            dst[2] = 0x80 | (ucs4 & 0x3f);
-            dst += 3;
-          } else {
-            dst[0] = 0xf0 | (ucs4 >> 18);
-            dst[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
-            dst[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
-            dst[3] = 0x80 | (ucs4 & 0x3f);
-            dst += 4;
-          }
-        }
-      }
-      /* skip over trailing '-' in modified UTF-7 encoding */
-      if (*src == '-') ++src;
-    }
-  }
-  /* terminate destination string */
-  *dst = '\0';
-  return res;
+	GString *buffer;
+	const gchar *next_in;
+
+	buffer = g_string_sized_new(strlen(mbox));	/* sufficient size for ASCII only */
+	next_in = mbox;
+	while (*next_in != '\0') {
+		if (IS_VALID_ASCII(*next_in)) {
+			g_string_append_c(buffer, *next_in++);
+		} else if (*next_in == '&') {
+			g_string_append(buffer, "&-");		/* see RFC 3501, Section 5.1.3 */
+			next_in++;
+		} else {
+			const gchar *next_ascii;
+			gchar *utf7;
+			gsize utf7len;
+
+			next_ascii = g_utf8_next_char(next_in);
+			while ((*next_ascii != '\0') && !IS_VALID_ASCII(*next_ascii)) {
+				 next_ascii = g_utf8_next_char(next_ascii);
+			}
+			utf7 = g_convert(next_in, next_ascii - next_in, "utf7", "utf8", NULL, &utf7len, NULL);
+			if (utf7 != NULL) {
+				gsize n;
+				utf7[0] = '&';					/* see RFC 3501, Section 5.1.3 */
+
+				for (n = 1U; n < utf7len; n++) {
+					if (utf7[n] == '/') {		/* see RFC 3501, Section 5.1.3 */
+						utf7[n] = ',';
+					}
+				}
+				g_string_append_len(buffer, utf7, utf7len);
+				g_free(utf7);
+			}
+			next_in = next_ascii;
+		}
+	}
+
+	return g_string_free(buffer, FALSE);
 }
 
-/* Convert hex coded UTF-8 string to modified UTF-7 IMAP mailbox
- *  dst should be about twice the length of src to deal with non-hex
- *  coded URLs
- */
-char*
-imap_utf8_to_mailbox(const char *src)
+gchar *
+imap_mailbox_to_utf8(const gchar *mbox)
 {
-  unsigned int utf8pos, utf8total, c, utf7mode, bitstogo, utf16flag;
-  unsigned long ucs4 = 0, bitbuf = 0;
-
-  /* initialize hex lookup table */
-  char *dst, *res = malloc(2*strlen(src)+1);
-  dst = res;
-  if(!dst) return NULL;
+	GString *buffer;
+	const gchar *next_in;
+
+	buffer = g_string_sized_new(strlen(mbox));		/* always sufficiently long */
+	next_in = mbox;
+	while (*next_in != '\0') {
+		if (*next_in == '&') {
+			if (next_in[1] == '-') {				/* see RFC 3501, Section 5.1.3 */
+				g_string_append_c(buffer, '&');
+				next_in = &next_in[2];
+			} else {
+				gchar *utf7buf;
+				gchar *next_utf7;
+				gchar *utf8;
+				gsize utf8len;
+
+				utf7buf = g_malloc0(strlen(next_in) + 1U);
+				utf7buf[0] = '+';					/* RFC 2152 shift character */
+				next_in++;
+				next_utf7 = &utf7buf[1];
+				for (next_utf7 = &utf7buf[1]; (*next_in != '\0') && (*next_in != '-'); next_in++) {
+					if (*next_in == ',') {			/* see RFC 3501, Section 5.1.3 */
+						*next_utf7++ = '/';
+					} else {
+						*next_utf7++ = *next_in;
+					}
+				}
+				*next_utf7 = *next_in;
+				if (*next_in == '-') {
+					next_in++;
+				}
+				utf8 = g_convert(utf7buf, -1, "utf8", "utf7", NULL, &utf8len, NULL);
+				if (utf8 != NULL) {
+					g_string_append_len(buffer, utf8, utf8len);
+					g_free(utf8);
+				}
+				g_free(utf7buf);
+			}
+		} else {
+			g_string_append_c(buffer, *next_in++);
+		}
+	}
 
-  utf7mode = 0;
-  utf8total = 0;
-  bitstogo = 0;
-  utf8pos = 0;
-  while ((c = (unsigned char)*src) != '\0') {
-    ++src;
-    /* normal character? */
-    if (c >= ' ' && c <= '~') {
-      /* switch out of UTF-7 mode */
-      if (utf7mode) {
-        if (bitstogo) {
-          *dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
-        }
-        *dst++ = '-';
-        utf7mode = 0;
-        utf8pos  = 0;
-        bitstogo = 0;
-        utf8total= 0;
-      }
-      /* encode '\' as '\\', and '"' as '\"' */
-      if (c == '\\' || c == '"') {
-        *dst++ = '\\';
-      }
-      *dst++ = c;
-      /* encode '&' as '&-' */
-      if (c == '&') {
-        *dst++ = '-';
-      }
-      continue;
-    }
-    /* switch to UTF-7 mode */
-    if (!utf7mode) {
-      *dst++ = '&';
-      utf7mode = 1;
-    }
-    /* Encode US-ASCII characters as themselves */
-    if (c < 0x80) {
-      ucs4 = c;
-      utf8total = 1;
-    } else if (utf8total) {
-      /* save UTF8 bits into UCS4 */
-      ucs4 = (ucs4 << 6) | (c & 0x3FUL);
-      if (++utf8pos < utf8total) {
-        continue;
-      }
-    } else {
-      utf8pos = 1;
-      if (c < 0xE0) {
-        utf8total = 2;
-        ucs4 = c & 0x1F;
-      } else if (c < 0xF0) {
-        utf8total = 3;
-        ucs4 = c & 0x0F;
-      } else {
-        /* NOTE: can't convert UTF8 sequences longer than 4 */
-        utf8total = 4;
-        ucs4 = c & 0x03;
-      }
-      continue;
-    }
-    /* loop to split ucs4 into two utf16 chars if necessary */
-    utf8total = 0;
-    do {
-      if (ucs4 >= UTF16BASE) {
-        ucs4 -= UTF16BASE;
-        bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT)
-                                   + UTF16HIGHSTART);
-        ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART;
-        utf16flag = 1;
-      } else {
-        bitbuf = (bitbuf << 16) | ucs4;
-        utf16flag = 0;
-      }
-      bitstogo += 16;
-      /* spew out base64 */
-      while (bitstogo >= 6) {
-        bitstogo -= 6;
-        *dst++ = base64chars[(bitstogo ? (bitbuf >> bitstogo)
-                              : bitbuf)
-                             & 0x3F];
-      }
-    } while (utf16flag);
-  }
-  /* if in UTF-7 mode, finish in ASCII */
-  if (utf7mode) {
-    if (bitstogo) {
-      *dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
-    }
-    *dst++ = '-';
-  }
-  /* tie off string */
-  *dst = '\0';
-  return res;
+	return g_string_free(buffer, FALSE);
 }
 
 #if 0
diff --git a/libbalsa/imap/util.h b/libbalsa/imap/util.h
index a01477f1d..1557e1a99 100644
--- a/libbalsa/imap/util.h
+++ b/libbalsa/imap/util.h
@@ -24,7 +24,9 @@ gchar *imap_quote_string(const gchar *src)
 char* imap_next_word(char *s);
 char* imap_skip_atom(char *s);
 
-char* imap_mailbox_to_utf8(const char *src);
-char* imap_utf8_to_mailbox(const char *src);
+gchar* imap_mailbox_to_utf8(const char *src)
+	G_GNUC_WARN_UNUSED_RESULT;
+gchar* imap_utf8_to_mailbox(const char *src)
+	G_GNUC_WARN_UNUSED_RESULT;
 
 #endif

pgpNJ6Nk6REj3.pgp
Description: PGP signature

_______________________________________________
balsa-list mailing list
balsa-list@gnome.org
https://mail.gnome.org/mailman/listinfo/balsa-list

[Bug+Patch] error in IMAP mailbox encoding

Reply via email to