Charset encoded headers problem on tmda-pending.

Jinhyok Heo Thu, 12 Sep 2002 21:07:49 -0700

When I check the pending list, I see many charset-encoded headers, which
are unreadable as they stand. So I made a quick hack to tmda-pending
that prints the headers in readable (decoded) form.


I don't know much about Python, so I just copied some code from
EncWord.py in Mailman. I hope somebody else can make it better.

-- 
| Jinhyok Heo (novembre @ ournature.org || http://ournature.org/~novembre/)
|--------------------------------------------------------------------------
| "We are still reaching for the sky. In the developed countries people
|  are coming back down, saying, `It's empty up there.'" --- a Ladakhi monk

--- tmda-pending	Fri Sep 13 10:38:22 2002
+++ tmda-pending.n	Fri Sep 13 13:25:33 2002
@@ -149,6 +149,120 @@
 import sys
 import time
 
+import string
+import base64
+
+class DecodeError(ValueError):  # raised by Decoder for a malformed encoded-word
+    __super_init = ValueError.__init__  # keep a handle on the base initializer
+    def __init__(self, msg):
+        self.__super_init('invalid encoded-word: %s' % msg)  # prefix every message
+
+class Decoder:
+    """Decode mail header encoded-word format defined by RFC 2047"""
+    
+    offset = 0  # position just past the last parsed field; set by the _get_* helpers
+
+    def decode(self, s):
+        """Decode an encoded-word.
+
+        Returns the charset of the encoded-word, the decoded text, and the
+        position of the first character following the encoded-word.
+
+        The first position of the input string must be the first character of
+        the encoded-word.
+        """
+        if s[:2] <> '=?':
+            raise DecodeError('must start with "=?", not %s' % repr(s[:2]))
+        charset = self._get_charset(s)  # order matters: each helper advances self.offset
+        encoding = self._get_encoding(s)
+        _text = self._get_text(s)
+        # encoding must be either 'q' or 'b', ensured by _get_encoding()
+        if encoding == 'q':
+            text = self._decode_q(_text)
+        else:
+            text = self._decode_b(_text)
+        return charset, text, self.offset
+
+    # TBD: Technically the charset and encoding can't contain SPACE, CTLs, or
+    # especials; do not currently check this.
+
+    def _get_charset(self, s):
+        i = string.find(s, '?', 2)  # skip the leading '=?'
+        if i == -1:
+            raise DecodeError("can't find end of charset")
+        self.offset = i + 1  # next field starts after the '?'
+        return s[2:i]
+
+    _valid_encodings = ('q', 'b')
+
+    def _get_encoding(self, s):
+        i = string.find(s, '?', self.offset)
+        if i < 0:
+            raise DecodeError("can't find encoding")
+        enc = string.lower(s[self.offset:i])  # compared case-insensitively
+        self.offset = i + 1
+        if enc not in Decoder._valid_encodings:
+            raise DecodeError('not a valid encoding: %s' % enc)
+        return enc
+
+    def _get_text(self, s):
+        i = string.find(s, '?=', self.offset)  # '?=' terminates the encoded-word
+        if i < 0:
+            raise DecodeError("can't find end of encoded text")
+        text = s[self.offset:i]
+        self.offset = i + 2  # first character after the closing '?='
+        return text
+
+    SPACE = chr(0x20)
+
+    def _decode_q(self, s):
+        """Q encoding defined by RFC 2047"""
+        chunks = []
+        offset = 0
+        end = len(s)
+        while offset < end:
+            i = string.find(s, '=', offset)  # next '=XX' hex escape, if any
+            j = string.find(s, '_', offset)  # next '_', if any
+            if i < 0 and j < 0:
+                chunks.append(s[offset:])  # no special characters left
+                break
+            if (j < i and j >= 0) or i < 0:  # the next special character is '_'
+                chunks.append(s[offset:j])
+                chunks.append(Decoder.SPACE)  # '_' is emitted as a space
+                offset = j + 1
+            else:
+                chunks.append(s[offset:i])
+                hexdig = s[i+1:i+3]  # the two characters following '='
+                chunks.append(chr(string.atoi(hexdig, 16)))
+                offset = i + 3
+        return string.join(chunks, '')
+
+    def _decode_b(self, s):
+        """B encoding == base64 encoding defined by RFC 2045"""
+        return base64.decodestring(s)
+
+def decode(s):
+    """Decode all encoded-words in s; return (text, charset), charset None if none found"""
+    _decode = Decoder().decode
+
+    chunks = []
+    offset = 0
+    charset = None
+    while 1:
+        i = string.find(s, '=?', offset)
+        if i < 0:
+            chunks.append(s[offset:])  # no further encoded-word: keep the tail as is
+            break
+        chunks.append(s[offset:i])  # plain text before the encoded-word
+        _charset, text, offset = _decode(s[i:])  # offset here is relative to s[i:]
+        offset = offset + i  # convert back to an index into s
+        if charset is None:
+            charset = _charset
+        elif charset <> _charset:
+            raise ValueError("can't decode string with multiple charsets")
+        chunks.append(text)
+    return string.join(chunks, ''), charset
+
 try:
     import paths
 except ImportError:
@@ -396,9 +510,9 @@
                                                    msg_size,
                                                    bytes)
                 for hdr in ('date', 'from', 'to', 'subject'):
+                    text, charset = decode(headers.getheader(hdr, 'None'))
                     print "%s %s: %s" % ('  >>',
-                                         hdr.capitalize()[:4].rjust(4),
-                                         headers.getheader(hdr, 'None'))
+                                         hdr.capitalize()[:4].rjust(4), text)
                 if summary:
                     print '<mailto:%s>' % (confirm_accept_address
                                            (recipient_address, msg))

Charset encoded headers problem on tmda-pending.

Reply via email to