Update of /cvsroot/tmda/tmda/TMDA/pythonlib/email
In directory usw-pr-cvs1:/tmp/cvs-serv10299

Modified Files:
        Charset.py Generator.py Header.py __init__.py 
Log Message:
Sync with email 2.4.3

Index: Charset.py
===================================================================
RCS file: /cvsroot/tmda/tmda/TMDA/pythonlib/email/Charset.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- Charset.py  10 Oct 2002 17:27:35 -0000      1.3
+++ Charset.py  14 Oct 2002 22:58:11 -0000      1.4
@@ -43,6 +43,8 @@
     'iso-2022-jp': (BASE64,    None,    None),
     'koi8-r':      (BASE64,    BASE64,  None),
     'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
+    # We're making this one up to represent raw unencoded 8-bit
+    '8bit':        (None,      BASE64, 'utf-8'),
     }
 
 # Aliases for other commonly-used names for character sets.  Map
@@ -53,21 +55,16 @@
     'ascii':   'us-ascii',
     }
 
-# Map charsets to their Unicode codec strings.  Note that the Japanese
-# examples included below do not (yet) come with Python!  They are available
-# from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/
-
-# The Chinese and Korean codecs are available from SourceForge:
-#
-#     http://sourceforge.net/projects/python-codecs/
+# Map charsets to their Unicode codec strings.  Note that Python doesn't come
+# with any Asian codecs by default.  Here's where to get them:
 #
-# although you'll need to check them out of cvs since they haven't been file
-# released yet.  You might also try to use
+# Japanese -- http://www.asahi-net.or.jp/~rd6t-kjym/python
+# Korean   -- http://sf.net/projects/koco
+# Chinese  -- http://sf.net/projects/python-codecs
 #
-#     http://www.freshports.org/port-description.php3?port=6702
-#
-# if you can get logged in.  AFAICT, both the Chinese and Korean codecs are
-# fairly experimental at this point.
+# Note that these codecs have their own lifecycle and may be in varying states
+# of stability and usability.
+
 CODEC_MAP = {
     'euc-jp':      'japanese.euc-jp',
     'iso-2022-jp': 'japanese.iso-2022-jp',

Index: Generator.py
===================================================================
RCS file: /cvsroot/tmda/tmda/TMDA/pythonlib/email/Generator.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Generator.py        1 Oct 2002 20:06:22 -0000       1.2
+++ Generator.py        14 Oct 2002 22:58:12 -0000      1.3
@@ -8,7 +8,7 @@
 import re
 import random
 
-from types import ListType
+from types import ListType, StringType
 from cStringIO import StringIO
 
 from email.Header import Header
@@ -35,6 +35,14 @@
 
 fcre = re.compile(r'^From ', re.MULTILINE)
 
+def _is8bitstring(s):
+    if isinstance(s, StringType):
+        try:
+            unicode(s, 'us-ascii')
+        except UnicodeError:
+            return True
+    return False
+
 
 
 class Generator:
@@ -173,6 +181,14 @@
         else:
             # No line was actually longer than maxheaderlen characters, so
             # just return the original unchanged.
+            return text
+        # If we have raw 8bit data in a byte string, we have no idea what the
+        # encoding is.  I think there is no safe way to split this string.  If
+        # it's ascii-subset, then we could do a normal ascii split, but if
+        # it's multibyte then we could break the string.  There's no way to
+        # know so the least harm seems to be to not split the string and risk
+        # it being too long.
+        if _is8bitstring(text):
             return text
         # The `text' argument already has the field name prepended, so don't
         # provide it here or the first line will get folded too short.

Index: Header.py
===================================================================
RCS file: /cvsroot/tmda/tmda/TMDA/pythonlib/email/Header.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Header.py   1 Oct 2002 20:06:22 -0000       1.2
+++ Header.py   14 Oct 2002 22:58:12 -0000      1.3
@@ -153,6 +153,8 @@
         """
         if charset is None:
             charset = USASCII
+        if not isinstance(charset, Charset):
+            charset = Charset(charset)
         self._charset = charset
         self._continuation_ws = continuation_ws
         cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
@@ -216,31 +218,52 @@
             charset = self._charset
         elif not isinstance(charset, Charset):
             charset = Charset(charset)
-        # Normalize and check the string
-        if isinstance(s, StringType):
-            # Possibly raise UnicodeError if it can't e encoded
-            unicode(s, charset.get_output_charset())
-        elif isinstance(s, UnicodeType):
-            # Convert Unicode to byte string for later concatenation
-            for charset in USASCII, charset, UTF8:
-                try:
-                    s = s.encode(charset.get_output_charset())
-                    break
-                except UnicodeError:
-                    pass
-            else:
-                assert False, 'Could not encode to utf-8'
+        # If the charset is our faux 8bit charset, leave the string unchanged
+        if charset <> '8bit':
+            # We need to test that the string can be converted to unicode and
+            # back to a byte string, given the input and output codecs of the
+            # charset.
+            if isinstance(s, StringType):
+                # Possibly raise UnicodeError if the byte string can't be
+                # converted to a unicode with the input codec of the charset.
+                incodec = charset.input_codec or 'us-ascii'
+                ustr = unicode(s, incodec)
+                # Now make sure that the unicode could be converted back to a
+                # byte string with the output codec, which may be different
+                # than the input codec.  Still, use the original byte string.
+                outcodec = charset.output_codec or 'us-ascii'
+                ustr.encode(outcodec)
+            elif isinstance(s, UnicodeType):
+                # Now we have to be sure the unicode string can be converted
+                # to a byte string with a reasonable output codec.  We want to
+                # use the byte string in the chunk.
+                for charset in USASCII, charset, UTF8:
+                    try:
+                        outcodec = charset.output_codec or 'us-ascii'
+                        s = s.encode(outcodec)
+                        break
+                    except UnicodeError:
+                        pass
+                else:
+                    assert False, 'utf-8 conversion failed'
         self._chunks.append((s, charset))
 
     def _split(self, s, charset, firstline=False):
-        # Split up a header safely for use with encode_chunks.  BAW: this
-        # appears to be a private convenience method.
+        # Split up a header safely for use with encode_chunks.
         splittable = charset.to_splittable(s)
         encoded = charset.from_splittable(splittable)
         elen = charset.encoded_header_len(encoded)
 
         if elen <= self._maxlinelen:
             return [(encoded, charset)]
+        # If we have undetermined raw 8bit characters sitting in a byte
+        # string, we really don't know what the right thing to do is.  We
+        # can't really split it because it might be multibyte data which we
+        # could break if we split it between pairs.  The least harm seems to
+        # be to not split the header at all, but that means they could go out
+        # longer than maxlinelen.
+        elif charset == '8bit':
+            return [(s, charset)]
         # BAW: I'm not sure what the right test here is.  What we're trying to
         # do is be faithful to RFC 2822's recommendation that ($2.2.3):
         #
@@ -346,27 +369,27 @@
                 rtn.append(EMPTYSTRING.join(sublines))
         return [(chunk, charset) for chunk in rtn]
 
-    def _encode_chunks(self):
-        """MIME-encode a header with many different charsets and/or encodings.
-
-        Given a list of pairs (string, charset), return a MIME-encoded string
-        suitable for use in a header field.  Each pair may have different
-        charsets and/or encodings, and the resulting header will accurately
-        reflect each setting.
-
-        Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like
-        character sets like iso-8859-1), email.Utils.BASE64 (Base64, for
-        non-ASCII like character sets like KOI8-R and iso-2022-jp), or None
-        (no encoding).
-
-        Each pair will be represented on a separate line; the resulting string
-        will be in the format:
-
-        "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
-          =?charset2?b?SvxyZ2VuIEL2aW5n?="
-        """
+    def _encode_chunks(self, newchunks):
+        # MIME-encode a header with many different charsets and/or encodings.
+        #
+        # Given a list of pairs (string, charset), return a MIME-encoded
+        # string suitable for use in a header field.  Each pair may have
+        # different charsets and/or encodings, and the resulting header will
+        # accurately reflect each setting.
+        #
+        # Each encoding can be email.Utils.QP (quoted-printable, for
+        # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
+        # (Base64, for non-ASCII like character sets like KOI8-R and
+        # iso-2022-jp), or None (no encoding).
+        #
+        # Each pair will be represented on a separate line; the resulting
+        # string will be in the format:
+        #
+        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
+        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
+        #
         chunks = []
-        for header, charset in self._chunks:
+        for header, charset in newchunks:
             if charset is None or charset.header_encoding is None:
                 # There's no encoding for this chunk's charsets
                 _max_append(chunks, header, self._maxlinelen)
@@ -397,5 +420,4 @@
         newchunks = []
         for s, charset in self._chunks:
             newchunks += self._split(s, charset, True)
-        self._chunks = newchunks
-        return self._encode_chunks()
+        return self._encode_chunks(newchunks)

Index: __init__.py
===================================================================
RCS file: /cvsroot/tmda/tmda/TMDA/pythonlib/email/__init__.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- __init__.py 10 Oct 2002 17:27:36 -0000      1.3
+++ __init__.py 14 Oct 2002 22:58:12 -0000      1.4
@@ -4,7 +4,7 @@
 """A package for parsing, handling, and generating email messages.
 """
 
-__version__ = '2.4.2'
+__version__ = '2.4.3'
 
 __all__ = [
     'base64MIME',

_______________________________________
tmda-cvs mailing list
http://tmda.net/lists/listinfo/tmda-cvs

Reply via email to