Update of /cvsroot/tmda/tmda/TMDA/pythonlib/email
In directory usw-pr-cvs1:/tmp/cvs-serv10299
Modified Files:
Charset.py Generator.py Header.py __init__.py
Log Message:
Sync with email 2.4.3
Index: Charset.py
===================================================================
RCS file: /cvsroot/tmda/tmda/TMDA/pythonlib/email/Charset.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- Charset.py 10 Oct 2002 17:27:35 -0000 1.3
+++ Charset.py 14 Oct 2002 22:58:11 -0000 1.4
@@ -43,6 +43,8 @@
'iso-2022-jp': (BASE64, None, None),
'koi8-r': (BASE64, BASE64, None),
'utf-8': (SHORTEST, BASE64, 'utf-8'),
+ # We're making this one up to represent raw unencoded 8-bit
+ '8bit': (None, BASE64, 'utf-8'),
}
# Aliases for other commonly-used names for character sets. Map
@@ -53,21 +55,16 @@
'ascii': 'us-ascii',
}
-# Map charsets to their Unicode codec strings. Note that the Japanese
-# examples included below do not (yet) come with Python! They are available
-# from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/
-
-# The Chinese and Korean codecs are available from SourceForge:
-#
-# http://sourceforge.net/projects/python-codecs/
+# Map charsets to their Unicode codec strings. Note that Python doesn't come
+# with any Asian codecs by default. Here's where to get them:
#
-# although you'll need to check them out of cvs since they haven't been file
-# released yet. You might also try to use
+# Japanese -- http://www.asahi-net.or.jp/~rd6t-kjym/python
+# Korean -- http://sf.net/projects/koco
+# Chinese -- http://sf.net/projects/python-codecs
#
-# http://www.freshports.org/port-description.php3?port=6702
-#
-# if you can get logged in. AFAICT, both the Chinese and Korean codecs are
-# fairly experimental at this point.
+# Note that these codecs have their own lifecycle and may be in varying states
+# of stability and usability.
+
CODEC_MAP = {
'euc-jp': 'japanese.euc-jp',
'iso-2022-jp': 'japanese.iso-2022-jp',
Index: Generator.py
===================================================================
RCS file: /cvsroot/tmda/tmda/TMDA/pythonlib/email/Generator.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Generator.py 1 Oct 2002 20:06:22 -0000 1.2
+++ Generator.py 14 Oct 2002 22:58:12 -0000 1.3
@@ -8,7 +8,7 @@
import re
import random
-from types import ListType
+from types import ListType, StringType
from cStringIO import StringIO
from email.Header import Header
@@ -35,6 +35,14 @@
fcre = re.compile(r'^From ', re.MULTILINE)
+def _is8bitstring(s):
+ if isinstance(s, StringType):
+ try:
+ unicode(s, 'us-ascii')
+ except UnicodeError:
+ return True
+ return False
+
class Generator:
@@ -173,6 +181,14 @@
else:
# No line was actually longer than maxheaderlen characters, so
# just return the original unchanged.
+ return text
+ # If we have raw 8bit data in a byte string, we have no idea what the
+ # encoding is. I think there is no safe way to split this string. If
+ # it's ascii-subset, then we could do a normal ascii split, but if
+ # it's multibyte then we could break the string. There's no way to
+ # know so the least harm seems to be to not split the string and risk
+ # it being too long.
+ if _is8bitstring(text):
return text
# The `text' argument already has the field name prepended, so don't
# provide it here or the first line will get folded too short.
Index: Header.py
===================================================================
RCS file: /cvsroot/tmda/tmda/TMDA/pythonlib/email/Header.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Header.py 1 Oct 2002 20:06:22 -0000 1.2
+++ Header.py 14 Oct 2002 22:58:12 -0000 1.3
@@ -153,6 +153,8 @@
"""
if charset is None:
charset = USASCII
+ if not isinstance(charset, Charset):
+ charset = Charset(charset)
self._charset = charset
self._continuation_ws = continuation_ws
cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
@@ -216,31 +218,52 @@
charset = self._charset
elif not isinstance(charset, Charset):
charset = Charset(charset)
- # Normalize and check the string
- if isinstance(s, StringType):
- # Possibly raise UnicodeError if it can't e encoded
- unicode(s, charset.get_output_charset())
- elif isinstance(s, UnicodeType):
- # Convert Unicode to byte string for later concatenation
- for charset in USASCII, charset, UTF8:
- try:
- s = s.encode(charset.get_output_charset())
- break
- except UnicodeError:
- pass
- else:
- assert False, 'Could not encode to utf-8'
+ # If the charset is our faux 8bit charset, leave the string unchanged
+ if charset <> '8bit':
+ # We need to test that the string can be converted to unicode and
+ # back to a byte string, given the input and output codecs of the
+ # charset.
+ if isinstance(s, StringType):
+ # Possibly raise UnicodeError if the byte string can't be
+ # converted to a unicode with the input codec of the charset.
+ incodec = charset.input_codec or 'us-ascii'
+ ustr = unicode(s, incodec)
+ # Now make sure that the unicode could be converted back to a
+ # byte string with the output codec, which may be different
+            # than the input codec. Still, use the original byte string.
+ outcodec = charset.output_codec or 'us-ascii'
+ ustr.encode(outcodec)
+ elif isinstance(s, UnicodeType):
+ # Now we have to be sure the unicode string can be converted
+ # to a byte string with a reasonable output codec. We want to
+ # use the byte string in the chunk.
+ for charset in USASCII, charset, UTF8:
+ try:
+ outcodec = charset.output_codec or 'us-ascii'
+ s = s.encode(outcodec)
+ break
+ except UnicodeError:
+ pass
+ else:
+ assert False, 'utf-8 conversion failed'
self._chunks.append((s, charset))
def _split(self, s, charset, firstline=False):
- # Split up a header safely for use with encode_chunks. BAW: this
- # appears to be a private convenience method.
+ # Split up a header safely for use with encode_chunks.
splittable = charset.to_splittable(s)
encoded = charset.from_splittable(splittable)
elen = charset.encoded_header_len(encoded)
if elen <= self._maxlinelen:
return [(encoded, charset)]
+ # If we have undetermined raw 8bit characters sitting in a byte
+ # string, we really don't know what the right thing to do is. We
+ # can't really split it because it might be multibyte data which we
+ # could break if we split it between pairs. The least harm seems to
+ # be to not split the header at all, but that means they could go out
+ # longer than maxlinelen.
+ elif charset == '8bit':
+ return [(s, charset)]
# BAW: I'm not sure what the right test here is. What we're trying to
# do is be faithful to RFC 2822's recommendation that ($2.2.3):
#
@@ -346,27 +369,27 @@
rtn.append(EMPTYSTRING.join(sublines))
return [(chunk, charset) for chunk in rtn]
- def _encode_chunks(self):
- """MIME-encode a header with many different charsets and/or encodings.
-
- Given a list of pairs (string, charset), return a MIME-encoded string
- suitable for use in a header field. Each pair may have different
- charsets and/or encodings, and the resulting header will accurately
- reflect each setting.
-
- Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like
- character sets like iso-8859-1), email.Utils.BASE64 (Base64, for
- non-ASCII like character sets like KOI8-R and iso-2022-jp), or None
- (no encoding).
-
- Each pair will be represented on a separate line; the resulting string
- will be in the format:
-
- "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
- =?charset2?b?SvxyZ2VuIEL2aW5n?="
- """
+ def _encode_chunks(self, newchunks):
+ # MIME-encode a header with many different charsets and/or encodings.
+ #
+ # Given a list of pairs (string, charset), return a MIME-encoded
+ # string suitable for use in a header field. Each pair may have
+ # different charsets and/or encodings, and the resulting header will
+ # accurately reflect each setting.
+ #
+ # Each encoding can be email.Utils.QP (quoted-printable, for
+ # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
+ # (Base64, for non-ASCII like character sets like KOI8-R and
+ # iso-2022-jp), or None (no encoding).
+ #
+ # Each pair will be represented on a separate line; the resulting
+ # string will be in the format:
+ #
+ # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
+    #         =?charset2?b?SvxyZ2VuIEL2aW5n?=
+ #
chunks = []
- for header, charset in self._chunks:
+ for header, charset in newchunks:
if charset is None or charset.header_encoding is None:
# There's no encoding for this chunk's charsets
_max_append(chunks, header, self._maxlinelen)
@@ -397,5 +420,4 @@
newchunks = []
for s, charset in self._chunks:
newchunks += self._split(s, charset, True)
- self._chunks = newchunks
- return self._encode_chunks()
+ return self._encode_chunks(newchunks)
Index: __init__.py
===================================================================
RCS file: /cvsroot/tmda/tmda/TMDA/pythonlib/email/__init__.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- __init__.py 10 Oct 2002 17:27:36 -0000 1.3
+++ __init__.py 14 Oct 2002 22:58:12 -0000 1.4
@@ -4,7 +4,7 @@
"""A package for parsing, handling, and generating email messages.
"""
-__version__ = '2.4.2'
+__version__ = '2.4.3'
__all__ = [
'base64MIME',
_______________________________________
tmda-cvs mailing list
http://tmda.net/lists/listinfo/tmda-cvs