Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5654/spambayes

Modified Files:
        storage.py 
Log Message:
Fix [ 1187208 ] import into CDB chokes on 8-bit chars

Index: storage.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/storage.py,v
retrieving revision 1.50
retrieving revision 1.51
diff -C2 -d -r1.50 -r1.51
*** storage.py  21 Apr 2005 07:16:48 -0000      1.50
--- storage.py  22 Apr 2005 04:08:25 -0000      1.51
***************
*** 65,68 ****
--- 65,69 ----
  import os
  import sys
+ import types
  from spambayes import classifier
  from spambayes.Options import options, get_pathname_option
***************
*** 620,623 ****
--- 621,636 ----
          return wi
  
+     # Stolen from sb_dbexpimp.py
+     # Heaven only knows what encoding non-ASCII stuff will be in
+     # Try a few common western encodings and punt if they all fail
+     def uunquote(self, s):
+         for encoding in ("utf-8", "cp1252", "iso-8859-1"):
+             try:
+                 return unicode(s, encoding)
+             except UnicodeDecodeError:
+                 pass
+         # punt
+         return s
+ 
      def load(self):
          if os.path.exists(self.db_name):
***************
*** 627,631 ****
              self.nham, self.nspam = [int(i) for i in \
                                       data[self.statekey].split(',')]
!             self.wordinfo = dict([(k, self._WordInfoFactory(v)) \
                                    for k, v in data.iteritems() \
                                        if k != self.statekey])
--- 640,645 ----
              self.nham, self.nspam = [int(i) for i in \
                                       data[self.statekey].split(',')]
!             self.wordinfo = dict([(self.uunquote(k),
!                                    self._WordInfoFactory(v)) \
                                    for k, v in data.iteritems() \
                                        if k != self.statekey])
***************
*** 645,648 ****
--- 659,664 ----
          items = [(self.statekey, "%d,%d" % (self.nham, self.nspam))]
          for word, wi in self.wordinfo.iteritems():
+             if isinstance(word, types.UnicodeType):
+                 word = word.encode("utf-8")
              items.append((word, "%d,%d" % (wi.hamcount, wi.spamcount)))
          db = open(self.db_name, "wb")

_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins

Reply via email to