All this business about safely writing pickles got me to wondering about
locking files.  Rather than implement something which was
SpamBayes-specific, I scouted around a little bit then implemented something
which hopefully

  * has a reasonable API (I think)
  * is cross-platform
  * allows multiple different implementations (currently one each based on
    os.link and os.mkdir and one which uses a SQLite database to maintain
    lock info)
  * might someday be good enough for inclusion in Python proper as the One
    True Way to do advisory file locking.

The result is the lockfile module, which you can find at your local
neighborhood (virtual) cheese shop:

    http://pypi.python.org/pypi/lockfile/

I've done a fair amount of testing on my Mac.  (It has a fair number of
doctests.  Thank you again Uncle Timmy.)  Scott Dial (from the python-dev
mailing list) ran it through its paces on Windows a bit and sent me some
revisions.  Other than Scott's inputs I've received no direct feedback on
the module, though when I first raised the idea of one-lock-mechanism-to-
rule-them-all it generated a few comments.  I suspect Py3k and an impending
2.5.2 release are probably sucking up all available Python developer round
tuits.

Today I added lockfile usage into Dave Abrahams' safe_pickle function, in
the process renaming it to safe_pickle_write and adding a corresponding
safe_pickle_read.  I then replaced all pickle.{dump,load} calls with the
relevant safe_pickle_{write,read} functions.

This code has not been checked in yet.  I've attached a diff against the
current rev 3168.  I'm interested in how people think I should proceed.
Some possibilities:

    * throw it out - we don't need dependencies on external crap
    * keep it but pull lockfile into the spambayes package for ease of
      distribution
    * check the changes in on head
    * create a branch and check the changes in there
    * get 1.1 out the door, damn it!  leave this for 1.2 (around 2011)
    * keep the safe_pickle_{read,write} functions but take out the lockfile
      stuff.

Feedback please.

Thx,

Skip

Index: spambayes/TestDriver.py
===================================================================
--- spambayes/TestDriver.py     (revision 3168)
+++ spambayes/TestDriver.py     (working copy)
@@ -33,17 +33,11 @@
     except ImportError:
         from spambayes.compatsets import Set
 
-import cPickle as pickle
-
-try:
-    from heapq import heapreplace
-except ImportError:
-    from spambayes.compatheapq import heapreplace
-
 from spambayes.Options import options
 from spambayes import Tester
 from spambayes import classifier
 from spambayes.Histogram import Hist
+from spambayes import storage
 
 try:
     True, False
@@ -200,9 +194,7 @@
             fname = "%s%d.pik" % (options["TestDriver", "pickle_basename"],
                                   self.ntimes_finishtest_called)
             print "    saving pickle to", fname
-            fp = file(fname, 'wb')
-            pickle.dump(self.classifier, fp, 1)
-            fp.close()
+            storage.safe_pickle_write(fname, self.classifier, 1)
 
     def alldone(self):
         if options["TestDriver", "show_histograms"]:
@@ -240,9 +232,7 @@
                 fname = "%s_%shist.pik" % (options["TestDriver",
                                                    "pickle_basename"], f)
                 print "    saving %s histogram pickle to %s" %(f, fname)
-                fp = file(fname, 'wb')
-                pickle.dump(h, fp, 1)
-                fp.close()
+                storage.safe_pickle_write(fname, h, 1)
 
     def test(self, ham, spam):
         c = self.classifier
Index: spambayes/__init__.py
===================================================================
--- spambayes/__init__.py       (revision 3168)
+++ spambayes/__init__.py       (working copy)
@@ -1,4 +1,4 @@
 # package marker.
 
-__version__ = "1.1a4"
-__date__ = "June 25, 2007"
+__version__ = "1.1a5"
+__date__ = "November 6, 2007"
Index: spambayes/storage.py
===================================================================
--- spambayes/storage.py        (revision 3168)
+++ spambayes/storage.py        (working copy)
@@ -74,6 +74,7 @@
 import shelve
 from spambayes import cdb
 from spambayes import dbmstorage
+import lockfile
 
 # Make shelve use binary pickles by default.
 oldShelvePickler = shelve.Pickler
@@ -85,36 +86,50 @@
 NO_UPDATEPROBS = False   # Probabilities will not be autoupdated with training
 UPDATEPROBS = True       # Probabilities will be autoupdated with training
 
-def safe_pickle(filename, value, protocol=0):
+def safe_pickle_read(filename):
+    """Read pickle file contents with a lock."""
+    lock = lockfile.FileLock(filename)
+    lock.acquire(timeout=20)
+    try:
+        return pickle.load(open(filename, 'rb'))
+    finally:
+        lock.release()
+
+def safe_pickle_write(filename, value, protocol=0):
     '''Store value as a pickle without creating corruption'''
 
-    # Be as defensive as possible.  Always keep a safe copy.
-    tmp = filename + '.tmp'
-    fp = None
-    try: 
-        fp = open(tmp, 'wb') 
-        pickle.dump(value, fp, protocol) 
-        fp.close() 
-    except IOError, e: 
-        if options["globals", "verbose"]: 
-            print >> sys.stderr, 'Failed update: ' + str(e)
-        if fp is not None: 
-            os.remove(tmp) 
-        raise
+    lock = lockfile.FileLock(filename)
+    lock.acquire(timeout=20)
+
     try:
-        # With *nix we can just rename, and (as long as permissions
-        # are correct) the old file will vanish.  With win32, this
-        # won't work - the Python help says that there may not be
-        # a way to do an atomic replace, so we rename the old one,
-        # put the new one there, and then delete the old one.  If
-        # something goes wrong, there is at least a copy of the old
-        # one.
-        os.rename(tmp, filename)
-    except OSError:
-        os.rename(filename, filename + '.bak')
-        os.rename(tmp, filename)
-        os.remove(filename + '.bak')
-    
+        # Be as defensive as possible.  Always keep a safe copy.
+        tmp = filename + '.tmp'
+        fp = None
+        try: 
+            fp = open(tmp, 'wb') 
+            pickle.dump(value, fp, protocol) 
+            fp.close() 
+        except IOError, e: 
+            if options["globals", "verbose"]: 
+                print >> sys.stderr, 'Failed update: ' + str(e)
+            if fp is not None: 
+                os.remove(tmp) 
+            raise
+        try:
+            # With *nix we can just rename, and (as long as permissions
+            # are correct) the old file will vanish.  With win32, this
+            # won't work - the Python help says that there may not be
+            # a way to do an atomic replace, so we rename the old one,
+            # put the new one there, and then delete the old one.  If
+            # something goes wrong, there is at least a copy of the old
+            # one.
+            os.rename(tmp, filename)
+        except OSError:
+            os.rename(filename, filename + '.bak')
+            os.rename(tmp, filename)
+            os.remove(filename + '.bak')
+    finally:
+        lock.release()
 class PickledClassifier(classifier.Classifier):
     '''Classifier object persisted in a pickle'''
 
@@ -171,7 +186,7 @@
         if options["globals", "verbose"]:
             print >> sys.stderr, 'Persisting',self.db_name,'as a pickle'
 
-        safe_pickle(self.db_name, self, PICKLE_TYPE)
+        safe_pickle_write(self.db_name, self, PICKLE_TYPE)
 
     def close(self):
         # we keep no resources open - nothing to do
Index: spambayes/ImageStripper.py
===================================================================
--- spambayes/ImageStripper.py  (revision 3168)
+++ spambayes/ImageStripper.py  (working copy)
@@ -8,14 +8,9 @@
 import os
 import tempfile
 import math
-import time
 import md5
 import atexit
 try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-try:
     import cStringIO as StringIO
 except ImportError:
     import StringIO
@@ -25,6 +20,8 @@
 except ImportError:
     Image = None
 
+from spambayes import storage
+
 # The email mime object carrying the image data can have a special attribute
 # which indicates that a message had an image, but it was large (ie, larger
 # than the 'max_image_size' option.)  This allows the provider of the email
@@ -302,7 +299,7 @@
     def __init__(self, cachefile=""):
         self.cachefile = os.path.expanduser(cachefile)
         if os.path.exists(self.cachefile):
-            self.cache = pickle.load(open(self.cachefile))
+            self.cache = storage.safe_pickle_read(self.cachefile)
         else:
             self.cache = {}
         self.misses = self.hits = 0
@@ -385,7 +382,7 @@
                 print >> sys.stderr, "%.2f%% hit rate" % \
                       (100 * self.hits / (self.hits + self.misses)),
             print >> sys.stderr
-        pickle.dump(self.cache, open(self.cachefile, "wb"))
+        storage.safe_pickle_write(self.cachefile, self.cache)
 
 _cachefile = options["Tokenizer", "crack_image_cache"]
 crack_images = ImageStripper(_cachefile).analyze
Index: spambayes/classifier.py
===================================================================
--- spambayes/classifier.py     (revision 3168)
+++ spambayes/classifier.py     (working copy)
@@ -614,9 +614,8 @@
         self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
         self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
         if os.path.exists(self.bad_url_cache_name):
-            b_file = file(self.bad_url_cache_name, "r")
             try:
-                self.bad_urls = pickle.load(b_file)
+                self.bad_urls = 
storage.safe_pickle_read(self.bad_url_cache_name)
             except IOError, ValueError:
                 # Something went wrong loading it (bad pickle,
                 # probably).  Start afresh.
@@ -625,7 +624,6 @@
                 self.bad_urls = {"url:non_resolving": (),
                                  "url:non_html": (),
                                  "url:unknown_error": ()}
-            b_file.close()
         else:
             if options["globals", "verbose"]:
                 print "URL caches don't exist: creating"
@@ -652,8 +650,8 @@
         # XXX becomes valid, for example).
         for name, data in [(self.bad_url_cache_name, self.bad_urls),
                            (self.http_error_cache_name, 
self.http_error_urls),]:
-            from storage import safe_pickle
-            safe_pickle(name, data)
+            from spambayes import storage
+            storage.safe_pickle_write(name, data)
 
     def slurp(self, proto, url):
         # We generate these tokens:
Index: spambayes/dnscache.py
===================================================================
--- spambayes/dnscache.py       (revision 3168)
+++ spambayes/dnscache.py       (working copy)
@@ -14,12 +14,9 @@
 import time
 import types
 import socket
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
 
 from spambayes.Options import options
+from spambayes import storage
 
 kCheckForPruneEvery=20
 kMaxTTL=60 * 60 * 24 * 7                # One week
@@ -98,7 +95,7 @@
 
         if self.cachefile and os.path.exists(self.cachefile):
             try:
-                self.caches = pickle.load(open(self.cachefile, "rb"))
+                self.caches = storage.safe_pickle_read(self.cachefile)
             except:
                 os.unlink(self.cachefile)
 
@@ -129,8 +126,7 @@
         if self.printStatsAtEnd:
             self.printStats()
         if self.cachefile:
-            from storage import safe_pickle
-            safe_pickle(self.cachefile, self.caches)
+            storage.safe_pickle_write(self.cachefile, self.caches)
 
     def printStats(self):
         for key,val in self.caches.items():
Index: spambayes/message.py
===================================================================
--- spambayes/message.py        (revision 3168)
+++ spambayes/message.py        (working copy)
@@ -81,7 +81,6 @@
     def bool(val):
         return not not val
 
-import os
 import sys
 import types
 import time
@@ -104,7 +103,6 @@
 
 from spambayes import storage
 from spambayes import dbmstorage
-from spambayes.Options import options, get_pathname_option
 from spambayes.tokenizer import tokenize
 
 try:
@@ -220,25 +218,20 @@
 
     def load(self):
         try:
-            fp = open(self.db_name, 'rb')
+            self.db = storage.safe_pickle_read(self.db_name)
         except IOError, e:
             if e.errno == errno.ENOENT:
                 # New pickle
                 self.db = {}
             else:
                 raise
-        else:
-            self.db = pickle.load(fp)
-            fp.close()
 
     def close(self):
         # we keep no resources open - nothing to do
         pass
 
     def store(self):
-        fp = open(self.db_name, 'wb')
-        pickle.dump(self.db, fp, self.mode)
-        fp.close()
+        storage.safe_pickle_write(self.db_name, self.db, self.mode)
 
 class MessageInfoDB(MessageInfoBase):
     def __init__(self, db_name, mode='c'):
Index: scripts/sb_notesfilter.py
===================================================================
--- scripts/sb_notesfilter.py   (revision 3168)
+++ scripts/sb_notesfilter.py   (working copy)
@@ -146,7 +146,6 @@
 import sys
 from spambayes import tokenizer, storage
 from spambayes.Options import options
-import cPickle as pickle
 import errno
 import win32com.client
 import pywintypes
@@ -305,16 +304,13 @@
     bayes = storage.open_storage(bdbname, useDBM)
 
     try:
-        fp = open(idxname, 'rb')
+        notesindex = storage.safe_pickle_read(idxname)
     except IOError, e:
         if e.errno != errno.ENOENT:
             raise
         notesindex = {}
         print "%s file not found, this is a first time run" % (idxname,)
         print "No classification will be performed"
-    else:
-        notesindex = pickle.load(fp)
-        fp.close()
 
     need_replicate = False
 
@@ -378,9 +374,7 @@
 
     bayes.store()
 
-    fp = open(idxname, 'wb')
-    pickle.dump(notesindex, fp)
-    fp.close()
+    storage.safe_pickle_write(idxname, notesindex)
 
     if log:
         log.LogAction("Finished running spambayes")
Index: contrib/findbest.py
===================================================================
--- contrib/findbest.py (revision 3168)
+++ contrib/findbest.py (working copy)
@@ -66,7 +66,6 @@
 
 import sys
 import os
-import cPickle as pickle
 import getopt
 import math
 
@@ -75,6 +74,7 @@
 from spambayes.hammie import Hammie
 from spambayes.tokenizer import tokenize
 from spambayes.Options import options
+from spambayes import storage
 
 cls = Classifier()
 h = Hammie(cls)
@@ -223,7 +223,7 @@
     print "scoring"
 
     if best:
-        last_scores = pickle.load(file(bestfile))
+        last_scores = storage.safe_pickle_read(bestfile)
         last_scores = last_scores.items()
         last_scores.sort()
         msgids = set()
@@ -240,7 +240,7 @@
         pass
 
     if not best:
-        pickle.dump(scores, file(bestfile, 'w'))
+        storage.safe_pickle_write(bestfile, scores)
 
     return 0
 
Index: utilities/mkreversemap.py
===================================================================
--- utilities/mkreversemap.py   (revision 3168)
+++ utilities/mkreversemap.py   (working copy)
@@ -18,12 +18,12 @@
 import sys
 import getopt
 import anydbm
-import cPickle as pickle
 
 from spambayes.mboxutils import getmbox
 from spambayes.tokenizer import tokenize
 from spambayes.Options import options
 from spambayes.classifier import Classifier
+from spambayes import storage
 
 prog = sys.argv[0]
 
@@ -99,13 +99,13 @@
         return 1
 
     try:
-        mapd = pickle.load(file(mapfile))
+        mapd = storage.safe_pickle_read(mapfile)
     except IOError:
         mapd = {}
 
     for f in args:
         mapmessages(f, mboxtype, mapd)
-    pickle.dump(mapd, file(mapfile, "w"))
+    storage.safe_pickle_write(mapfile, mapd)
 
 if __name__ == "__main__":
     sys.exit(main(sys.argv[1:]))
Index: utilities/extractmessages.py
===================================================================
--- utilities/extractmessages.py        (revision 3168)
+++ utilities/extractmessages.py        (working copy)
@@ -23,11 +23,11 @@
 import sys
 import getopt
 import re
-import cPickle as pickle
 import locale
 from email.Header import make_header, decode_header
 
 from spambayes.mboxutils import getmbox
+from spambayes import storage
 
 prog = sys.argv[0]
 
@@ -115,7 +115,7 @@
         return 1
 
     try:
-        mapd = pickle.load(file(mapfile))
+        mapd = storage.safe_pickle_read(mapfile)
     except IOError:
         usage("Mapfile %s does not exist" % mapfile)
         return 1
Index: utilities/HistToGNU.py
===================================================================
--- utilities/HistToGNU.py      (revision 3168)
+++ utilities/HistToGNU.py      (working copy)
@@ -22,9 +22,9 @@
 
 from spambayes.Options import options
 from spambayes.TestDriver import Hist
+from spambayes import storage
 
 import sys
-import cPickle as pickle
 
 program = sys.argv[0]
 
@@ -38,7 +38,7 @@
 
 def loadHist(path):
     """Load the histogram pickle object"""
-    return pickle.load(file(path))
+    return storage.safe_pickle_read(path)
 
 def outputHist(hist, f=sys.stdout):
     """Output the Hist object to file f"""
_______________________________________________
spambayes-dev mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-dev

Reply via email to