All this business about safely writing pickles got me to wondering about
locking files. Rather than implement something which was
SpamBayes-specific, I scouted around a little bit then implemented something
which hopefully
* has a reasonable API (I think)
* is cross-platform
* allows multiple different implementations (currently one each based on
os.link and os.mkdir and one which uses a SQLite database to maintain
lock info)
* might someday be good enough for inclusion in Python proper as the One
True Way to do advisory file locking.
The result is the lockfile module, which you can find at your local
neighborhood (virtual) cheese shop:
http://pypi.python.org/pypi/lockfile/
I've done a fair amount of testing on my Mac. (It has a fair number of
doctests. Thank you again Uncle Timmy.) Scott Dial (from the python-dev
mailing list) ran it through its paces on Windows a bit and sent me some
revisions. Other than Scott's inputs I've received no direct feedback on
the module, though when I first raised the idea of one-lock-mechanism-to-
rule-them-all it generated a few comments. I suspect Py3k and an impending
2.5.2 release are probably sucking up all available Python developer round
tuits.
Today I added lockfile usage into Dave Abrahams' safe_pickle function, in
the process renaming it to safe_pickle_write and adding a corresponding
safe_pickle_read. I then replaced all pickle.{dump,load} calls with the
relevant safe_pickle_{write,read} functions.
This code has not been checked in yet. I've attached a diff against the
current rev 3168. I'm interested in how people think I should proceed.
Some possibilities:
* throw it out - we don't need dependencies on external crap
* keep it but pull lockfile into the spambayes package for ease of
distribution
* check the changes in on head
* create a branch and check the changes in there
* get 1.1 out the door, damn it! leave this for 1.2 (around 2011)
* keep the safe_pickle_{read,write} functions but take out the lockfile
stuff.
Feedback please.
Thx,
Skip
Index: spambayes/TestDriver.py
===================================================================
--- spambayes/TestDriver.py (revision 3168)
+++ spambayes/TestDriver.py (working copy)
@@ -33,17 +33,11 @@
except ImportError:
from spambayes.compatsets import Set
-import cPickle as pickle
-
-try:
- from heapq import heapreplace
-except ImportError:
- from spambayes.compatheapq import heapreplace
-
from spambayes.Options import options
from spambayes import Tester
from spambayes import classifier
from spambayes.Histogram import Hist
+from spambayes import storage
try:
True, False
@@ -200,9 +194,7 @@
fname = "%s%d.pik" % (options["TestDriver", "pickle_basename"],
self.ntimes_finishtest_called)
print " saving pickle to", fname
- fp = file(fname, 'wb')
- pickle.dump(self.classifier, fp, 1)
- fp.close()
+ storage.safe_pickle_write(fname, self.classifier, 1)
def alldone(self):
if options["TestDriver", "show_histograms"]:
@@ -240,9 +232,7 @@
fname = "%s_%shist.pik" % (options["TestDriver",
"pickle_basename"], f)
print " saving %s histogram pickle to %s" %(f, fname)
- fp = file(fname, 'wb')
- pickle.dump(h, fp, 1)
- fp.close()
+ storage.safe_pickle_write(fname, h, 1)
def test(self, ham, spam):
c = self.classifier
Index: spambayes/__init__.py
===================================================================
--- spambayes/__init__.py (revision 3168)
+++ spambayes/__init__.py (working copy)
@@ -1,4 +1,4 @@
# package marker.
-__version__ = "1.1a4"
-__date__ = "June 25, 2007"
+__version__ = "1.1a5"
+__date__ = "November 6, 2007"
Index: spambayes/storage.py
===================================================================
--- spambayes/storage.py (revision 3168)
+++ spambayes/storage.py (working copy)
@@ -74,6 +74,7 @@
import shelve
from spambayes import cdb
from spambayes import dbmstorage
+import lockfile
# Make shelve use binary pickles by default.
oldShelvePickler = shelve.Pickler
@@ -85,36 +86,50 @@
NO_UPDATEPROBS = False # Probabilities will not be autoupdated with training
UPDATEPROBS = True # Probabilities will be autoupdated with training
-def safe_pickle(filename, value, protocol=0):
+def safe_pickle_read(filename):
+ """Read pickle file contents with a lock."""
+ lock = lockfile.FileLock(filename)
+ lock.acquire(timeout=20)
+ try:
+ return pickle.load(open(filename, 'rb'))
+ finally:
+ lock.release()
+
+def safe_pickle_write(filename, value, protocol=0):
'''Store value as a pickle without creating corruption'''
- # Be as defensive as possible. Always keep a safe copy.
- tmp = filename + '.tmp'
- fp = None
- try:
- fp = open(tmp, 'wb')
- pickle.dump(value, fp, protocol)
- fp.close()
- except IOError, e:
- if options["globals", "verbose"]:
- print >> sys.stderr, 'Failed update: ' + str(e)
- if fp is not None:
- os.remove(tmp)
- raise
+ lock = lockfile.FileLock(filename)
+ lock.acquire(timeout=20)
+
try:
- # With *nix we can just rename, and (as long as permissions
- # are correct) the old file will vanish. With win32, this
- # won't work - the Python help says that there may not be
- # a way to do an atomic replace, so we rename the old one,
- # put the new one there, and then delete the old one. If
- # something goes wrong, there is at least a copy of the old
- # one.
- os.rename(tmp, filename)
- except OSError:
- os.rename(filename, filename + '.bak')
- os.rename(tmp, filename)
- os.remove(filename + '.bak')
-
+ # Be as defensive as possible. Always keep a safe copy.
+ tmp = filename + '.tmp'
+ fp = None
+ try:
+ fp = open(tmp, 'wb')
+ pickle.dump(value, fp, protocol)
+ fp.close()
+ except IOError, e:
+ if options["globals", "verbose"]:
+ print >> sys.stderr, 'Failed update: ' + str(e)
+ if fp is not None:
+ os.remove(tmp)
+ raise
+ try:
+ # With *nix we can just rename, and (as long as permissions
+ # are correct) the old file will vanish. With win32, this
+ # won't work - the Python help says that there may not be
+ # a way to do an atomic replace, so we rename the old one,
+ # put the new one there, and then delete the old one. If
+ # something goes wrong, there is at least a copy of the old
+ # one.
+ os.rename(tmp, filename)
+ except OSError:
+ os.rename(filename, filename + '.bak')
+ os.rename(tmp, filename)
+ os.remove(filename + '.bak')
+ finally:
+ lock.release()
class PickledClassifier(classifier.Classifier):
'''Classifier object persisted in a pickle'''
@@ -171,7 +186,7 @@
if options["globals", "verbose"]:
print >> sys.stderr, 'Persisting',self.db_name,'as a pickle'
- safe_pickle(self.db_name, self, PICKLE_TYPE)
+ safe_pickle_write(self.db_name, self, PICKLE_TYPE)
def close(self):
# we keep no resources open - nothing to do
Index: spambayes/ImageStripper.py
===================================================================
--- spambayes/ImageStripper.py (revision 3168)
+++ spambayes/ImageStripper.py (working copy)
@@ -8,14 +8,9 @@
import os
import tempfile
import math
-import time
import md5
import atexit
try:
- import cPickle as pickle
-except ImportError:
- import pickle
-try:
import cStringIO as StringIO
except ImportError:
import StringIO
@@ -25,6 +20,8 @@
except ImportError:
Image = None
+from spambayes import storage
+
# The email mime object carrying the image data can have a special attribute
# which indicates that a message had an image, but it was large (ie, larger
# than the 'max_image_size' option.) This allows the provider of the email
@@ -302,7 +299,7 @@
def __init__(self, cachefile=""):
self.cachefile = os.path.expanduser(cachefile)
if os.path.exists(self.cachefile):
- self.cache = pickle.load(open(self.cachefile))
+ self.cache = storage.safe_pickle_read(self.cachefile)
else:
self.cache = {}
self.misses = self.hits = 0
@@ -385,7 +382,7 @@
print >> sys.stderr, "%.2f%% hit rate" % \
(100 * self.hits / (self.hits + self.misses)),
print >> sys.stderr
- pickle.dump(self.cache, open(self.cachefile, "wb"))
+ storage.safe_pickle_write(self.cachefile, self.cache)
_cachefile = options["Tokenizer", "crack_image_cache"]
crack_images = ImageStripper(_cachefile).analyze
Index: spambayes/classifier.py
===================================================================
--- spambayes/classifier.py (revision 3168)
+++ spambayes/classifier.py (working copy)
@@ -614,9 +614,8 @@
self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
if os.path.exists(self.bad_url_cache_name):
- b_file = file(self.bad_url_cache_name, "r")
try:
- self.bad_urls = pickle.load(b_file)
+ self.bad_urls =
storage.safe_pickle_read(self.bad_url_cache_name)
except IOError, ValueError:
# Something went wrong loading it (bad pickle,
# probably). Start afresh.
@@ -625,7 +624,6 @@
self.bad_urls = {"url:non_resolving": (),
"url:non_html": (),
"url:unknown_error": ()}
- b_file.close()
else:
if options["globals", "verbose"]:
print "URL caches don't exist: creating"
@@ -652,8 +650,8 @@
# XXX becomes valid, for example).
for name, data in [(self.bad_url_cache_name, self.bad_urls),
(self.http_error_cache_name,
self.http_error_urls),]:
- from storage import safe_pickle
- safe_pickle(name, data)
+ from spambayes import storage
+ storage.safe_pickle_write(name, data)
def slurp(self, proto, url):
# We generate these tokens:
Index: spambayes/dnscache.py
===================================================================
--- spambayes/dnscache.py (revision 3168)
+++ spambayes/dnscache.py (working copy)
@@ -14,12 +14,9 @@
import time
import types
import socket
-try:
- import cPickle as pickle
-except ImportError:
- import pickle
from spambayes.Options import options
+from spambayes import storage
kCheckForPruneEvery=20
kMaxTTL=60 * 60 * 24 * 7 # One week
@@ -98,7 +95,7 @@
if self.cachefile and os.path.exists(self.cachefile):
try:
- self.caches = pickle.load(open(self.cachefile, "rb"))
+ self.caches = storage.safe_pickle_read(self.cachefile)
except:
os.unlink(self.cachefile)
@@ -129,8 +126,7 @@
if self.printStatsAtEnd:
self.printStats()
if self.cachefile:
- from storage import safe_pickle
- safe_pickle(self.cachefile, self.caches)
+ storage.safe_pickle_write(self.cachefile, self.caches)
def printStats(self):
for key,val in self.caches.items():
Index: spambayes/message.py
===================================================================
--- spambayes/message.py (revision 3168)
+++ spambayes/message.py (working copy)
@@ -81,7 +81,6 @@
def bool(val):
return not not val
-import os
import sys
import types
import time
@@ -104,7 +103,6 @@
from spambayes import storage
from spambayes import dbmstorage
-from spambayes.Options import options, get_pathname_option
from spambayes.tokenizer import tokenize
try:
@@ -220,25 +218,20 @@
def load(self):
try:
- fp = open(self.db_name, 'rb')
+ self.db = storage.safe_pickle_read(self.db_name)
except IOError, e:
if e.errno == errno.ENOENT:
# New pickle
self.db = {}
else:
raise
- else:
- self.db = pickle.load(fp)
- fp.close()
def close(self):
# we keep no resources open - nothing to do
pass
def store(self):
- fp = open(self.db_name, 'wb')
- pickle.dump(self.db, fp, self.mode)
- fp.close()
+ storage.safe_pickle_write(self.db_name, self.db, self.mode)
class MessageInfoDB(MessageInfoBase):
def __init__(self, db_name, mode='c'):
Index: scripts/sb_notesfilter.py
===================================================================
--- scripts/sb_notesfilter.py (revision 3168)
+++ scripts/sb_notesfilter.py (working copy)
@@ -146,7 +146,6 @@
import sys
from spambayes import tokenizer, storage
from spambayes.Options import options
-import cPickle as pickle
import errno
import win32com.client
import pywintypes
@@ -305,16 +304,13 @@
bayes = storage.open_storage(bdbname, useDBM)
try:
- fp = open(idxname, 'rb')
+ notesindex = storage.safe_pickle_read(idxname)
except IOError, e:
if e.errno != errno.ENOENT:
raise
notesindex = {}
print "%s file not found, this is a first time run" % (idxname,)
print "No classification will be performed"
- else:
- notesindex = pickle.load(fp)
- fp.close()
need_replicate = False
@@ -378,9 +374,7 @@
bayes.store()
- fp = open(idxname, 'wb')
- pickle.dump(notesindex, fp)
- fp.close()
+ storage.safe_pickle_write(idxname, notesindex)
if log:
log.LogAction("Finished running spambayes")
Index: contrib/findbest.py
===================================================================
--- contrib/findbest.py (revision 3168)
+++ contrib/findbest.py (working copy)
@@ -66,7 +66,6 @@
import sys
import os
-import cPickle as pickle
import getopt
import math
@@ -75,6 +74,7 @@
from spambayes.hammie import Hammie
from spambayes.tokenizer import tokenize
from spambayes.Options import options
+from spambayes import storage
cls = Classifier()
h = Hammie(cls)
@@ -223,7 +223,7 @@
print "scoring"
if best:
- last_scores = pickle.load(file(bestfile))
+ last_scores = storage.safe_pickle_read(bestfile)
last_scores = last_scores.items()
last_scores.sort()
msgids = set()
@@ -240,7 +240,7 @@
pass
if not best:
- pickle.dump(scores, file(bestfile, 'w'))
+ storage.safe_pickle_write(bestfile, scores)
return 0
Index: utilities/mkreversemap.py
===================================================================
--- utilities/mkreversemap.py (revision 3168)
+++ utilities/mkreversemap.py (working copy)
@@ -18,12 +18,12 @@
import sys
import getopt
import anydbm
-import cPickle as pickle
from spambayes.mboxutils import getmbox
from spambayes.tokenizer import tokenize
from spambayes.Options import options
from spambayes.classifier import Classifier
+from spambayes import storage
prog = sys.argv[0]
@@ -99,13 +99,13 @@
return 1
try:
- mapd = pickle.load(file(mapfile))
+ mapd = storage.safe_pickle_read(mapfile)
except IOError:
mapd = {}
for f in args:
mapmessages(f, mboxtype, mapd)
- pickle.dump(mapd, file(mapfile, "w"))
+ storage.safe_pickle_write(mapfile, mapd)
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: utilities/extractmessages.py
===================================================================
--- utilities/extractmessages.py (revision 3168)
+++ utilities/extractmessages.py (working copy)
@@ -23,11 +23,11 @@
import sys
import getopt
import re
-import cPickle as pickle
import locale
from email.Header import make_header, decode_header
from spambayes.mboxutils import getmbox
+from spambayes import storage
prog = sys.argv[0]
@@ -115,7 +115,7 @@
return 1
try:
- mapd = pickle.load(file(mapfile))
+ mapd = storage.safe_pickle_read(mapfile)
except IOError:
usage("Mapfile %s does not exist" % mapfile)
return 1
Index: utilities/HistToGNU.py
===================================================================
--- utilities/HistToGNU.py (revision 3168)
+++ utilities/HistToGNU.py (working copy)
@@ -22,9 +22,9 @@
from spambayes.Options import options
from spambayes.TestDriver import Hist
+from spambayes import storage
import sys
-import cPickle as pickle
program = sys.argv[0]
@@ -38,7 +38,7 @@
def loadHist(path):
"""Load the histogram pickle object"""
- return pickle.load(file(path))
+ return storage.safe_pickle_read(path)
def outputHist(hist, f=sys.stdout):
"""Output the Hist object to file f"""
_______________________________________________
spambayes-dev mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-dev