Revision: 3222
http://spambayes.svn.sourceforge.net/spambayes/?rev=3222&view=rev
Author: montanaro
Date: 2008-11-28 15:45:43 +0000 (Fri, 28 Nov 2008)
Log Message:
-----------
Add -d flag (skip duplicate messages).
Modified Paths:
--------------
trunk/spambayes/utilities/splitndirs.py
Modified: trunk/spambayes/utilities/splitndirs.py
===================================================================
--- trunk/spambayes/utilities/splitndirs.py 2008-11-25 15:34:18 UTC (rev 3221)
+++ trunk/spambayes/utilities/splitndirs.py 2008-11-28 15:45:43 UTC (rev 3222)
@@ -24,6 +24,8 @@
-n N
The number of output mboxes desired. This is required.
+ -d Eliminate duplicates.
+
Arguments:
sourcembox
The mbox or path to an mbox to split.
@@ -49,6 +51,10 @@
import random
import getopt
import glob
+try:
+ from hashlib import md5
+except ImportError:
+ from md5 import new as md5
from spambayes import mboxutils
@@ -69,13 +75,14 @@
def main():
try:
- opts, args = getopt.getopt(sys.argv[1:], 'hgn:s:v', ['help'])
+ opts, args = getopt.getopt(sys.argv[1:], 'dhgn:s:v', ['help'])
except getopt.error, msg:
usage(1, msg)
doglob = False
n = None
verbose = False
+ delete_dups = False
for opt, arg in opts:
if opt in ('-h', '--help'):
usage(0)
@@ -87,6 +94,8 @@
n = int(arg)
elif opt == '-v':
verbose = True
+ elif opt == '-d':
+ delete_dups = True
if n is None or n <= 1:
usage(1, "an -n value > 1 is required")
@@ -101,6 +110,8 @@
os.makedirs(dir)
counter = 0
+ cksums = set()
+ skipped = 0
for inputpath in inputpaths:
if doglob:
inpaths = glob.glob(inputpath)
@@ -110,8 +121,13 @@
for inpath in inpaths:
mbox = mboxutils.getmbox(inpath)
for msg in mbox:
+ astext = str(msg)
+ cksum = md5(astext).hexdigest()
+ if delete_dups and cksum in cksums:
+ skipped += 1
+ continue
+ cksums.add(cksum)
i = random.randrange(n)
- astext = str(msg)
#assert astext.endswith('\n')
counter += 1
msgfile = open('%s/%d' % (outdirs[i], counter), 'wb')
@@ -125,6 +141,8 @@
if verbose:
print
print counter, "messages split into", n, "directories"
+ if skipped:
+ print "skipped", skipped, "duplicate messages"
if __name__ == '__main__':
main()
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins