Update of /cvsroot/spambayes/spambayes/contrib In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv16513
Modified Files: pycksum.py Log Message: * Try to improve the duplicate detection capability. Lots of spam nowadays has random text junk, so be more lenient about how many chunks have to match. Also do a little more filtering on the source: - Compress multiple spaces and tabs to a single space - Compress multiple contiguous newlines into one - Map all strings of digits to a single "#" character - Map some common html entities to their plain text equivalents. * Use md5 checksum hexdigests instead of binascii.b2a_hex. * Correct line breaking of filtered body. * Use email.generator to flatten body instead of the broken flatten() function. Index: pycksum.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/contrib/pycksum.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** pycksum.py 25 May 2004 14:58:39 -0000 1.1 --- pycksum.py 18 Aug 2006 02:29:02 -0000 1.2 *************** *** 39,60 **** import sys import email.Parser import md5 import anydbm import re import time ! import binascii ! ! def flatten(body): ! # three types are possible: list, string, Message ! if isinstance(body, str): ! return body ! if hasattr(body, "get_payload"): ! payload = body.get_payload() ! if payload is None: ! return "" ! return flatten(payload) ! if isinstance(body, list): ! return "\n".join([flatten(b) for b in body]) ! raise TypeError, ("unrecognized body type: %s" % type(body)) def clean(data): --- 39,51 ---- import sys import email.Parser + import email.generator import md5 import anydbm import re import time ! try: ! import cStringIO as StringIO ! except ImportError: ! import StringIO def clean(data): *************** *** 67,74 **** data = re.sub(r"<[^>]*>", "", data).lower() # delete anything which looks like a url or email address # not sure what a pmguid: url is but it seems to occur frequently in spam # also convert all runs of whitespace into a single space ! 
return " ".join([w for w in data.split() if ('@' not in w and (':' not in w or --- 58,78 ---- data = re.sub(r"<[^>]*>", "", data).lower() + # Map all digits to '#' + data = re.sub(r"[0-9]+", "#", data) + + # Map a few common html entities + data = re.sub(r"(&nbsp;)+", " ", data) + data = re.sub(r"&lt;", "<", data) + data = re.sub(r"&gt;", ">", data) + data = re.sub(r"&amp;", "&", data) + + # Elide blank lines and multiple horizontal whitespace + data = re.sub(r"\n+", "\n", data) + data = re.sub(r"[ \t]+", " ", data) + # delete anything which looks like a url or email address # not sure what a pmguid: url is but it seems to occur frequently in spam # also convert all runs of whitespace into a single space ! return " ".join([w for w in data.split(" ") if ('@' not in w and (':' not in w or *************** *** 87,97 **** # separately or in various combinations if desired. ! body = flatten(msg) ! lines = clean(body) chunksize = len(lines)//4+1 sum = [] for i in range(4): chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize]) ! sum.append(binascii.b2a_hex(md5.new(chunk).digest())) return ".".join(sum) --- 91,105 ---- # separately or in various combinations if desired. ! fp = StringIO.StringIO() ! g = email.generator.Generator(fp, mangle_from_=False, maxheaderlen=60) ! g.flatten(msg) ! text = fp.getvalue() ! body = text.split("\n\n", 1)[1] ! lines = clean(body).split("\n") chunksize = len(lines)//4+1 sum = [] for i in range(4): chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize]) ! sum.append(md5.new(chunk).hexdigest()) return ".".join(sum) *************** *** 102,111 **** db = anydbm.open(f, "c") maxdblen = 2**14 ! # consider the first three pieces, the last three pieces and the middle ! # two pieces - one or more will likely eliminate attempts at disrupting ! # the checksum - if any are found in the db file, call it a match ! for subsum in (".".join(pieces[:-1]), ".".join(pieces[1:-1]), ! 
".".join(pieces[1:])): if not db.has_key(subsum): db[subsum] = str(time.time()) --- 110,119 ---- db = anydbm.open(f, "c") maxdblen = 2**14 ! # consider the first two pieces, the middle two pieces and the last two ! # pieces - one or more will likely eliminate attempts at disrupting the ! # checksum - if any are found in the db file, call it a match ! for subsum in (".".join(pieces[:-2]), ".".join(pieces[1:-1]), ! ".".join(pieces[2:])): if not db.has_key(subsum): db[subsum] = str(time.time()) *************** *** 155,157 **** if __name__ == "__main__": sys.exit(main(sys.argv[1:])) - --- 163,164 ---- _______________________________________________ Spambayes-checkins mailing list Spambayes-checkins@python.org http://mail.python.org/mailman/listinfo/spambayes-checkins