Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv30712/spambayes

Modified Files:
        Options.py tokenizer.py 
Log Message:
Add an x-short_runs option.  When enabled, runs of short words are counted
instead of being skipped outright; the longest run generates a token using
the usual log2() bucketing technique.  See the comment in tokenizer.py and
the docstring in Options.py for examples of the sort of thing it attempts
to catch.
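
For reference, here is a tiny standalone illustration of the bucketing the
log message refers to.  The log2() helper below is meant to mirror the one
tokenizer.py already defines; everything else is this sketch's own and not
part of the checkin:

    import math

    def log2(n):
        # int(log2(n)) buckets run lengths into coarse bins, so runs of
        # similar length all map to the same token
        return math.log(n) / math.log(2)

    # A run of 9 short "words", e.g. "X j A m N j A d X h",
    # would yield the token "short:3", since int(log2(9)) == 3.
    print("short:%d" % int(log2(9)))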


Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.132
retrieving revision 1.133
diff -C2 -d -r1.132 -r1.133
*** Options.py  6 Aug 2006 16:14:17 -0000       1.132
--- Options.py  6 Aug 2006 16:34:37 -0000       1.133
***************
*** 98,101 ****
--- 98,109 ----
       INTEGER, RESTORE),
  
+     ("x-short_runs", _("Count runs of short 'words'"), False,
+      _("""(EXPERIMENTAL) If true, generate tokens based on max number of
+      short word runs. Short words are anything of length < the
+      skip_max_word_size option.  Normally they are skipped, but one common
+      spam technique spells words like 'V I A G RA'.
+      """),
+      BOOLEAN, RESTORE),
+ 
      ("count_all_header_lines", _("Count all header lines"), False,
       _("""Generate tokens just counting the number of instances of each kind

Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.38
retrieving revision 1.39
diff -C2 -d -r1.38 -r1.39
*** tokenizer.py        6 Aug 2006 16:19:19 -0000       1.38
--- tokenizer.py        6 Aug 2006 16:34:37 -0000       1.39
***************
*** 1531,1543 ****
                                                    "skip_max_word_size"]):
          """Tokenize everything in the chunk of text we were handed."""
          for w in text.split():
              n = len(w)
!             # Make sure this range matches in tokenize_word().
!             if 3 <= n <= maxword:
!                 yield w
  
!             elif n >= 3:
!                 for t in tokenize_word(w):
!                     yield t
  
      def tokenize_body(self, msg):
--- 1531,1558 ----
                                                    "skip_max_word_size"]):
          """Tokenize everything in the chunk of text we were handed."""
+         short_runs = Set()
+         short_count = 0
          for w in text.split():
              n = len(w)
!             if n < 3:
!                 # count how many short words we see in a row - meant to
!                 # latch onto crap like this:
!                 # X j A m N j A d X h
!                 # M k E z R d I p D u I m A c
!                 # C o I d A t L j I v S j
!                 short_count += 1
!             else:
!                 if short_count:
!                     short_runs.add(short_count)
!                     short_count = 0
!                 # Make sure this range matches in tokenize_word().
!                 if 3 <= n <= maxword:
!                     yield w
  
!                 elif n >= 3:
!                     for t in tokenize_word(w):
!                         yield t
!         if short_runs and options["Tokenizer", "x-short_runs"]:
!             yield "short:%d" % int(log2(max(short_runs)))
  
      def tokenize_body(self, msg):
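
For readers following along, here is a self-contained approximation of the
new loop, modernized to the built-in set type and using hypothetical names
(short_run_tokens is this sketch's invention; the tokenize_word() branch
and the option check are omitted for brevity).  Note one quirk of the
committed version above: a run of short words at the very end of the text
is never flushed into short_runs, so it cannot influence the token.  The
sketch flushes it after the loop:

    import math

    def log2(n):
        return math.log(n) / math.log(2)

    def short_run_tokens(text, maxword=12):
        """Yield normal words plus one "short:n" token summarizing
        the longest run of consecutive short (< 3 char) words."""
        short_runs = set()
        short_count = 0
        for w in text.split():
            n = len(w)
            if n < 3:
                short_count += 1
            else:
                if short_count:
                    short_runs.add(short_count)
                    short_count = 0
                if 3 <= n <= maxword:
                    yield w
        if short_count:
            # flush a run that ends the text (the checkin drops this case)
            short_runs.add(short_count)
        if short_runs:
            yield "short:%d" % int(log2(max(short_runs)))

    print(list(short_run_tokens("buy V I A G R A now")))
    # -> ['buy', 'now', 'short:2']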
