Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv18791

Modified Files:
        ImageStripper.py 
Log Message:
The spammers don't just chop up their GIF images left-to-right.  Concatenate
them left-to-right until the height of adjacent images changes, then start a
new row.  At the end concatenate the rows top-to-bottom.

Add a couple of tokens to mark decode or conversion errors.

The *_decode_parts methods don't use the class's state, so make them
module-level functions instead of methods.


Index: ImageStripper.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/ImageStripper.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** ImageStripper.py    10 Aug 2006 04:07:59 -0000      1.2
--- ImageStripper.py    13 Aug 2006 16:27:49 -0000      1.3
***************
*** 83,179 ****
      return decoders
  
! def imconcat(im1, im2):
!     # concatenate im1 and im2 left-to-right
!     w1, h1 = im1.size
!     w2, h2 = im2.size
!     im3 = Image.new("RGB", (w1+w2, max(h1, h2)))
!     im3.paste(im1, (0, 0))
!     im3.paste(im2, (0, w1))
!     return im3
  
! class ImageStripper:
!     def __init__(self, cachefile=""):
!         self.cachefile = os.path.expanduser(cachefile)
!         if os.path.exists(self.cachefile):
!             self.cache = pickle.load(open(self.cachefile))
!         else:
!             self.cache = {}
!         self.misses = self.hits = 0
!         if self.cachefile:
!             atexit.register(self.close)
  
!     def NetPBM_decode_parts(self, parts, decoders):
!         pnmfiles = []
!         for part in parts:
!             decoder = decoders.get(part.get_content_type())
!             if decoder is None:
!                 continue
!             try:
!                 bytes = part.get_payload(decode=True)
!             except:
!                 continue
  
!             if len(bytes) > options["Tokenizer", "max_image_size"]:
!                 continue                # assume it's just a picture for now
  
!             fd, imgfile = tempfile.mkstemp()
!             os.write(fd, bytes)
!             os.close(fd)
  
              fd, pnmfile = tempfile.mkstemp()
              os.close(fd)
!             os.system("%s <%s >%s 2>dev.null" % (decoder, imgfile, pnmfile))
!             pnmfiles.append(pnmfile)
!             os.unlink(imgfile)
  
!         if not pnmfiles:
!             return
  
!         if len(pnmfiles) > 1:
!             if find_program("pnmcat"):
!                 fd, pnmfile = tempfile.mkstemp()
!                 os.close(fd)
!                 os.system("pnmcat -lr %s > %s 2>/dev/null" %
!                           (" ".join(pnmfiles), pnmfile))
!                 for f in pnmfiles:
!                     os.unlink(f)
!                 pnmfiles = [pnmfile]
  
!         return pnmfiles
  
!     def PIL_decode_parts(self, parts):
!         full_image = None
!         for part in parts:
!             try:
!                 bytes = part.get_payload(decode=True)
!             except:
!                 continue
  
!             if len(bytes) > options["Tokenizer", "max_image_size"]:
!                 continue                # assume it's just a picture for now
  
!             # We're dealing with spammers here - who knows what garbage they
!             # will call a GIF image to entice you to open it?
!             try:
!                 image = Image.open(StringIO.StringIO(bytes))
!                 image.load()
!             except IOError:
!                 continue
!             else:
!                 image = image.convert("RGB")
  
!             if full_image is None:
!                 full_image = image
!             else:
!                 full_image = imconcat(full_image, image)
  
!         if not full_image:
!             return
  
!         fd, pnmfile = tempfile.mkstemp()
!         os.close(fd)
!         full_image.save(open(pnmfile, "wb"), "PPM")
  
!         return [pnmfile]
  
      def extract_ocr_info(self, pnmfiles):
--- 83,228 ----
      return decoders
  
! def imconcatlr(left, right):
!     """Concatenate two images left to right."""
!     w1, h1 = left.size
!     w2, h2 = right.size
!     result = Image.new("RGB", (w1 + w2, max(h1, h2)))
!     result.paste(left, (0, 0))
!     result.paste(right, (w1, 0))
!     return result
  
! def imconcattb(upper, lower):
!     """Concatenate two images top to bottom."""
!     w1, h1 = upper.size
!     w2, h2 = lower.size
!     result = Image.new("RGB", (max(w1, w2), h1 + h2))
!     result.paste(upper, (0, 0))
!     result.paste(lower, (0, h1))
!     return result
  
! def pnmsize(pnmfile):
!     """Return dimensions of a PNM file."""
!     f = open(pnmfile)
!     line1 = f.readline()
!     line2 = f.readline()
!     w, h = [int(n) for n in line2.split()]
!     return w, h
  
! def NetPBM_decode_parts(parts, decoders):
!     """Decode and assemble a bunch of images using NetPBM tools."""
!     rows = []
!     tokens = Set()
!     for part in parts:
!         decoder = decoders.get(part.get_content_type())
!         if decoder is None:
!             continue
!         try:
!             bytes = part.get_payload(decode=True)
!         except:
!             tokens.add("invalid-image:%s" % part.get_content_type())
!             continue
  
!         if len(bytes) > options["Tokenizer", "max_image_size"]:
!             tokens.add("image:big")
!             continue                # assume it's just a picture for now
  
+         fd, imgfile = tempfile.mkstemp()
+         os.write(fd, bytes)
+         os.close(fd)
+ 
+         fd, pnmfile = tempfile.mkstemp()
+         os.close(fd)
+         os.system("%s <%s >%s 2>dev.null" % (decoder, imgfile, pnmfile))
+         w, h = pnmsize(pnmfile)
+         if not rows:
+             # first image
+             rows.append([pnmfile])
+         elif pnmsize(rows[-1][-1])[1] != h:
+             # new image, different height => start new row
+             rows.append([pnmfile])
+         else:
+             # new image, same height => extend current row
+             rows[-1].append(pnmfile)
+ 
+     for (i, row) in enumerate(rows):
+         if len(row) > 1:
              fd, pnmfile = tempfile.mkstemp()
              os.close(fd)
!             os.system("pnmcat -lr %s > %s 2>/dev/null" %
!                       (" ".join(row), pnmfile))
!             for f in row:
!                 os.unlink(f)
!             rows[i] = pnmfile
!         else:
!             rows[i] = row[0]
  
!     fd, pnmfile = tempfile.mkstemp()
!     os.close(fd)
!     os.system("pnmcat -tb %s > %s 2>/dev/null" % (" ".join(rows), pnmfile))
!     for f in rows:
!         os.unlink(f)
!     return [pnmfile], tokens
  
! def PIL_decode_parts(parts):
!     """Decode and assemble a bunch of images using PIL."""
!     tokens = Set()
!     rows = []
!     for part in parts:
!         try:
!             bytes = part.get_payload(decode=True)
!         except:
!             tokens.add("invalid-image:%s" % part.get_content_type())
!             continue
  
!         if len(bytes) > options["Tokenizer", "max_image_size"]:
!             tokens.add("image:big")
!             continue                # assume it's just a picture for now
  
!         # We're dealing with spammers and virus writers here.  Who knows
!         # what garbage they will call a GIF image to entice you to open
!         # it?
!         try:
!             image = Image.open(StringIO.StringIO(bytes))
!             image.load()
!         except IOError:
!             tokens.add("invalid-image:%s" % part.get_content_type())
!             continue
!         else:
!             image = image.convert("RGB")
  
!         if not rows:
!             # first image
!             rows.append(image)
!         elif image.size[1] != rows[-1].size[1]:
!             # new image, different height => start new row
!             rows.append(image)
!         else:
!             # new image, same height => extend current row
!             rows[-1] = imconcatlr(rows[-1], image)
  
!     if not rows:
!         return [], tokens
  
!     # now concatenate the resulting row images top-to-bottom
!     full_image, rows = rows[0], rows[1:]
!     for image in rows:
!         full_image = imconcattb(full_image, image)
  
!     fd, pnmfile = tempfile.mkstemp()
!     os.close(fd)
!     full_image.save(open(pnmfile, "wb"), "PPM")
  
!     return [pnmfile], tokens
  
! class ImageStripper:
!     def __init__(self, cachefile=""):
!         self.cachefile = os.path.expanduser(cachefile)
!         if os.path.exists(self.cachefile):
!             self.cache = pickle.load(open(self.cachefile))
!         else:
!             self.cache = {}
!         self.misses = self.hits = 0
!         if self.cachefile:
!             atexit.register(self.close)
  
      def extract_ocr_info(self, pnmfiles):
***************
*** 217,228 ****
  
          if Image is not None:
!             pnmfiles = self.PIL_decode_parts(parts)
          else:
!             pnmfiles = self.NetPBM_decode_parts(parts, find_decoders())
  
          if pnmfiles:
!             return self.extract_ocr_info(pnmfiles)
  
!         return "", Set()
  
  
--- 266,280 ----
  
          if Image is not None:
!             pnmfiles, tokens = PIL_decode_parts(parts)
          else:
!             if not find_program("pnmcat"):
!                 return "", Set()
!             pnmfiles, tokens = NetPBM_decode_parts(parts, find_decoders())
  
          if pnmfiles:
!             text, new_tokens = self.extract_ocr_info(pnmfiles)
!             return text, tokens | new_tokens
  
!         return "", tokens
  
  

_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins

Reply via email to