Author: hwright
Date: Wed Aug  4 04:20:09 2010
New Revision: 982120

URL: http://svn.apache.org/viewvc?rev=982120&view=rev
Log:
Add brute-force checking of binary content to Mouse, along with a test for the same.

Modified:
    labs/mouse/guesser/binary.py
    labs/mouse/tests/test_mouse.py

Modified: labs/mouse/guesser/binary.py
URL: 
http://svn.apache.org/viewvc/labs/mouse/guesser/binary.py?rev=982120&r1=982119&r2=982120&view=diff
==============================================================================
--- labs/mouse/guesser/binary.py (original)
+++ labs/mouse/guesser/binary.py Wed Aug  4 04:20:09 2010
@@ -46,6 +46,11 @@ _bytecode_exts = [
 _binary_exts = _data_exts + _exec_exts + _keystore_exts + _image_exts + \
                _bytecode_exts
 
+_non_ascii_threshold = 256;
+_ascii_char_threshold = 8;
+_high_bytes_ratio = 100;
+_total_read_ratio = 30;
+
 
 def is_binary(item):
   '''Entry method, will return True if ITEM is thought to be binary,
@@ -68,3 +73,14 @@ def is_binary(item):
       return True
 
   # Time to attempt a brute-force divination
+  high_bytes = 0
+  for c in item.get_content()[0:100]:
+    if ord(c) > _non_ascii_threshold or ord(c) <= _ascii_char_threshold:
+      high_bytes += 1
+
+  if (high_bytes * _high_bytes_ratio) > \
+            (min(100, len(item.get_content())) * _total_read_ratio):
+    return True
+
+  # we've exhausted our options, so this file must not be binary
+  return False

Modified: labs/mouse/tests/test_mouse.py
URL: 
http://svn.apache.org/viewvc/labs/mouse/tests/test_mouse.py?rev=982120&r1=982119&r2=982120&view=diff
==============================================================================
--- labs/mouse/tests/test_mouse.py (original)
+++ labs/mouse/tests/test_mouse.py Wed Aug  4 04:20:09 2010
@@ -143,6 +143,16 @@ class TestBinaryGuessing(unittest.TestCa
       # have to check the content, anyway
       self.assertTrue(guesser.is_binary(sources.Item(name, None)))
 
+  def test_is_binary_content(self):
+    self.assertTrue(guesser.is_binary(sources.Item('txt',
+                    open(os.path.join(data_path, 'rat-tests',
+                                      'binaries', 'Image-png.not')))))
+    self.assertFalse(guesser.is_binary(sources.Item('txt',
+                     open(os.path.join(data_path, 'rat-tests',
+                                       'elements', 'Source.java')))))
+    self.assertFalse(guesser.is_binary(sources.Item('txt',
+                     open(os.path.join(data_path, 'rat-tests',
+                                       'elements', 'NOTICE')))))
 
 
 class TestFilters(unittest.TestCase):



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to