Author: hwright
Date: Wed Aug 4 04:20:09 2010
New Revision: 982120
URL: http://svn.apache.org/viewvc?rev=982120&view=rev
Log:
Add brute-force checking of binary content to Mouse, along with a test for it.
Modified:
labs/mouse/guesser/binary.py
labs/mouse/tests/test_mouse.py
Modified: labs/mouse/guesser/binary.py
URL:
http://svn.apache.org/viewvc/labs/mouse/guesser/binary.py?rev=982120&r1=982119&r2=982120&view=diff
==============================================================================
--- labs/mouse/guesser/binary.py (original)
+++ labs/mouse/guesser/binary.py Wed Aug 4 04:20:09 2010
@@ -46,6 +46,11 @@ _bytecode_exts = [
_binary_exts = _data_exts + _exec_exts + _keystore_exts + _image_exts + \
_bytecode_exts
+_non_ascii_threshold = 256;
+_ascii_char_threshold = 8;
+_high_bytes_ratio = 100;
+_total_read_ratio = 30;
+
def is_binary(item):
'''Entry method, will return True if ITEM is thought to be binary,
@@ -68,3 +73,14 @@ def is_binary(item):
return True
# Time to attempt a brute-force divination
+ high_bytes = 0
+ for c in item.get_content()[0:100]:
+ if ord(c) > _non_ascii_threshold or ord(c) <= _ascii_char_threshold:
+ high_bytes += 1
+
+ if (high_bytes * _high_bytes_ratio) > \
+ (min(100, len(item.get_content())) * _total_read_ratio):
+ return True
+
+ # we've exhausted our options, so this file must not be binary
+ return False
Modified: labs/mouse/tests/test_mouse.py
URL:
http://svn.apache.org/viewvc/labs/mouse/tests/test_mouse.py?rev=982120&r1=982119&r2=982120&view=diff
==============================================================================
--- labs/mouse/tests/test_mouse.py (original)
+++ labs/mouse/tests/test_mouse.py Wed Aug 4 04:20:09 2010
@@ -143,6 +143,16 @@ class TestBinaryGuessing(unittest.TestCa
# have to check the content, anyway
self.assertTrue(guesser.is_binary(sources.Item(name, None)))
+ def test_is_binary_content(self):
+ self.assertTrue(guesser.is_binary(sources.Item('txt',
+ open(os.path.join(data_path, 'rat-tests',
+ 'binaries', 'Image-png.not')))))
+ self.assertFalse(guesser.is_binary(sources.Item('txt',
+ open(os.path.join(data_path, 'rat-tests',
+ 'elements', 'Source.java')))))
+ self.assertFalse(guesser.is_binary(sources.Item('txt',
+ open(os.path.join(data_path, 'rat-tests',
+ 'elements', 'NOTICE')))))
class TestFilters(unittest.TestCase):
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]