At http://bazaar.launchpad.net/~jameinel/meliae/non-json-loader

------------------------------------------------------------
revno: 154
revision-id: [email protected]
parent: [email protected]
committer: John Arbash Meinel <[email protected]>
branch nick: non-json-loader
timestamp: Mon 2010-07-12 17:28:03 -0500
message:
  Fix up the regex based parser. It does turn out to be slower (105ms vs 85ms).
  
  I'm curious if the conditionals kill the performance.
=== modified file 'meliae/loader.py'
--- a/meliae/loader.py  2010-07-12 21:53:49 +0000
+++ b/meliae/loader.py  2010-07-12 22:28:03 +0000
@@ -50,7 +50,7 @@
     r', "size": (?P<size>\d+)'
     r'(, "name": "(?P<name>.*)")?'
     r'(, "len": (?P<len>\d+))?'
-    r'(, "value": "?(?P<value>[^"]*)"?)?'
+    r'(, "value": "?(?P<value>.*)"?)?'
     r', "refs": \[(?P<refs>[^]]*)\]'
     r'\}')
 

=== modified file 'meliae/scanner.py'
--- a/meliae/scanner.py 2010-07-12 20:02:44 +0000
+++ b/meliae/scanner.py 2010-07-12 22:28:03 +0000
@@ -36,6 +36,13 @@
     else:
         pending = [obj]
     last_offset = len(pending) - 1
+    # TODO: Instead of using an IDSet, we could use a BloomFilter. It would
+    #       mean some objects may not get dumped (a bloom filter can say
+    #       "definitely not present", but only "might already be present";
+    #       collisions cause false positives).
+    #       However, you can get by with 8-10 bits per object for a 1% FPR,
+    #       rather than using 32/64-bit pointers + overhead for avoiding hash
+    #       collisions. So on 64-bit we drop from 16 bytes/object to about 1.
     seen = _intset.IDSet()
     if is_pending:
         seen.add(id(pending))

=== modified file 'meliae/tests/test_loader.py'
--- a/meliae/tests/test_loader.py       2010-07-12 21:53:49 +0000
+++ b/meliae/tests/test_loader.py       2010-07-12 22:28:03 +0000
@@ -122,9 +122,12 @@
                 ', "refs": []}',
             '{"address": 2345, "type": "module", "size": 60, "name": "mymod"'
                 ', "refs": [1234]}',
+            '{"address": 4567, "type": "str", "size": 150, "len": 126'
+                ', "value": "Test \\\'whoami\\\'\\u000a\\"Your name\\"'
+                ', "refs": []}'
             ], using_json=False, show_prog=False).objs
         keys = sorted(objs.keys())
-        self.assertEqual([1234, 2345], keys)
+        self.assertEqual([1234, 2345, 4567], keys)
         obj = objs[1234]
         self.assertTrue(isinstance(obj, _loader._MemObjectProxy))
         # The address should be exactly the same python object as the key in
@@ -134,6 +137,11 @@
         obj = objs[2345]
         self.assertEqual("module", obj.type_str)
         self.assertEqual("mymod", obj.value)
+        obj = objs[4567]
+        # Known failure? We don't unescape properly. Also, I'm surprised this
+        # works: " should end the r"..." literal, but \" does not end it, and
+        # the '\' is still left verbatim because it is a raw string...
+        self.assertEqual(r"Test \'whoami\'\u000a\"Your name\"", obj.value)
 
     def test_load_example(self):
         objs = loader.load(_example_dump, show_prog=False)

-- 
bazaar-commits mailing list
[email protected]
https://lists.ubuntu.com/mailman/listinfo/bazaar-commits

Reply via email to