At http://bazaar.launchpad.net/~jameinel/meliae/non-json-loader
------------------------------------------------------------
revno: 154
revision-id: [email protected]
parent: [email protected]
committer: John Arbash Meinel <[email protected]>
branch nick: non-json-loader
timestamp: Mon 2010-07-12 17:28:03 -0500
message:
Fix up the regex-based parser. It does turn out to be slower (105ms vs 85ms).
I'm curious whether the conditionals kill the performance.
=== modified file 'meliae/loader.py'
--- a/meliae/loader.py 2010-07-12 21:53:49 +0000
+++ b/meliae/loader.py 2010-07-12 22:28:03 +0000
@@ -50,7 +50,7 @@
r', "size": (?P<size>\d+)'
r'(, "name": "(?P<name>.*)")?'
r'(, "len": (?P<len>\d+))?'
- r'(, "value": "?(?P<value>[^"]*)"?)?'
+ r'(, "value": "?(?P<value>.*)"?)?'
r', "refs": \[(?P<refs>[^]]*)\]'
r'\}')
=== modified file 'meliae/scanner.py'
--- a/meliae/scanner.py 2010-07-12 20:02:44 +0000
+++ b/meliae/scanner.py 2010-07-12 22:28:03 +0000
@@ -36,6 +36,13 @@
else:
pending = [obj]
last_offset = len(pending) - 1
+ # TODO: Instead of using an IDSet, we could use a BloomFilter. It would
+ # mean some objects may not get dumped (a Bloom filter can say "you
+ # are definitely not present", but only "you might already be
+ # present"; collisions cause false positives).
+ # However, you can get by with 8-10 bits per object for a 1% FPR,
+ # rather than using 32/64-bit pointers + overhead for avoiding hash
+ # collisions. So on 64-bit we drop from 16 bytes/object to about 1...
seen = _intset.IDSet()
if is_pending:
seen.add(id(pending))
=== modified file 'meliae/tests/test_loader.py'
--- a/meliae/tests/test_loader.py 2010-07-12 21:53:49 +0000
+++ b/meliae/tests/test_loader.py 2010-07-12 22:28:03 +0000
@@ -122,9 +122,12 @@
', "refs": []}',
'{"address": 2345, "type": "module", "size": 60, "name": "mymod"'
', "refs": [1234]}',
+ '{"address": 4567, "type": "str", "size": 150, "len": 126'
+ ', "value": "Test \\\'whoami\\\'\\u000a\\"Your name\\"'
+ ', "refs": []}'
], using_json=False, show_prog=False).objs
keys = sorted(objs.keys())
- self.assertEqual([1234, 2345], keys)
+ self.assertEqual([1234, 2345, 4567], keys)
obj = objs[1234]
self.assertTrue(isinstance(obj, _loader._MemObjectProxy))
# The address should be exactly the same python object as the key in
@@ -134,6 +137,11 @@
obj = objs[2345]
self.assertEqual("module", obj.type_str)
self.assertEqual("mymod", obj.value)
+ obj = objs[4567]
+ # Known failure? We don't unescape properly. Also, I'm surprised this
+ # works: " should end the " string, but \" seems to leave it open. But
+ # the '\' is also left verbatim because it is a raw string...
+ self.assertEqual(r"Test \'whoami\'\u000a\"Your name\"", obj.value)
def test_load_example(self):
objs = loader.load(_example_dump, show_prog=False)
--
bazaar-commits mailing list
[email protected]
https://lists.ubuntu.com/mailman/listinfo/bazaar-commits