Author: Antonio Cuni <anto.c...@gmail.com> Branch: Changeset: r45817:69a7e76a319a Date: 2011-07-21 15:10 +0200 http://bitbucket.org/pypy/pypy/changeset/69a7e76a319a/
Log: merge the identity-dict-strategy branch, which optimizes access to dictionary containing only classes which compare 'by identity' diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py --- a/pypy/config/pypyoption.py +++ b/pypy/config/pypyoption.py @@ -327,6 +327,9 @@ BoolOption("mutable_builtintypes", "Allow the changing of builtin types", default=False, requires=[("objspace.std.builtinshortcut", True)]), + BoolOption("withidentitydict", + "track types that override __hash__, __eq__ or __cmp__ and use a special dict strategy for those which do not", + default=True), ]), ]) diff --git a/pypy/doc/config/objspace.std.withidentitydict.txt b/pypy/doc/config/objspace.std.withidentitydict.txt new file mode 100644 --- /dev/null +++ b/pypy/doc/config/objspace.std.withidentitydict.txt @@ -0,0 +1,21 @@ +============================= +objspace.std.withidentitydict +============================= + +* **name:** withidentitydict + +* **description:** enable a dictionary strategy for "by identity" comparisons + +* **command-line:** --objspace-std-withidentitydict + +* **command-line for negation:** --no-objspace-std-withidentitydict + +* **option type:** boolean option + +* **default:** True + + +Enable a dictionary strategy specialized for instances of classes which +compares "by identity", which is the default unless you override ``__hash__``, +``__eq__`` or ``__cmp__``. This strategy will be used only with new-style +classes. diff --git a/pypy/doc/cpython_differences.rst b/pypy/doc/cpython_differences.rst --- a/pypy/doc/cpython_differences.rst +++ b/pypy/doc/cpython_differences.rst @@ -211,6 +211,38 @@ >>>> print d1['a'] 42 +Mutating classes of objects which are already used as dictionary keys +--------------------------------------------------------------------- + +Consider the following snippet of code:: + + class X(object): + pass + + def __evil_eq__(self, other): + print 'hello world' + return False + + def evil(y): + d = {x(): 1} + X.__eq__ = __evil_eq__ + d[y] # might trigger a call to __eq__? + +In CPython, __evil_eq__ **might** be called, although there is no way to write +a test which reliably calls it. It happens if ``y is not x`` and ``hash(y) == +hash(x)``, where ``hash(x)`` is computed when ``x`` is inserted into the +dictionary. If **by chance** the condition is satisfied, then ``__evil_eq__`` +is called. + +PyPy uses a special strategy to optimize dictionaries whose keys are instances +of user-defined classes which do not override the default ``__hash__``, +``__eq__`` and ``__cmp__``: when using this strategy, ``__eq__`` and +``__cmp__`` are never called, but instead the lookup is done by identity, so +in the case above it is guaranteed that ``__eq__`` won't be called. + +Note that in all other cases (e.g., if you have a custom ``__hash__`` and +``__eq__`` in ``y``) the behavior is exactly the same as CPython. + Ignored exceptions ----------------------- diff --git a/pypy/module/pypyjit/test_pypy_c/test_containers.py b/pypy/module/pypyjit/test_pypy_c/test_containers.py --- a/pypy/module/pypyjit/test_pypy_c/test_containers.py +++ b/pypy/module/pypyjit/test_pypy_c/test_containers.py @@ -23,3 +23,29 @@ ops = loop.ops_by_id('look') assert log.opnames(ops) == ['setfield_gc', 'guard_not_invalidated'] + + def test_identitydict(self): + def fn(n): + class X(object): + pass + x = X() + d = {} + d[x] = 1 + res = 0 + for i in range(300): + value = d[x] # ID: getitem + res += value + return res + # + log = self.run(fn, [1000]) + assert log.result == 300 + loop, = log.loops_by_filename(self.filepath) + # check that the call to ll_dict_lookup is not a call_may_force + assert loop.match_by_id("getitem", """ + i25 = call(ConstClass(_ll_1_gc_identityhash__objectPtr), p6, descr=...) + ... + i28 = call(ConstClass(ll_dict_lookup__dicttablePtr_objectPtr_Signed), p18, p6, i25, descr=...) + ... + p33 = call(ConstClass(ll_get_value__dicttablePtr_Signed), p18, i28, descr=...) + ... + """) diff --git a/pypy/objspace/descroperation.py b/pypy/objspace/descroperation.py --- a/pypy/objspace/descroperation.py +++ b/pypy/objspace/descroperation.py @@ -28,6 +28,13 @@ return w_delattr object_delattr._annspecialcase_ = 'specialize:memo' +def object_hash(space): + "Utility that returns the app-level descriptor object.__hash__." + w_src, w_hash = space.lookup_in_type_where(space.w_object, + '__hash__') + return w_hash +object_hash._annspecialcase_ = 'specialize:memo' + def raiseattrerror(space, w_obj, name, w_descr=None): w_type = space.type(w_obj) typename = w_type.getname(space) diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py --- a/pypy/objspace/std/dictmultiobject.py +++ b/pypy/objspace/std/dictmultiobject.py @@ -157,11 +157,15 @@ return self.erase(None) def switch_to_correct_strategy(self, w_dict, w_key): - #XXX implement other strategies later + withidentitydict = self.space.config.objspace.std.withidentitydict if type(w_key) is self.space.StringObjectCls: self.switch_to_string_strategy(w_dict) - elif self.space.is_w(self.space.type(w_key), self.space.w_int): + return + w_type = self.space.type(w_key) + if self.space.is_w(w_type, self.space.w_int): self.switch_to_int_strategy(w_dict) + elif withidentitydict and w_type.compares_by_identity(): + self.switch_to_identity_strategy(w_dict) else: self.switch_to_object_strategy(w_dict) @@ -177,6 +181,13 @@ w_dict.strategy = strategy w_dict.dstorage = storage + def switch_to_identity_strategy(self, w_dict): + from pypy.objspace.std.identitydict import IdentityDictStrategy + strategy = self.space.fromcache(IdentityDictStrategy) + storage = strategy.get_empty_storage() + w_dict.strategy = strategy + w_dict.dstorage = storage + def switch_to_object_strategy(self, w_dict): strategy = self.space.fromcache(ObjectDictStrategy) storage = strategy.get_empty_storage() @@ -338,7 +349,6 @@ def getitem(self, w_dict, w_key): space = self.space - if self.is_correct_type(w_key): return self.unerase(w_dict.dstorage).get(self.unwrap(w_key), None) elif self._never_equal_to(space.type(w_key)): @@ -404,6 +414,7 @@ def keys(self, w_dict): return self.unerase(w_dict.dstorage).keys() + class StringDictStrategy(AbstractTypedStrategy, DictStrategy): erase, unerase = rerased.new_erasing_pair("string") @@ -448,7 +459,9 @@ return StrIteratorImplementation(self.space, self, w_dict) -class StrIteratorImplementation(IteratorImplementation): +class _WrappedIteratorMixin(object): + _mixin_ = True + def __init__(self, space, strategy, dictimplementation): IteratorImplementation.__init__(self, space, dictimplementation) self.iterator = strategy.unerase(dictimplementation.dstorage).iteritems() @@ -460,6 +473,23 @@ else: return None, None +class _UnwrappedIteratorMixin: + _mixin_ = True + + def __init__(self, space, strategy, dictimplementation): + IteratorImplementation.__init__(self, space, dictimplementation) + self.iterator = strategy.unerase(dictimplementation.dstorage).iteritems() + + def next_entry(self): + # note that this 'for' loop only runs once, at most + for w_key, w_value in self.iterator: + return w_key, w_value + else: + return None, None + + +class StrIteratorImplementation(_WrappedIteratorMixin, IteratorImplementation): + pass class IntDictStrategy(AbstractTypedStrategy, DictStrategy): erase, unerase = rerased.new_erasing_pair("int") @@ -490,31 +520,11 @@ def iter(self, w_dict): return IntIteratorImplementation(self.space, self, w_dict) -class IntIteratorImplementation(IteratorImplementation): - def __init__(self, space, strategy, dictimplementation): - IteratorImplementation.__init__(self, space, dictimplementation) - self.iterator = strategy.unerase(dictimplementation.dstorage).iteritems() +class IntIteratorImplementation(_WrappedIteratorMixin, IteratorImplementation): + pass - def next_entry(self): - # note that this 'for' loop only runs once, at most - for key, w_value in self.iterator: - return self.space.wrap(key), w_value - else: - return None, None - - -class ObjectIteratorImplementation(IteratorImplementation): - def __init__(self, space, strategy, dictimplementation): - IteratorImplementation.__init__(self, space, dictimplementation) - self.iterator = strategy.unerase(dictimplementation.dstorage).iteritems() - - def next_entry(self): - # note that this 'for' loop only runs once, at most - for w_key, w_value in self.iterator: - return w_key, w_value - else: - return None, None - +class ObjectIteratorImplementation(_UnwrappedIteratorMixin, IteratorImplementation): + pass init_signature = Signature(['seq_or_map'], None, 'kwargs') init_defaults = [None] diff --git a/pypy/objspace/std/dictproxyobject.py b/pypy/objspace/std/dictproxyobject.py --- a/pypy/objspace/std/dictproxyobject.py +++ b/pypy/objspace/std/dictproxyobject.py @@ -86,7 +86,7 @@ def clear(self, w_dict): self.unerase(w_dict.dstorage).dict_w.clear() - self.unerase(w_dict.dstorage).mutated() + self.unerase(w_dict.dstorage).mutated(None) class DictProxyIteratorImplementation(IteratorImplementation): def __init__(self, space, strategy, dictimplementation): diff --git a/pypy/objspace/std/identitydict.py b/pypy/objspace/std/identitydict.py new file mode 100644 --- /dev/null +++ b/pypy/objspace/std/identitydict.py @@ -0,0 +1,85 @@ +## ---------------------------------------------------------------------------- +## dict strategy (see dict_multiobject.py) + +from pypy.rlib import rerased +from pypy.objspace.std.dictmultiobject import (AbstractTypedStrategy, + DictStrategy, + IteratorImplementation, + _UnwrappedIteratorMixin) + + +# this strategy is selected by EmptyDictStrategy.switch_to_correct_strategy +class IdentityDictStrategy(AbstractTypedStrategy, DictStrategy): + """ + Strategy for custom instances which compares by identity (i.e., the + default unless you override __hash__, __eq__ or __cmp__). The storage is + just a normal RPython dict, which has already the correct by-identity + semantics. + + Note that at a first sight, you might have problems if you mutate the + class of an object which is already inside an identitydict. Consider this + example:: + + class X(object): + pass + d = {x(): 1} + X.__eq__ = ... + d[y] # might trigger a call to __eq__? + + We want to be sure that x.__eq__ is called in the same cases as in + CPython. However, as long as the strategy is IdentityDictStrategy, the + __eq__ will never be called. + + It turns out that it's not a problem. In CPython (and in PyPy without + this strategy), the __eq__ is called if ``hash(y) == hash(x)`` and ``x is + not y``. Note that hash(x) is computed at the time when we insert x in + the dict, not at the time we lookup y. + + Now, how can hash(y) == hash(x)? There are two possibilities: + + 1. we write a custom __hash__ for the class of y, thus making it a not + "compares by reference" type + + 2. the class of y is "compares by reference" type, and by chance the + hash is the same as x + + In the first case, the getitem immediately notice that y is not of the + right type, and switches the strategy to ObjectDictStrategy, then the + lookup works as usual. + + The second case is completely non-deterministic, even in CPython. + Depending on the phase of the moon, you might call the __eq__ or not, so + it is perfectly fine to *never* call it. Morever, in practice with the + minimar GC we never have two live objects with the same hash, so it would + never happen anyway. + """ + + erase, unerase = rerased.new_erasing_pair("identitydict") + erase = staticmethod(erase) + unerase = staticmethod(unerase) + + def wrap(self, unwrapped): + return unwrapped + + def unwrap(self, wrapped): + return wrapped + + def get_empty_storage(self): + return self.erase({}) + + def is_correct_type(self, w_obj): + w_type = self.space.type(w_obj) + return w_type.compares_by_identity() + + def _never_equal_to(self, w_lookup_type): + return False + + def iter(self, w_dict): + return IdentityDictIteratorImplementation(self.space, self, w_dict) + + def keys(self, w_dict): + return self.unerase(w_dict.dstorage).keys() + + +class IdentityDictIteratorImplementation(_UnwrappedIteratorMixin, IteratorImplementation): + pass diff --git a/pypy/objspace/std/objecttype.py b/pypy/objspace/std/objecttype.py --- a/pypy/objspace/std/objecttype.py +++ b/pypy/objspace/std/objecttype.py @@ -6,7 +6,7 @@ from pypy.objspace.descroperation import Object from pypy.objspace.std.stdtypedef import StdTypeDef from pypy.objspace.std.register_all import register_all - +from pypy.objspace.std import identitydict def descr__repr__(space, w_obj): w = space.wrap diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -39,7 +39,6 @@ from pypy.objspace.std.stringtype import wrapstr from pypy.objspace.std.unicodetype import wrapunicode - class StdObjSpace(ObjSpace, DescrOperation): """The standard object space, implementing a general-purpose object library in Restricted Python.""" diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py --- a/pypy/objspace/std/test/test_dictmultiobject.py +++ b/pypy/objspace/std/test/test_dictmultiobject.py @@ -898,6 +898,7 @@ withsmalldicts = False withcelldict = False withmethodcache = False + withidentitydict = False FakeSpace.config = Config() @@ -1105,3 +1106,4 @@ fakespace = FakeSpace() d = fakespace.newdict(module=True) assert type(d.strategy) is StringDictStrategy + diff --git a/pypy/objspace/std/test/test_identitydict.py b/pypy/objspace/std/test/test_identitydict.py new file mode 100644 --- /dev/null +++ b/pypy/objspace/std/test/test_identitydict.py @@ -0,0 +1,135 @@ +from pypy.interpreter.gateway import interp2app +from pypy.conftest import gettestobjspace +from pypy.conftest import option + +class AppTestComparesByIdentity: + + def setup_class(cls): + from pypy.objspace.std import identitydict + cls.space = gettestobjspace( + **{"objspace.std.withidentitydict": True}) + + def compares_by_identity(space, w_cls): + return space.wrap(w_cls.compares_by_identity()) + cls.w_compares_by_identity = cls.space.wrap(interp2app(compares_by_identity)) + + def test_compares_by_identity(self): + class Plain(object): + pass + + class CustomEq(object): + def __eq__(self, other): + return True + + class CustomCmp (object): + def __cmp__(self, other): + return 0 + + class CustomHash(object): + def __hash__(self): + return 0 + + assert self.compares_by_identity(Plain) + assert not self.compares_by_identity(CustomEq) + assert not self.compares_by_identity(CustomCmp) + assert not self.compares_by_identity(CustomHash) + + def test_modify_class(self): + class X(object): + pass + + assert self.compares_by_identity(X) + X.__eq__ = lambda x: None + assert not self.compares_by_identity(X) + del X.__eq__ + assert self.compares_by_identity(X) + + +class AppTestIdentityDict(object): + def setup_class(cls): + cls.space = gettestobjspace(**{"objspace.std.withidentitydict": True}) + if option.runappdirect: + py.test.skip("__repr__ doesn't work on appdirect") + + def w_uses_identity_strategy(self, obj): + import __pypy__ + return "IdentityDictStrategy" in __pypy__.internal_repr(obj) + + def test_use_strategy(self): + class X(object): + pass + d = {} + x = X() + d[x] = 1 + assert self.uses_identity_strategy(d) + assert d[x] == 1 + + def test_bad_item(self): + class X(object): + pass + class Y(object): + def __hash__(self): + return 32 + + d = {} + x = X() + y = Y() + d[x] = 1 + assert self.uses_identity_strategy(d) + d[y] = 2 + assert not self.uses_identity_strategy(d) + assert d[x] == 1 + assert d[y] == 2 + + def test_bad_key(self): + class X(object): + pass + d = {} + x = X() + + class Y(object): + def __hash__(self): + return hash(x) # to make sure we do x == y + + def __eq__(self, other): + return True + + y = Y() + d[x] = 1 + assert self.uses_identity_strategy(d) + assert d[y] == 1 + assert not self.uses_identity_strategy(d) + + def test_iter(self): + class X(object): + pass + x = X() + d = {x: 1} + assert self.uses_identity_strategy(d) + assert list(iter(d)) == [x] + + def test_mutate_class_and_then_compare(self): + class X(object): + pass + class Y(object): + pass + + x = X() + y = Y() + d1 = {x: 1} + d2 = {y: 1} + assert self.uses_identity_strategy(d1) + assert self.uses_identity_strategy(d2) + # + X.__hash__ = lambda self: hash(y) + X.__eq__ = lambda self, other: True + # + assert d1 == d2 + assert self.uses_identity_strategy(d1) + assert not self.uses_identity_strategy(d2) + + def test_old_style_classes(self): + class X: + pass + d = {X(): 1} + assert not self.uses_identity_strategy(d) diff --git a/pypy/objspace/std/typeobject.py b/pypy/objspace/std/typeobject.py --- a/pypy/objspace/std/typeobject.py +++ b/pypy/objspace/std/typeobject.py @@ -7,6 +7,7 @@ from pypy.interpreter.baseobjspace import W_Root from pypy.objspace.std.stdtypedef import std_dict_descr, issubtypedef, Member from pypy.objspace.std.objecttype import object_typedef +from pypy.objspace.std import identitydict from pypy.rlib.objectmodel import we_are_translated from pypy.rlib.objectmodel import current_object_addr_as_int, compute_hash from pypy.rlib.jit import promote, elidable_promote, we_are_jitted @@ -76,6 +77,10 @@ for i in range(len(self.lookup_where)): self.lookup_where[i] = None_None +# possible values of compares_by_identity_status +UNKNOWN = 0 +COMPARES_BY_IDENTITY = 1 +OVERRIDES_EQ_CMP_OR_HASH = 2 class W_TypeObject(W_Object): from pypy.objspace.std.typetype import type_typedef as typedef @@ -102,6 +107,9 @@ # (False is a conservative default, fixed during real usage) uses_object_getattribute = False + # for config.objspace.std.withidentitydict + compares_by_identity_status = UNKNOWN + # used to cache the type __new__ function if it comes from a builtin type # != 'type', in that case call__Type will also assumes the result # of the __new__ is an instance of the type @@ -146,11 +154,17 @@ else: w_self.terminator = NoDictTerminator(space, w_self) - def mutated(w_self): + def mutated(w_self, key): + """ + The type is being mutated. key is either the string containing the + specific attribute which is being deleted/set or None to indicate a + generic mutation. + """ space = w_self.space assert w_self.is_heaptype() or space.config.objspace.std.mutable_builtintypes if (not space.config.objspace.std.withtypeversion and not space.config.objspace.std.getattributeshortcut and + not space.config.objspace.std.withidentitydict and not space.config.objspace.std.newshortcut): return @@ -158,6 +172,13 @@ w_self.uses_object_getattribute = False # ^^^ conservative default, fixed during real usage + if space.config.objspace.std.withidentitydict: + did_compare_by_identity = ( + w_self.compares_by_identity_status == COMPARES_BY_IDENTITY) + if (key is None or key == '__eq__' or + key == '__cmp__' or key == '__hash__'): + w_self.compares_by_identity_status = UNKNOWN + if space.config.objspace.std.newshortcut: w_self.w_bltin_new = None @@ -168,7 +189,7 @@ subclasses_w = w_self.get_subclasses() for w_subclass in subclasses_w: assert isinstance(w_subclass, W_TypeObject) - w_subclass.mutated() + w_subclass.mutated(key) def version_tag(w_self): if (not we_are_jitted() or w_self.is_heaptype() or @@ -207,6 +228,25 @@ def has_object_getattribute(w_self): return w_self.getattribute_if_not_from_object() is None + def compares_by_identity(w_self): + from pypy.objspace.descroperation import object_hash + if not w_self.space.config.objspace.std.withidentitydict: + return False # conservative + # + if w_self.compares_by_identity_status != UNKNOWN: + # fast path + return w_self.compares_by_identity_status == COMPARES_BY_IDENTITY + # + default_hash = object_hash(w_self.space) + overrides_eq_cmp_or_hash = (w_self.lookup('__eq__') or + w_self.lookup('__cmp__') or + w_self.lookup('__hash__') is not default_hash) + if overrides_eq_cmp_or_hash: + w_self.compares_by_identity_status = OVERRIDES_EQ_CMP_OR_HASH + else: + w_self.compares_by_identity_status = COMPARES_BY_IDENTITY + return w_self.compares_by_identity_status == COMPARES_BY_IDENTITY + def ready(w_self): for w_base in w_self.bases_w: if not isinstance(w_base, W_TypeObject): @@ -269,7 +309,7 @@ w_curr.w_value = w_value return True w_value = TypeCell(w_value) - w_self.mutated() + w_self.mutated(name) w_self.dict_w[name] = w_value return True @@ -286,7 +326,7 @@ except KeyError: return False else: - w_self.mutated() + w_self.mutated(key) return True def lookup(w_self, name): diff --git a/pypy/objspace/std/typetype.py b/pypy/objspace/std/typetype.py --- a/pypy/objspace/std/typetype.py +++ b/pypy/objspace/std/typetype.py @@ -141,7 +141,7 @@ w_oldbestbase.getname(space)) # invalidate the version_tag of all the current subclasses - w_type.mutated() + w_type.mutated(None) # now we can go ahead and change 'w_type.bases_w' saved_bases_w = w_type.bases_w _______________________________________________ pypy-commit mailing list pypy-commit@python.org http://mail.python.org/mailman/listinfo/pypy-commit