Author: Carl Friedrich Bolz-Tereick <[email protected]>
Branch: unicode-utf8
Changeset: r95694:b6331207f8b9
Date: 2019-01-21 13:11 +0100
http://bitbucket.org/pypy/pypy/changeset/b6331207f8b9/

Log:    Change UnicodeDictStrategy to store wrapped unicode objects as keys.
        This lifts the restriction that they are ASCII-only. The faster hash
        and eq dispatching should still be a big win compared to going
        through the space, despite the keys being wrapped.

diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -639,7 +639,7 @@
         if type(w_key) is self.space.StringObjectCls:
             self.switch_to_bytes_strategy(w_dict)
             return
-        elif type(w_key) is self.space.UnicodeObjectCls and w_key.is_ascii():
+        elif type(w_key) is self.space.UnicodeObjectCls:
             self.switch_to_unicode_strategy(w_dict)
             return
         w_type = self.space.type(w_key)
@@ -1193,6 +1193,11 @@
 
 create_iterator_classes(BytesDictStrategy)
 
+def unicode_eq(w_uni1, w_uni2):
+    return w_uni1.eq_w(w_uni2)
+
+def unicode_hash(w_uni):
+    return w_uni.hash_w()
 
 class UnicodeDictStrategy(AbstractTypedStrategy, DictStrategy):
     erase, unerase = rerased.new_erasing_pair("unicode")
@@ -1200,18 +1205,18 @@
     unerase = staticmethod(unerase)
 
     def wrap(self, unwrapped):
-        return self.space.newutf8(unwrapped, len(unwrapped))
+        return unwrapped
 
     def unwrap(self, wrapped):
-        return self.space.utf8_w(wrapped)
+        return wrapped
 
     def is_correct_type(self, w_obj):
         space = self.space
-        return type(w_obj) is space.UnicodeObjectCls and w_obj.is_ascii()
+        return type(w_obj) is space.UnicodeObjectCls
 
     def get_empty_storage(self):
-        res = {}
-        mark_dict_non_null(res)
+        res = r_dict(unicode_eq, unicode_hash,
+                     force_non_null=True)
         return self.erase(res)
 
     def _never_equal_to(self, w_lookup_type):
@@ -1235,14 +1240,14 @@
     ##     assert key is not None
     ##     return self.unerase(w_dict.dstorage).get(key, None)
 
-    def listview_utf8(self, w_dict):
-        return self.unerase(w_dict.dstorage).keys()
+    ## def listview_utf8(self, w_dict):
+    ##     return self.unerase(w_dict.dstorage).keys()
 
     ## def w_keys(self, w_dict):
     ##     return self.space.newlist_bytes(self.listview_bytes(w_dict))
 
     def wrapkey(space, key):
-        return space.newutf8(key, len(key))
+        return key
 
     ## @jit.look_inside_iff(lambda self, w_dict:
     ##                      w_dict_unrolling_heuristic(w_dict))
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
 import sys
 import py
 
@@ -141,6 +142,7 @@
         w_d.initialize_content([(wb("a"), w(1)), (wb("b"), w(2))])
         assert self.space.listview_bytes(w_d) == ["a", "b"]
 
+    @py.test.mark.skip("possible re-enable later?")
     def test_listview_unicode_dict(self):
         w = self.space.wrap
         w_d = self.space.newdict()
@@ -1151,8 +1153,11 @@
         assert d.keys() == [u"a"]
         assert type(d.keys()[0]) is unicode
 
+        d = {}
+        d[u"ä"] = 1
+        assert "UnicodeDictStrategy" in self.get_strategy(d)
+
     def test_empty_to_int(self):
-        import sys
         d = {}
         d[1] = "hi"
         assert "IntDictStrategy" in self.get_strategy(d)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -224,10 +224,19 @@
     def descr_str(self, space):
         return encode_object(space, self, 'ascii', 'strict')
 
-    def descr_hash(self, space):
+    def hash_w(self):
+        # shortcut for UnicodeDictStrategy
         x = compute_hash(self._utf8)
         x -= (x == -1) # convert -1 to -2 without creating a bridge
-        return space.newint(x)
+        return x
+
+    def descr_hash(self, space):
+        return space.newint(self.hash_w())
+
+    def eq_w(self, w_other):
+        # shortcut for UnicodeDictStrategy
+        assert isinstance(w_other, W_UnicodeObject)
+        return self._utf8 == w_other._utf8
 
     def descr_eq(self, space, w_other):
         try:
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to