Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de>
Branch: py3.6
Changeset: r97482:9d30b102e8b1
Date: 2019-09-15 14:20 +0200
http://bitbucket.org/pypy/pypy/changeset/9d30b102e8b1/

Log:    merge default

diff --git a/pypy/module/pypyjit/test_pypy_c/test_string.py b/pypy/module/pypyjit/test_pypy_c/test_string.py
--- a/pypy/module/pypyjit/test_pypy_c/test_string.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_string.py
@@ -277,3 +277,72 @@
         jump(..., descr=...)
         """)
         # XXX remove the guard_nonnull above?
+
+    def test_unicode_indexing_makes_no_bridges(self):
+        log = self.run("""
+        u = u"aaaaaä👩‍👩‍👧‍👦" * 1000
+        def main():
+            for j in range(10):
+                for i in range(len(u)):
+                    u[i] # ID: index0
+        """, [])
+        ops = log.loops[0].ops_by_id("index0")
+        for op in ops:
+            assert op.bridge is None
+
+    def test_unicode_indexing_small_constant_indices(self):
+        log = self.run("""
+        l = [u"abä", u"cdä", u"äü", u"éé", u"–—¿"] * 1000
+        def main(n):
+            global s
+            for u in l:
+                s = u[0] + u[1] + u[-1] # ID: index
+                len(u)
+            return len(s)
+        """, [1000])
+        loop, = log.loops_by_filename(self.filepath)
+        assert loop.match_by_id('index', '''
+            i77 = getfield_gc_i(p73, descr=<FieldS pypy.objspace.std.unicodeobject.W_UnicodeObject.inst__length .*>)
+            p78 = getfield_gc_r(p73, descr=<FieldP pypy.objspace.std.unicodeobject.W_UnicodeObject.inst__utf8 .* pure>)
+            i79 = strlen(p78)
+            i80 = int_eq(i77, i79)
+            guard_false(i80, descr=...) # check not ascii
+            i82 = int_ge(0, i77)
+            guard_false(i82, descr=...)
+            i85 = call_i(ConstClass(next_codepoint_pos_dont_look_inside), p78, 0, descr=...)
+            i86 = int_gt(i85, i79)
+            guard_false(i86, descr=...)
+            i88 = int_ge(1, i77)
+            guard_false(i88, descr=...)
+            i90 = call_i(ConstClass(next_codepoint_pos_dont_look_inside), p78, i85, descr=...)
+            i91 = int_gt(i90, i79)
+            guard_false(i91, descr=...)
+            i92 = int_sub(i90, i85)
+            i94 = int_add(-1, i77)
+            i96 = call_i(ConstClass(prev_codepoint_pos_dont_look_inside), p78, i79, descr=...)
+            i97 = int_sub(i79, i96)
+            guard_not_invalidated(descr=...)
+        ''')
+
+    def test_unicode_slicing_small_constant_indices(self):
+        log = self.run("""
+        def main(n):
+            u = u"abä👩‍👩‍👧‍👦éé–—¿" * 1000
+            global s
+            count = 0
+            while u:
+                u = u[1:] # ID: index
+                count += 1
+            return count
+        """, [1000])
+        loop, = log.loops_by_filename(self.filepath)
+        assert loop.match_by_id('index', '''
+            i51 = int_eq(1, i38)
+            guard_false(i51, descr=...)
+            i52 = strlen(p47)
+            i53 = int_eq(i38, i52)
+            guard_false(i53, descr=...)
+            i56 = call_i(ConstClass(next_codepoint_pos_dont_look_inside), p47, 0, descr=...)
+            i57 = int_sub(i52, i56)
+            i59 = int_sub(i38, 1)
+        ''')
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -95,6 +95,26 @@
                                         space.newint(start + len1))
             assert space.int_w(w_index) == rexpected
 
+    def test_getitem_constant_index_jit(self):
+        # test it directly, to prevent only seeing bugs in jitted code
+        space = self.space
+        u = u"äöabc"
+        w_u = self.space.wrap(u)
+        for i in range(-len(u), len(u)):
+            assert w_u._getitem_result_constant_index_jit(space, i)._utf8 == u[i].encode("utf-8")
+        with py.test.raises(OperationError):
+            w_u._getitem_result_constant_index_jit(space, len(u))
+        with py.test.raises(OperationError):
+            w_u._getitem_result_constant_index_jit(space, -len(u) - 1)
+
+    def test_getslice_constant_index_jit(self):
+        space = self.space
+        u = u"äöabcéééß"
+        w_u = self.space.wrap(u)
+        for start in range(0, 4):
+            for end in range(start, len(u)):
+                assert w_u._unicode_sliced_constant_index_jit(space, start, end)._utf8 == u[start: end].encode("utf-8")
+
 
 class AppTestUnicodeStringStdOnly:
     def test_compares(self):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -26,6 +26,16 @@
 __all__ = ['W_UnicodeObject', 'encode_object', 'decode_object',
            'unicode_from_object', 'unicode_to_decimal_w']
 
+MAX_UNROLL_NEXT_CODEPOINT_POS = 4
+
+@jit.elidable
+def next_codepoint_pos_dont_look_inside(utf8, p):
+    return rutf8.next_codepoint_pos(utf8, p)
+
+@jit.elidable
+def prev_codepoint_pos_dont_look_inside(utf8, p):
+    return rutf8.prev_codepoint_pos(utf8, p)
+
 
 class W_UnicodeObject(W_Root):
     import_from_mixin(StringMethods)
@@ -854,6 +864,9 @@
             if sl == 0:
                 return self._empty()
             elif step == 1:
+                if jit.we_are_jitted() and \
+                        self._unroll_slice_heuristic(start, stop, w_index.w_stop):
+                    return self._unicode_sliced_constant_index_jit(space, start, stop)
                 assert start >= 0 and stop >= 0
                 return self._unicode_sliced(space, start, stop)
             else:
@@ -869,7 +882,7 @@
         i = 0
         while True:
             next_pos = rutf8.next_codepoint_pos(self._utf8, byte_pos)
-            builder.append(self._utf8[byte_pos:next_pos])
+            builder.append_slice(self._utf8, byte_pos, next_pos)
             if i == sl - 1:
                 break
             i += 1
@@ -882,6 +895,9 @@
         if start == stop:
             return self._empty()
         else:
+            if (jit.we_are_jitted() and
+                    self._unroll_slice_heuristic(start, stop, w_stop)):
+                return self._unicode_sliced_constant_index_jit(space, start, stop)
             return self._unicode_sliced(space, start, stop)
 
     def _unicode_sliced(self, space, start, stop):
@@ -893,6 +909,31 @@
         byte_stop = self._index_to_byte(stop)
         return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start)
 
+    @jit.unroll_safe
+    def _unicode_sliced_constant_index_jit(self, space, start, stop):
+        assert start >= 0
+        assert stop >= 0
+        byte_start = 0
+        for i in range(start):
+            byte_start = next_codepoint_pos_dont_look_inside(self._utf8, byte_start)
+        byte_stop = len(self._utf8)
+        for i in range(self._len() - stop):
+            byte_stop = prev_codepoint_pos_dont_look_inside(self._utf8, byte_stop)
+        return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start)
+
+    def _unroll_slice_heuristic(self, start, stop, w_stop):
+        from pypy.objspace.std.intobject import W_IntObject
+        # the reason we use the *wrapped* stop is that for
+        # w_stop ==  wrapped -1, or w_None the stop that is computed will *not*
+        # be constant, because the length is often not constant.
+        return (not self.is_ascii() and
+            jit.isconstant(start) and
+            (jit.isconstant(w_stop) or
+                (isinstance(w_stop, W_IntObject) and
+                    jit.isconstant(w_stop.intval))) and
+            start <= MAX_UNROLL_NEXT_CODEPOINT_POS and
+            self._len() - stop <= MAX_UNROLL_NEXT_CODEPOINT_POS)
+
     def descr_capitalize(self, space):
         return W_UnicodeObject._capitalize_unicode(self._utf8)
 
@@ -1024,12 +1065,43 @@
         return storage
 
     def _getitem_result(self, space, index):
+        if (jit.we_are_jitted() and
+                not self.is_ascii() and
+                jit.isconstant(index) and
+                -MAX_UNROLL_NEXT_CODEPOINT_POS <= index <= MAX_UNROLL_NEXT_CODEPOINT_POS):
+            return self._getitem_result_constant_index_jit(space, index)
         if index < 0:
             index += self._length
         if index < 0 or index >= self._length:
             raise oefmt(space.w_IndexError, "string index out of range")
         start = self._index_to_byte(index)
-        end = rutf8.next_codepoint_pos(self._utf8, start)
+        # we must not inline next_codepoint_pos, otherwise we produce a guard!
+        end = self.next_codepoint_pos_dont_look_inside(start)
+        return W_UnicodeObject(self._utf8[start:end], 1)
+
+    @jit.unroll_safe
+    def _getitem_result_constant_index_jit(self, space, index):
+        # for small known indices, call next/prev_codepoint_pos a few times
+        # instead of possibly creating an index structure
+        if index < 0:
+            posindex = index + self._length
+            if posindex < 0:
+                raise oefmt(space.w_IndexError, "string index out of range")
+            end = len(self._utf8)
+            start = self.prev_codepoint_pos_dont_look_inside(end)
+            for i in range(-index-1):
+                end = start
+                start = self.prev_codepoint_pos_dont_look_inside(start)
+        else:
+            if index >= self._length:
+                raise oefmt(space.w_IndexError, "string index out of range")
+            start = 0
+            end = self.next_codepoint_pos_dont_look_inside(start)
+            for i in range(index):
+                start = end
+                end = self.next_codepoint_pos_dont_look_inside(end)
+        assert start >= 0
+        assert end >= 0
         return W_UnicodeObject(self._utf8[start:end], 1)
 
     def is_ascii(self):
@@ -1056,6 +1128,16 @@
         return rutf8.codepoint_index_at_byte_position(
             self._utf8, self._get_index_storage(), bytepos, self._len())
 
+    def next_codepoint_pos_dont_look_inside(self, pos):
+        if self.is_ascii():
+            return pos + 1
+        return next_codepoint_pos_dont_look_inside(self._utf8, pos)
+
+    def prev_codepoint_pos_dont_look_inside(self, pos):
+        if self.is_ascii():
+            return pos - 1
+        return prev_codepoint_pos_dont_look_inside(self._utf8, pos)
+
     @always_inline
     def _unwrap_and_search(self, space, w_sub, w_start, w_end, forward=True):
         w_sub = self.convert_arg_to_w_unicode(space, w_sub)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to