Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de>
Branch: py3.6
Changeset: r97482:9d30b102e8b1
Date: 2019-09-15 14:20 +0200
http://bitbucket.org/pypy/pypy/changeset/9d30b102e8b1/
Log: merge default

diff --git a/pypy/module/pypyjit/test_pypy_c/test_string.py b/pypy/module/pypyjit/test_pypy_c/test_string.py
--- a/pypy/module/pypyjit/test_pypy_c/test_string.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_string.py
@@ -277,3 +277,72 @@
             jump(..., descr=...)
         """)
         # XXX remove the guard_nonnull above?
+
+    def test_unicode_indexing_makes_no_bridges(self):
+        log = self.run("""
+        u = u"aaaaaä👩‍👩‍👧‍👦" * 1000
+        def main():
+            for j in range(10):
+                for i in range(len(u)):
+                    u[i] # ID: index0
+        """, [])
+        ops = log.loops[0].ops_by_id("index0")
+        for op in ops:
+            assert op.bridge is None
+
+    def test_unicode_indexing_small_constant_indices(self):
+        log = self.run("""
+        l = [u"abä", u"cdä", u"äü", u"éé", u"–—¿"] * 1000
+        def main(n):
+            global s
+            for u in l:
+                s = u[0] + u[1] + u[-1] # ID: index
+                len(u)
+            return len(s)
+        """, [1000])
+        loop, = log.loops_by_filename(self.filepath)
+        assert loop.match_by_id('index', '''
+            i77 = getfield_gc_i(p73, descr=<FieldS pypy.objspace.std.unicodeobject.W_UnicodeObject.inst__length .*>)
+            p78 = getfield_gc_r(p73, descr=<FieldP pypy.objspace.std.unicodeobject.W_UnicodeObject.inst__utf8 .* pure>)
+            i79 = strlen(p78)
+            i80 = int_eq(i77, i79)
+            guard_false(i80, descr=...) # check not ascii
+            i82 = int_ge(0, i77)
+            guard_false(i82, descr=...)
+            i85 = call_i(ConstClass(next_codepoint_pos_dont_look_inside), p78, 0, descr=...)
+            i86 = int_gt(i85, i79)
+            guard_false(i86, descr=...)
+            i88 = int_ge(1, i77)
+            guard_false(i88, descr=...)
+            i90 = call_i(ConstClass(next_codepoint_pos_dont_look_inside), p78, i85, descr=...)
+            i91 = int_gt(i90, i79)
+            guard_false(i91, descr=...)
+            i92 = int_sub(i90, i85)
+            i94 = int_add(-1, i77)
+            i96 = call_i(ConstClass(prev_codepoint_pos_dont_look_inside), p78, i79, descr=...)
+            i97 = int_sub(i79, i96)
+            guard_not_invalidated(descr=...)
+        ''')
+
+    def test_unicode_slicing_small_constant_indices(self):
+        log = self.run("""
+        def main(n):
+            u = u"abä👩‍👩‍👧‍👦éé–—¿" * 1000
+            global s
+            count = 0
+            while u:
+                u = u[1:] # ID: index
+                count += 1
+            return count
+        """, [1000])
+        loop, = log.loops_by_filename(self.filepath)
+        assert loop.match_by_id('index', '''
+            i51 = int_eq(1, i38)
+            guard_false(i51, descr=...)
+            i52 = strlen(p47)
+            i53 = int_eq(i38, i52)
+            guard_false(i53, descr=...)
+            i56 = call_i(ConstClass(next_codepoint_pos_dont_look_inside), p47, 0, descr=...)
+            i57 = int_sub(i52, i56)
+            i59 = int_sub(i38, 1)
+        ''')
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -95,6 +95,26 @@
                                    space.newint(start + len1))
             assert space.int_w(w_index) == rexpected
 
+    def test_getitem_constant_index_jit(self):
+        # test it directly, to prevent only seeing bugs in jitted code
+        space = self.space
+        u = u"äöabc"
+        w_u = self.space.wrap(u)
+        for i in range(-len(u), len(u)):
+            assert w_u._getitem_result_constant_index_jit(space, i)._utf8 == u[i].encode("utf-8")
+        with py.test.raises(OperationError):
+            w_u._getitem_result_constant_index_jit(space, len(u))
+        with py.test.raises(OperationError):
+            w_u._getitem_result_constant_index_jit(space, -len(u) - 1)
+
+    def test_getslice_constant_index_jit(self):
+        space = self.space
+        u = u"äöabcéééß"
+        w_u = self.space.wrap(u)
+        for start in range(0, 4):
+            for end in range(start, len(u)):
+                assert w_u._unicode_sliced_constant_index_jit(space, start, end)._utf8 == u[start: end].encode("utf-8")
+
 
 class AppTestUnicodeStringStdOnly:
     def test_compares(self):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -26,6 +26,16 @@
 __all__ = ['W_UnicodeObject', 'encode_object', 'decode_object',
            'unicode_from_object', 'unicode_to_decimal_w']
 
+MAX_UNROLL_NEXT_CODEPOINT_POS = 4
+
+@jit.elidable
+def next_codepoint_pos_dont_look_inside(utf8, p):
+    return rutf8.next_codepoint_pos(utf8, p)
+
+@jit.elidable
+def prev_codepoint_pos_dont_look_inside(utf8, p):
+    return rutf8.prev_codepoint_pos(utf8, p)
+
 
 class W_UnicodeObject(W_Root):
     import_from_mixin(StringMethods)
@@ -854,6 +864,9 @@
             if sl == 0:
                 return self._empty()
             elif step == 1:
+                if jit.we_are_jitted() and \
+                        self._unroll_slice_heuristic(start, stop, w_index.w_stop):
+                    return self._unicode_sliced_constant_index_jit(space, start, stop)
                 assert start >= 0 and stop >= 0
                 return self._unicode_sliced(space, start, stop)
             else:
@@ -869,7 +882,7 @@
         i = 0
         while True:
             next_pos = rutf8.next_codepoint_pos(self._utf8, byte_pos)
-            builder.append(self._utf8[byte_pos:next_pos])
+            builder.append_slice(self._utf8, byte_pos, next_pos)
             if i == sl - 1:
                 break
             i += 1
@@ -882,6 +895,9 @@
         if start == stop:
             return self._empty()
         else:
+            if (jit.we_are_jitted() and
+                    self._unroll_slice_heuristic(start, stop, w_stop)):
+                return self._unicode_sliced_constant_index_jit(space, start, stop)
             return self._unicode_sliced(space, start, stop)
 
     def _unicode_sliced(self, space, start, stop):
@@ -893,6 +909,31 @@
         byte_stop = self._index_to_byte(stop)
         return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start)
 
+    @jit.unroll_safe
+    def _unicode_sliced_constant_index_jit(self, space, start, stop):
+        assert start >= 0
+        assert stop >= 0
+        byte_start = 0
+        for i in range(start):
+            byte_start = next_codepoint_pos_dont_look_inside(self._utf8, byte_start)
+        byte_stop = len(self._utf8)
+        for i in range(self._len() - stop):
+            byte_stop = prev_codepoint_pos_dont_look_inside(self._utf8, byte_stop)
+        return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start)
+
+    def _unroll_slice_heuristic(self, start, stop, w_stop):
+        from pypy.objspace.std.intobject import W_IntObject
+        # the reason we use the *wrapped* stop is that for
+        # w_stop == wrapped -1, or w_None the stop that is computed will *not*
+        # be constant, because the length is often not constant.
+        return (not self.is_ascii() and
+                jit.isconstant(start) and
+                (jit.isconstant(w_stop) or
+                    (isinstance(w_stop, W_IntObject) and
+                        jit.isconstant(w_stop.intval))) and
+                start <= MAX_UNROLL_NEXT_CODEPOINT_POS and
+                self._len() - stop <= MAX_UNROLL_NEXT_CODEPOINT_POS)
+
     def descr_capitalize(self, space):
         return W_UnicodeObject._capitalize_unicode(self._utf8)
 
@@ -1024,12 +1065,43 @@
         return storage
 
     def _getitem_result(self, space, index):
+        if (jit.we_are_jitted() and
+                not self.is_ascii() and
+                jit.isconstant(index) and
+                -MAX_UNROLL_NEXT_CODEPOINT_POS <= index <= MAX_UNROLL_NEXT_CODEPOINT_POS):
+            return self._getitem_result_constant_index_jit(space, index)
         if index < 0:
             index += self._length
         if index < 0 or index >= self._length:
             raise oefmt(space.w_IndexError, "string index out of range")
         start = self._index_to_byte(index)
-        end = rutf8.next_codepoint_pos(self._utf8, start)
+        # we must not inline next_codepoint_pos, otherwise we produce a guard!
+        end = self.next_codepoint_pos_dont_look_inside(start)
+        return W_UnicodeObject(self._utf8[start:end], 1)
+
+    @jit.unroll_safe
+    def _getitem_result_constant_index_jit(self, space, index):
+        # for small known indices, call next/prev_codepoint_pos a few times
+        # instead of possibly creating an index structure
+        if index < 0:
+            posindex = index + self._length
+            if posindex < 0:
+                raise oefmt(space.w_IndexError, "string index out of range")
+            end = len(self._utf8)
+            start = self.prev_codepoint_pos_dont_look_inside(end)
+            for i in range(-index-1):
+                end = start
+                start = self.prev_codepoint_pos_dont_look_inside(start)
+        else:
+            if index >= self._length:
+                raise oefmt(space.w_IndexError, "string index out of range")
+            start = 0
+            end = self.next_codepoint_pos_dont_look_inside(start)
+            for i in range(index):
+                start = end
+                end = self.next_codepoint_pos_dont_look_inside(end)
+        assert start >= 0
+        assert end >= 0
         return W_UnicodeObject(self._utf8[start:end], 1)
 
     def is_ascii(self):
@@ -1056,6 +1128,16 @@
         return rutf8.codepoint_index_at_byte_position(
             self._utf8, self._get_index_storage(), bytepos, self._len())
 
+    def next_codepoint_pos_dont_look_inside(self, pos):
+        if self.is_ascii():
+            return pos + 1
+        return next_codepoint_pos_dont_look_inside(self._utf8, pos)
+
+    def prev_codepoint_pos_dont_look_inside(self, pos):
+        if self.is_ascii():
+            return pos - 1
+        return prev_codepoint_pos_dont_look_inside(self._utf8, pos)
+
     @always_inline
     def _unwrap_and_search(self, space, w_sub, w_start, w_end, forward=True):
         w_sub = self.convert_arg_to_w_unicode(space, w_sub)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit