Author: Carl Friedrich Bolz-Tereick <[email protected]>
Branch: py3.6
Changeset: r97482:9d30b102e8b1
Date: 2019-09-15 14:20 +0200
http://bitbucket.org/pypy/pypy/changeset/9d30b102e8b1/
Log: merge default
diff --git a/pypy/module/pypyjit/test_pypy_c/test_string.py b/pypy/module/pypyjit/test_pypy_c/test_string.py
--- a/pypy/module/pypyjit/test_pypy_c/test_string.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_string.py
@@ -277,3 +277,72 @@
jump(..., descr=...)
""")
# XXX remove the guard_nonnull above?
+
+ def test_unicode_indexing_makes_no_bridges(self):
+ log = self.run("""
+ u = u"aaaaaä👩‍👩‍👧‍👦" * 1000
+ def main():
+ for j in range(10):
+ for i in range(len(u)):
+ u[i] # ID: index0
+ """, [])
+ ops = log.loops[0].ops_by_id("index0")
+ for op in ops:
+ assert op.bridge is None
+
+ def test_unicode_indexing_small_constant_indices(self):
+ log = self.run("""
+ l = [u"abä", u"cdä", u"äü", u"éé", u"–—¿"] * 1000
+ def main(n):
+ global s
+ for u in l:
+ s = u[0] + u[1] + u[-1] # ID: index
+ len(u)
+ return len(s)
+ """, [1000])
+ loop, = log.loops_by_filename(self.filepath)
+ assert loop.match_by_id('index', '''
+ i77 = getfield_gc_i(p73, descr=<FieldS pypy.objspace.std.unicodeobject.W_UnicodeObject.inst__length .*>)
+ p78 = getfield_gc_r(p73, descr=<FieldP pypy.objspace.std.unicodeobject.W_UnicodeObject.inst__utf8 .* pure>)
+ i79 = strlen(p78)
+ i80 = int_eq(i77, i79)
+ guard_false(i80, descr=...) # check not ascii
+ i82 = int_ge(0, i77)
+ guard_false(i82, descr=...)
+ i85 = call_i(ConstClass(next_codepoint_pos_dont_look_inside), p78, 0, descr=...)
+ i86 = int_gt(i85, i79)
+ guard_false(i86, descr=...)
+ i88 = int_ge(1, i77)
+ guard_false(i88, descr=...)
+ i90 = call_i(ConstClass(next_codepoint_pos_dont_look_inside), p78, i85, descr=...)
+ i91 = int_gt(i90, i79)
+ guard_false(i91, descr=...)
+ i92 = int_sub(i90, i85)
+ i94 = int_add(-1, i77)
+ i96 = call_i(ConstClass(prev_codepoint_pos_dont_look_inside), p78, i79, descr=...)
+ i97 = int_sub(i79, i96)
+ guard_not_invalidated(descr=...)
+ ''')
+
+ def test_unicode_slicing_small_constant_indices(self):
+ log = self.run("""
+ def main(n):
+ u = u"abä👩‍👩‍👧‍👦éé–—¿" * 1000
+ global s
+ count = 0
+ while u:
+ u = u[1:] # ID: index
+ count += 1
+ return count
+ """, [1000])
+ loop, = log.loops_by_filename(self.filepath)
+ assert loop.match_by_id('index', '''
+ i51 = int_eq(1, i38)
+ guard_false(i51, descr=...)
+ i52 = strlen(p47)
+ i53 = int_eq(i38, i52)
+ guard_false(i53, descr=...)
+ i56 = call_i(ConstClass(next_codepoint_pos_dont_look_inside), p47, 0, descr=...)
+ i57 = int_sub(i52, i56)
+ i59 = int_sub(i38, 1)
+ ''')
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -95,6 +95,26 @@
space.newint(start + len1))
assert space.int_w(w_index) == rexpected
+ def test_getitem_constant_index_jit(self):
+ # test it directly, to prevent only seeing bugs in jitted code
+ space = self.space
+ u = u"äöabc"
+ w_u = self.space.wrap(u)
+ for i in range(-len(u), len(u)):
+ assert w_u._getitem_result_constant_index_jit(space, i)._utf8 == u[i].encode("utf-8")
+ with py.test.raises(OperationError):
+ w_u._getitem_result_constant_index_jit(space, len(u))
+ with py.test.raises(OperationError):
+ w_u._getitem_result_constant_index_jit(space, -len(u) - 1)
+
+ def test_getslice_constant_index_jit(self):
+ space = self.space
+ u = u"äöabcéééß"
+ w_u = self.space.wrap(u)
+ for start in range(0, 4):
+ for end in range(start, len(u)):
+ assert w_u._unicode_sliced_constant_index_jit(space, start, end)._utf8 == u[start: end].encode("utf-8")
+
class AppTestUnicodeStringStdOnly:
def test_compares(self):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -26,6 +26,16 @@
__all__ = ['W_UnicodeObject', 'encode_object', 'decode_object',
'unicode_from_object', 'unicode_to_decimal_w']
+MAX_UNROLL_NEXT_CODEPOINT_POS = 4
+
+@jit.elidable
+def next_codepoint_pos_dont_look_inside(utf8, p):
+ return rutf8.next_codepoint_pos(utf8, p)
+
+@jit.elidable
+def prev_codepoint_pos_dont_look_inside(utf8, p):
+ return rutf8.prev_codepoint_pos(utf8, p)
+
class W_UnicodeObject(W_Root):
import_from_mixin(StringMethods)
@@ -854,6 +864,9 @@
if sl == 0:
return self._empty()
elif step == 1:
+ if jit.we_are_jitted() and \
+ self._unroll_slice_heuristic(start, stop, w_index.w_stop):
+ return self._unicode_sliced_constant_index_jit(space, start, stop)
assert start >= 0 and stop >= 0
return self._unicode_sliced(space, start, stop)
else:
@@ -869,7 +882,7 @@
i = 0
while True:
next_pos = rutf8.next_codepoint_pos(self._utf8, byte_pos)
- builder.append(self._utf8[byte_pos:next_pos])
+ builder.append_slice(self._utf8, byte_pos, next_pos)
if i == sl - 1:
break
i += 1
@@ -882,6 +895,9 @@
if start == stop:
return self._empty()
else:
+ if (jit.we_are_jitted() and
+ self._unroll_slice_heuristic(start, stop, w_stop)):
+ return self._unicode_sliced_constant_index_jit(space, start, stop)
return self._unicode_sliced(space, start, stop)
def _unicode_sliced(self, space, start, stop):
@@ -893,6 +909,31 @@
byte_stop = self._index_to_byte(stop)
return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start)
+ @jit.unroll_safe
+ def _unicode_sliced_constant_index_jit(self, space, start, stop):
+ assert start >= 0
+ assert stop >= 0
+ byte_start = 0
+ for i in range(start):
+ byte_start = next_codepoint_pos_dont_look_inside(self._utf8, byte_start)
+ byte_stop = len(self._utf8)
+ for i in range(self._len() - stop):
+ byte_stop = prev_codepoint_pos_dont_look_inside(self._utf8, byte_stop)
+ return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start)
+
+ def _unroll_slice_heuristic(self, start, stop, w_stop):
+ from pypy.objspace.std.intobject import W_IntObject
+ # the reason we use the *wrapped* stop is that for
+ # w_stop == wrapped -1, or w_None the stop that is computed will *not*
+ # be constant, because the length is often not constant.
+ return (not self.is_ascii() and
+ jit.isconstant(start) and
+ (jit.isconstant(w_stop) or
+ (isinstance(w_stop, W_IntObject) and
+ jit.isconstant(w_stop.intval))) and
+ start <= MAX_UNROLL_NEXT_CODEPOINT_POS and
+ self._len() - stop <= MAX_UNROLL_NEXT_CODEPOINT_POS)
+
def descr_capitalize(self, space):
return W_UnicodeObject._capitalize_unicode(self._utf8)
@@ -1024,12 +1065,43 @@
return storage
def _getitem_result(self, space, index):
+ if (jit.we_are_jitted() and
+ not self.is_ascii() and
+ jit.isconstant(index) and
+ -MAX_UNROLL_NEXT_CODEPOINT_POS <= index <= MAX_UNROLL_NEXT_CODEPOINT_POS):
+ return self._getitem_result_constant_index_jit(space, index)
if index < 0:
index += self._length
if index < 0 or index >= self._length:
raise oefmt(space.w_IndexError, "string index out of range")
start = self._index_to_byte(index)
- end = rutf8.next_codepoint_pos(self._utf8, start)
+ # we must not inline next_codepoint_pos, otherwise we produce a guard!
+ end = self.next_codepoint_pos_dont_look_inside(start)
+ return W_UnicodeObject(self._utf8[start:end], 1)
+
+ @jit.unroll_safe
+ def _getitem_result_constant_index_jit(self, space, index):
+ # for small known indices, call next/prev_codepoint_pos a few times
+ # instead of possibly creating an index structure
+ if index < 0:
+ posindex = index + self._length
+ if posindex < 0:
+ raise oefmt(space.w_IndexError, "string index out of range")
+ end = len(self._utf8)
+ start = self.prev_codepoint_pos_dont_look_inside(end)
+ for i in range(-index-1):
+ end = start
+ start = self.prev_codepoint_pos_dont_look_inside(start)
+ else:
+ if index >= self._length:
+ raise oefmt(space.w_IndexError, "string index out of range")
+ start = 0
+ end = self.next_codepoint_pos_dont_look_inside(start)
+ for i in range(index):
+ start = end
+ end = self.next_codepoint_pos_dont_look_inside(end)
+ assert start >= 0
+ assert end >= 0
return W_UnicodeObject(self._utf8[start:end], 1)
def is_ascii(self):
@@ -1056,6 +1128,16 @@
return rutf8.codepoint_index_at_byte_position(
self._utf8, self._get_index_storage(), bytepos, self._len())
+ def next_codepoint_pos_dont_look_inside(self, pos):
+ if self.is_ascii():
+ return pos + 1
+ return next_codepoint_pos_dont_look_inside(self._utf8, pos)
+
+ def prev_codepoint_pos_dont_look_inside(self, pos):
+ if self.is_ascii():
+ return pos - 1
+ return prev_codepoint_pos_dont_look_inside(self._utf8, pos)
+
@always_inline
def _unwrap_and_search(self, space, w_sub, w_start, w_end, forward=True):
w_sub = self.convert_arg_to_w_unicode(space, w_sub)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit