Author: Carl Friedrich Bolz-Tereick <[email protected]>
Branch:
Changeset: r97416:7b5414be06fa
Date: 2019-09-10 15:31 +0200
http://bitbucket.org/pypy/pypy/changeset/7b5414be06fa/
Log: optimize codepoint_index_at_byte_position by using the index
structure more, instead of calling next_codepoint_pos up to 64
times.
gives a speedup of about 2x on average
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -566,7 +566,8 @@
if isinstance(ctx, rsre_utf8.Utf8MatchContext):
index_storage = ctx.w_unicode_obj._get_index_storage()
return rutf8.codepoint_index_at_byte_position(
- ctx.w_unicode_obj._utf8, index_storage, bytepos)
+ ctx.w_unicode_obj._utf8, index_storage, bytepos,
+ ctx.w_unicode_obj._len())
else:
return bytepos
diff --git a/pypy/objspace/std/unicodeobject.py
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -898,7 +898,7 @@
if self.is_ascii():
return bytepos
return rutf8.codepoint_index_at_byte_position(
- self._utf8, self._get_index_storage(), bytepos)
+ self._utf8, self._get_index_storage(), bytepos, self._len())
@always_inline
def _unwrap_and_search(self, space, w_sub, w_start, w_end, forward=True):
diff --git a/rpython/jit/metainterp/test/test_ajit.py
b/rpython/jit/metainterp/test/test_ajit.py
--- a/rpython/jit/metainterp/test/test_ajit.py
+++ b/rpython/jit/metainterp/test/test_ajit.py
@@ -134,7 +134,7 @@
u = U(x, len(x))
st = u._get_index_storage()
return rutf8.codepoint_index_at_byte_position(
- u.u, st, 1)
+ u.u, st, 1, len(x))
self.interp_operations(m, [123232])
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -584,7 +584,7 @@
return codepoint_at_pos(utf8, bytepos)
@jit.elidable
-def codepoint_index_at_byte_position(utf8, storage, bytepos):
+def codepoint_index_at_byte_position(utf8, storage, bytepos, num_codepoints):
""" Return the character index for which
codepoint_position_at_index(index) == bytepos.
This is a relatively slow operation in that it runs in a time
@@ -596,6 +596,7 @@
# binary search on elements of storage
index_min = 0
index_max = len(storage) - 1
+ i = 0
while index_min < index_max:
# this addition can't overflow because storage has a length that is
# 1/64 of the length of a string
@@ -605,8 +606,27 @@
index_max = index_middle - 1
else:
index_min = index_middle
- bytepos1 = storage[index_min].baseindex
+ i += 1
+
+ baseindex = storage[index_min].baseindex
+ if baseindex == bytepos:
+ return index_min << 6
+
+ # use ofs to get closer to the correct character index
result = index_min << 6
+ bytepos1 = baseindex
+ if index_min == len(storage) - 1:
+ maxindex = ((num_codepoints - 1) >> 2) & 0x0F
+ else:
+ maxindex = 16
+ for i in range(maxindex):
+ x = baseindex + ord(storage[index_min].ofs[i])
+ if x >= bytepos:
+ break
+ bytepos1 = x
+ result = (index_min << 6) + (i << 2) + 1
+
+ # this loop should runs at most four times
while bytepos1 < bytepos:
bytepos1 = next_codepoint_pos(utf8, bytepos1)
result += 1
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -1,3 +1,4 @@
+#encoding: utf-8
import pytest
import sys
from hypothesis import given, strategies, settings, example
@@ -133,12 +134,24 @@
@given(strategies.text())
@example(u'x' * 64 * 5)
@example(u'x' * (64 * 5 - 1))
+@example(u'ä' + u'x«' * 1000 + u'–' + u'y' * 100)
def test_codepoint_index_at_byte_position(u):
- storage = rutf8.create_utf8_index_storage(u.encode('utf8'), len(u))
+ b = u.encode('utf8')
+ storage = rutf8.create_utf8_index_storage(b, len(u))
for i in range(len(u) + 1):
bytepos = len(u[:i].encode('utf8'))
assert rutf8.codepoint_index_at_byte_position(
- u.encode('utf8'), storage, bytepos) == i
+ b, storage, bytepos, len(u)) == i
+
+@given(strategies.text(average_size=128))
+def test_codepoint_position_at_index_inverse(u):
+ print u
+ b = u.encode('utf8')
+ storage = rutf8.create_utf8_index_storage(b, len(u))
+ for i in range(len(u) + 1):
+ bytepos = rutf8.codepoint_position_at_index(b, storage, i)
+ assert rutf8.codepoint_index_at_byte_position(
+ b, storage, bytepos, len(u)) == i
repr_func = rutf8.make_utf8_escape_function(prefix='u', pass_printable=False,
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit