Author: Carl Friedrich Bolz-Tereick <[email protected]>
Branch: 
Changeset: r97416:7b5414be06fa
Date: 2019-09-10 15:31 +0200
http://bitbucket.org/pypy/pypy/changeset/7b5414be06fa/

Log:    optimize codepoint_index_at_byte_position by using the index
        structure more, instead of calling next_codepoint_pos up to 64
        times.

        gives a speedup of about 2x on average

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -566,7 +566,8 @@
         if isinstance(ctx, rsre_utf8.Utf8MatchContext):
             index_storage = ctx.w_unicode_obj._get_index_storage()
             return rutf8.codepoint_index_at_byte_position(
-                ctx.w_unicode_obj._utf8, index_storage, bytepos)
+                ctx.w_unicode_obj._utf8, index_storage, bytepos,
+                ctx.w_unicode_obj._len())
         else:
             return bytepos
 
diff --git a/pypy/objspace/std/unicodeobject.py 
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -898,7 +898,7 @@
         if self.is_ascii():
             return bytepos
         return rutf8.codepoint_index_at_byte_position(
-            self._utf8, self._get_index_storage(), bytepos)
+            self._utf8, self._get_index_storage(), bytepos, self._len())
 
     @always_inline
     def _unwrap_and_search(self, space, w_sub, w_start, w_end, forward=True):
diff --git a/rpython/jit/metainterp/test/test_ajit.py 
b/rpython/jit/metainterp/test/test_ajit.py
--- a/rpython/jit/metainterp/test/test_ajit.py
+++ b/rpython/jit/metainterp/test/test_ajit.py
@@ -134,7 +134,7 @@
             u = U(x, len(x))
             st = u._get_index_storage()
             return rutf8.codepoint_index_at_byte_position(
-                u.u, st, 1)
+                u.u, st, 1, len(x))
 
         self.interp_operations(m, [123232])
 
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -584,7 +584,7 @@
     return codepoint_at_pos(utf8, bytepos)
 
 @jit.elidable
-def codepoint_index_at_byte_position(utf8, storage, bytepos):
+def codepoint_index_at_byte_position(utf8, storage, bytepos, num_codepoints):
     """ Return the character index for which
     codepoint_position_at_index(index) == bytepos.
     This is a relatively slow operation in that it runs in a time
@@ -596,6 +596,7 @@
     # binary search on elements of storage
     index_min = 0
     index_max = len(storage) - 1
+    i = 0
     while index_min < index_max:
         # this addition can't overflow because storage has a length that is
         # 1/64 of the length of a string
@@ -605,8 +606,27 @@
             index_max = index_middle - 1
         else:
             index_min = index_middle
-    bytepos1 = storage[index_min].baseindex
+        i += 1
+
+    baseindex = storage[index_min].baseindex
+    if baseindex == bytepos:
+        return index_min << 6
+
+    # use ofs to get closer to the correct character index
     result = index_min << 6
+    bytepos1 = baseindex
+    if index_min == len(storage) - 1:
+        maxindex = ((num_codepoints - 1) >> 2) & 0x0F
+    else:
+        maxindex = 16
+    for i in range(maxindex):
+        x = baseindex + ord(storage[index_min].ofs[i])
+        if x >= bytepos:
+            break
+        bytepos1 = x
+        result = (index_min << 6) + (i << 2) + 1
+
+    # this loop should runs at most four times
     while bytepos1 < bytepos:
         bytepos1 = next_codepoint_pos(utf8, bytepos1)
         result += 1
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -1,3 +1,4 @@
+#encoding: utf-8
 import pytest
 import sys
 from hypothesis import given, strategies, settings, example
@@ -133,12 +134,24 @@
 @given(strategies.text())
 @example(u'x' * 64 * 5)
 @example(u'x' * (64 * 5 - 1))
+@example(u'&#228;' + u'x&#171;' * 1000 + u'&#8211;' + u'y' * 100)
 def test_codepoint_index_at_byte_position(u):
-    storage = rutf8.create_utf8_index_storage(u.encode('utf8'), len(u))
+    b = u.encode('utf8')
+    storage = rutf8.create_utf8_index_storage(b, len(u))
     for i in range(len(u) + 1):
         bytepos = len(u[:i].encode('utf8'))
         assert rutf8.codepoint_index_at_byte_position(
-                       u.encode('utf8'), storage, bytepos) == i
+                       b, storage, bytepos, len(u)) == i
+
+@given(strategies.text(average_size=128))
+def test_codepoint_position_at_index_inverse(u):
+    print u
+    b = u.encode('utf8')
+    storage = rutf8.create_utf8_index_storage(b, len(u))
+    for i in range(len(u) + 1):
+        bytepos = rutf8.codepoint_position_at_index(b, storage, i)
+        assert rutf8.codepoint_index_at_byte_position(
+                       b, storage, bytepos, len(u)) == i
 
 
 repr_func = rutf8.make_utf8_escape_function(prefix='u', pass_printable=False,
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to