[pypy-commit] pypy default: stop using codepoints_in_utf8 for the result of find

cfbolz Tue, 10 Sep 2019 04:20:44 -0700

Author: Carl Friedrich Bolz-Tereick <[email protected]>
Branch: 
Changeset: r97415:b2971d2576c1
Date: 2019-09-10 13:18 +0200
http://bitbucket.org/pypy/pypy/changeset/b2971d2576c1/


Log:    stop using codepoints_in_utf8 for the result of find

        even for very small strings it's a lot slower than using binary
        search

diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -891,6 +891,15 @@
             return end - start
         return rutf8.codepoints_in_utf8(self._utf8, start, end)
 
+    def _byte_to_index(self, bytepos):
+        """ this returns index such that self._index_to_byte(index) == bytepos
+        NB: this is slow! roughly logarithmic with a big constant
+        """
+        if self.is_ascii():
+            return bytepos
+        return rutf8.codepoint_index_at_byte_position(
+            self._utf8, self._get_index_storage(), bytepos)
+
     @always_inline
     def _unwrap_and_search(self, space, w_sub, w_start, w_end, forward=True):
         w_sub = self.convert_arg_to_w_unicode(space, w_sub)
@@ -912,16 +921,14 @@
             res_index = self._utf8.find(w_sub._utf8, start_index, end_index)
             if res_index < 0:
                 return None
-            skip = self._codepoints_in_utf8(start_index, res_index)
-            res = start + skip
+            res = self._byte_to_index(res_index)
             assert res >= 0
             return space.newint(res)
         else:
             res_index = self._utf8.rfind(w_sub._utf8, start_index, end_index)
             if res_index < 0:
                 return None
-            skip = self._codepoints_in_utf8(res_index, end_index)
-            res = end - skip
+            res = self._byte_to_index(res_index)
             assert res >= 0
             return space.newint(res)
 
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy default: stop using codepoints_in_utf8 for the result of find

Reply via email to