Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de>
Branch:
Changeset: r97415:b2971d2576c1
Date: 2019-09-10 13:18 +0200
http://bitbucket.org/pypy/pypy/changeset/b2971d2576c1/
Log:	stop using codepoints_in_utf8 for the result of find

	even for very small strings it's a lot slower than using binary search

diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -891,6 +891,15 @@
             return end - start
         return rutf8.codepoints_in_utf8(self._utf8, start, end)
 
+    def _byte_to_index(self, bytepos):
+        """ this returns index such that self._index_to_byte(index) == bytepos
+        NB: this is slow! roughly logarithmic with a big constant
+        """
+        if self.is_ascii():
+            return bytepos
+        return rutf8.codepoint_index_at_byte_position(
+            self._utf8, self._get_index_storage(), bytepos)
+
     @always_inline
     def _unwrap_and_search(self, space, w_sub, w_start, w_end, forward=True):
         w_sub = self.convert_arg_to_w_unicode(space, w_sub)
@@ -912,16 +921,14 @@
             res_index = self._utf8.find(w_sub._utf8, start_index, end_index)
             if res_index < 0:
                 return None
-            skip = self._codepoints_in_utf8(start_index, res_index)
-            res = start + skip
+            res = self._byte_to_index(res_index)
             assert res >= 0
             return space.newint(res)
         else:
             res_index = self._utf8.rfind(w_sub._utf8, start_index, end_index)
             if res_index < 0:
                 return None
-            skip = self._codepoints_in_utf8(res_index, end_index)
-            res = end - skip
+            res = self._byte_to_index(res_index)
             assert res >= 0
             return space.newint(res)
 
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit