Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de>
Branch:
Changeset: r97462:5e5857c2fae6
Date: 2019-09-12 16:52 +0200
http://bitbucket.org/pypy/pypy/changeset/5e5857c2fae6/
Log: merge heads diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -60,12 +60,6 @@ return encode_object(space, w_data, encoding, errors) -def _has_surrogate(u): - for c in u: - if 0xD800 <= ord(c) <= 0xDFFF: - return True - return False - # These functions take and return unwrapped rpython strings def decode_unicode_escape(space, string): from pypy.module._codecs import interp_codecs diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -433,7 +433,10 @@ end = len(self.text) else: end = self.pos + limit - pos = self.text.find(marker, self.pos, end) + pos = self.pos + assert pos >= 0 + assert end >= 0 + pos = self.text.find(marker, pos, end) if pos >= 0: self.pos = self.upos = pos + 1 return True diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -874,11 +874,6 @@ def is_ascii(self): return self._length == len(self._utf8) - def _has_surrogates(self): - if self.is_ascii(): - return False - return rutf8.has_surrogates(self._utf8) - def _index_to_byte(self, index): if self.is_ascii(): assert index >= 0 diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -435,10 +435,17 @@ return result def has_surrogates(utf8): - # XXX write a faster version maybe - for ch in Utf8StringIterator(utf8): - if 0xD800 <= ch <= 0xDBFF: + # a surrogate starts with 0xed in utf-8 encoding + pos = 0 + while True: + pos = utf8.find("\xed", pos) + if pos < 0: + return False + assert pos <= len(utf8) - 1 # otherwise invalid utf-8 + ordch2 = ord(utf8[pos + 1]) + if _invalid_byte_2_of_3(0xed, ordch2, allow_surrogates=False): return True + pos += 1 return False def 
reencode_utf8_with_surrogates(utf8): diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py --- a/rpython/rlib/test/test_rutf8.py +++ b/rpython/rlib/test/test_rutf8.py @@ -238,3 +238,17 @@ assert pos == i i = rutf8.next_codepoint_pos(utf8s, i) assert list(arg) == l + + +@given(strategies.text(), strategies.integers(0xd800, 0xdfff)) +def test_has_surrogates(arg, surrogate): + b = (arg + unichr(surrogate) + arg).encode("utf-8") + assert not rutf8.has_surrogates(arg.encode("utf-8")) + assert rutf8.has_surrogates(unichr(surrogate).encode("utf-8")) + assert rutf8.has_surrogates(b) + +def test_has_surrogate_xed_no_surrogate(): + u = unichr(55217) + unichr(54990) + b = u.encode("utf-8") + assert b.startswith(b"\xed") + assert not rutf8.has_surrogates(b) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit