Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de>
Branch: 
Changeset: r97462:5e5857c2fae6
Date: 2019-09-12 16:52 +0200
http://bitbucket.org/pypy/pypy/changeset/5e5857c2fae6/

Log:    merge heads

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -60,12 +60,6 @@
     return encode_object(space, w_data, encoding, errors)
 
 
-def _has_surrogate(u):
-    for c in u:
-        if 0xD800 <= ord(c) <= 0xDFFF:
-            return True
-    return False
-
 # These functions take and return unwrapped rpython strings
 def decode_unicode_escape(space, string):
     from pypy.module._codecs import interp_codecs
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -433,7 +433,10 @@
                 end = len(self.text)
             else:
                 end = self.pos + limit
-            pos = self.text.find(marker, self.pos, end)
+            pos = self.pos
+            assert pos >= 0
+            assert end >= 0
+            pos = self.text.find(marker, pos, end)
             if pos >= 0:
                 self.pos = self.upos = pos + 1
                 return True
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -874,11 +874,6 @@
     def is_ascii(self):
         return self._length == len(self._utf8)
 
-    def _has_surrogates(self):
-        if self.is_ascii():
-            return False
-        return rutf8.has_surrogates(self._utf8)
-
     def _index_to_byte(self, index):
         if self.is_ascii():
             assert index >= 0
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -435,10 +435,17 @@
     return result
 
 def has_surrogates(utf8):
-    # XXX write a faster version maybe
-    for ch in Utf8StringIterator(utf8):
-        if 0xD800 <= ch <= 0xDBFF:
+    # a surrogate starts with 0xed in utf-8 encoding
+    pos = 0
+    while True:
+        pos = utf8.find("\xed", pos)
+        if pos < 0:
+            return False
+        assert pos <= len(utf8) - 1 # otherwise invalid utf-8
+        ordch2 = ord(utf8[pos + 1])
+        if _invalid_byte_2_of_3(0xed, ordch2, allow_surrogates=False):
             return True
+        pos += 1
     return False
 
 def reencode_utf8_with_surrogates(utf8):
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -238,3 +238,17 @@
         assert pos == i
         i = rutf8.next_codepoint_pos(utf8s, i)
     assert list(arg) == l
+
+
+@given(strategies.text(), strategies.integers(0xd800, 0xdfff))
+def test_has_surrogates(arg, surrogate):
+    b = (arg + unichr(surrogate) + arg).encode("utf-8")
+    assert not rutf8.has_surrogates(arg.encode("utf-8"))
+    assert rutf8.has_surrogates(unichr(surrogate).encode("utf-8"))
+    assert rutf8.has_surrogates(b)
+
+def test_has_surrogate_xed_no_surrogate():
+    u = unichr(55217) + unichr(54990)
+    b = u.encode("utf-8")
+    assert b.startswith(b"\xed")
+    assert not rutf8.has_surrogates(b)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to