[pypy-commit] pypy py3k: Restore CPython compatibility for the super-important case of calling int() on a string containing lone surrogates

rlamy Sat, 30 Jul 2016 06:08:12 -0700

Author: Ronan Lamy <[email protected]>
Branch: py3k
Changeset: r85925:ffcb26465f79
Date: 2016-07-30 14:06 +0100
http://bitbucket.org/pypy/pypy/changeset/ffcb26465f79/


Log:    Restore CPython compatibility for the super-important case of
        calling int() on a string containing lone surrogates

diff --git a/pypy/objspace/std/intobject.py b/pypy/objspace/std/intobject.py
--- a/pypy/objspace/std/intobject.py
+++ b/pypy/objspace/std/intobject.py
@@ -871,14 +871,7 @@
             return _from_intlike(space, w_inttype, space.trunc(w_value))
         elif space.isinstance_w(w_value, space.w_unicode):
             from pypy.objspace.std.unicodeobject import unicode_to_decimal_w
-            try:
-                b = unicode_to_decimal_w(space, w_value)
-            except OperationError as e:
-                if not e.match(space, space.w_UnicodeEncodeError):
-                    raise
-                raise oefmt(space.w_ValueError,
-                            "int() called with a string containing a "
-                            "lone surrogate")
+            b = unicode_to_decimal_w(space, w_value, allow_surrogates=True)
             return _string_to_int_or_long(space, w_inttype, w_value, b)
         elif (space.isinstance_w(w_value, space.w_bytearray) or
               space.isinstance_w(w_value, space.w_bytes)):
@@ -906,7 +899,7 @@
 
         if space.isinstance_w(w_value, space.w_unicode):
             from pypy.objspace.std.unicodeobject import unicode_to_decimal_w
-            s = unicode_to_decimal_w(space, w_value)
+            s = unicode_to_decimal_w(space, w_value, allow_surrogates=True)
         else:
             try:
                 s = space.bufferstr_w(w_value)
diff --git a/pypy/objspace/std/test/test_longobject.py b/pypy/objspace/std/test/test_longobject.py
--- a/pypy/objspace/std/test/test_longobject.py
+++ b/pypy/objspace/std/test/test_longobject.py
@@ -422,6 +422,3 @@
         assert a is not b
         b -= 1
         assert a is b
-
-    def test_invalid_surrogate(self):
-        raises(ValueError, int, u"\u8000")
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1269,11 +1269,11 @@
 # In CPython3 the call to PyUnicode_EncodeDecimal has been replaced to a call
 # to _PyUnicode_TransformDecimalAndSpaceToASCII, which is much simpler.
 # We do that here plus the final step of encoding the result to utf-8.
-# This final step corresponds to encode_utf8 *without* allow_surrogates.
-# In float.__new__() and complex.__new__(), a lone surrogate will throw
-# an app-level UnicodeEncodeError.  In long.__new__(), though, CPython3
-# gives inconsistently a ValueError, so we handle that case in intobject.py.
-def unicode_to_decimal_w(space, w_unistr):
+# This final step corresponds to encode_utf8. In float.__new__() and
+# complex.__new__(), a lone surrogate will throw an app-level
+# UnicodeEncodeError.
+
+def unicode_to_decimal_w(space, w_unistr, allow_surrogates=False):
     if not isinstance(w_unistr, W_UnicodeObject):
         raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
     unistr = w_unistr._value
@@ -1290,7 +1290,7 @@
                 pass
         result[i] = unichr(uchr)
     return unicodehelper.encode_utf8(space, u''.join(result),
-                                     allow_surrogates=False)
+                                     allow_surrogates=allow_surrogates)
 
 
 _repr_function, _ = make_unicode_escape_function(
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy py3k: Restore CPython compatibility for the super-important case of calling int() on a string containing lone surrogates

Reply via email to