Author: Ronan Lamy <[email protected]>
Branch: py3k
Changeset: r85925:ffcb26465f79
Date: 2016-07-30 14:06 +0100
http://bitbucket.org/pypy/pypy/changeset/ffcb26465f79/
Log: Restore CPython compatibility for the super-important case of
calling int() on a string containing lone surrogates
diff --git a/pypy/objspace/std/intobject.py b/pypy/objspace/std/intobject.py
--- a/pypy/objspace/std/intobject.py
+++ b/pypy/objspace/std/intobject.py
@@ -871,14 +871,7 @@
return _from_intlike(space, w_inttype, space.trunc(w_value))
elif space.isinstance_w(w_value, space.w_unicode):
from pypy.objspace.std.unicodeobject import unicode_to_decimal_w
- try:
- b = unicode_to_decimal_w(space, w_value)
- except OperationError as e:
- if not e.match(space, space.w_UnicodeEncodeError):
- raise
- raise oefmt(space.w_ValueError,
- "int() called with a string containing a "
- "lone surrogate")
+ b = unicode_to_decimal_w(space, w_value, allow_surrogates=True)
return _string_to_int_or_long(space, w_inttype, w_value, b)
elif (space.isinstance_w(w_value, space.w_bytearray) or
space.isinstance_w(w_value, space.w_bytes)):
@@ -906,7 +899,7 @@
if space.isinstance_w(w_value, space.w_unicode):
from pypy.objspace.std.unicodeobject import unicode_to_decimal_w
- s = unicode_to_decimal_w(space, w_value)
+ s = unicode_to_decimal_w(space, w_value, allow_surrogates=True)
else:
try:
s = space.bufferstr_w(w_value)
diff --git a/pypy/objspace/std/test/test_longobject.py b/pypy/objspace/std/test/test_longobject.py
--- a/pypy/objspace/std/test/test_longobject.py
+++ b/pypy/objspace/std/test/test_longobject.py
@@ -422,6 +422,3 @@
assert a is not b
b -= 1
assert a is b
-
- def test_invalid_surrogate(self):
- raises(ValueError, int, u"\u8000")
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1269,11 +1269,11 @@
# In CPython3 the call to PyUnicode_EncodeDecimal has been replaced to a call
# to _PyUnicode_TransformDecimalAndSpaceToASCII, which is much simpler.
# We do that here plus the final step of encoding the result to utf-8.
-# This final step corresponds to encode_utf8 *without* allow_surrogates.
-# In float.__new__() and complex.__new__(), a lone surrogate will throw
-# an app-level UnicodeEncodeError. In long.__new__(), though, CPython3
-# gives inconsistently a ValueError, so we handle that case in intobject.py.
-def unicode_to_decimal_w(space, w_unistr):
+# This final step corresponds to encode_utf8. In float.__new__() and
+# complex.__new__(), a lone surrogate will throw an app-level
+# UnicodeEncodeError.
+
+def unicode_to_decimal_w(space, w_unistr, allow_surrogates=False):
if not isinstance(w_unistr, W_UnicodeObject):
raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
unistr = w_unistr._value
@@ -1290,7 +1290,7 @@
pass
result[i] = unichr(uchr)
return unicodehelper.encode_utf8(space, u''.join(result),
- allow_surrogates=False)
+ allow_surrogates=allow_surrogates)
_repr_function, _ = make_unicode_escape_function(
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit