Author: Armin Rigo <[email protected]>
Branch: py3k
Changeset: r85883:f1508f8d4bf6
Date: 2016-07-27 20:36 +0200
http://bitbucket.org/pypy/pypy/changeset/f1508f8d4bf6/
Log: Fix int("\ud800") and float("\ud800")
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -141,9 +141,7 @@
return result
def encode_utf8(space, uni, allow_surrogates=False):
- # Note that this function never raises UnicodeEncodeError,
- # since surrogate pairs are allowed.
- # This is not the case with Python3.
+ # Note that Python3 tends to forbid lone surrogates
return runicode.unicode_encode_utf_8(
uni, len(uni), "strict",
errorhandler=encode_error_handler(space),
diff --git a/pypy/objspace/std/intobject.py b/pypy/objspace/std/intobject.py
--- a/pypy/objspace/std/intobject.py
+++ b/pypy/objspace/std/intobject.py
@@ -871,8 +871,15 @@
return _from_intlike(space, w_inttype, space.trunc(w_value))
elif space.isinstance_w(w_value, space.w_unicode):
from pypy.objspace.std.unicodeobject import unicode_to_decimal_w
- return _string_to_int_or_long(space, w_inttype, w_value,
- unicode_to_decimal_w(space, w_value))
+ try:
+ b = unicode_to_decimal_w(space, w_value)
+ except OperationError as e:
+ if not e.match(space, space.w_UnicodeEncodeError):
+ raise
+ raise oefmt(space.w_ValueError,
+ "int() called with a string containing a "
+ "lone surrogate")
+ return _string_to_int_or_long(space, w_inttype, w_value, b)
elif (space.isinstance_w(w_value, space.w_bytearray) or
space.isinstance_w(w_value, space.w_bytes)):
return _string_to_int_or_long(space, w_inttype, w_value,
diff --git a/pypy/objspace/std/test/test_floatobject.py b/pypy/objspace/std/test/test_floatobject.py
--- a/pypy/objspace/std/test/test_floatobject.py
+++ b/pypy/objspace/std/test/test_floatobject.py
@@ -149,6 +149,8 @@
assert float(memoryview(b"inf")) == inf
assert float(bytearray(b"inf")) == inf
+ raises(UnicodeEncodeError, float, u"\ud800")
+
def test_float_unicode(self):
# u00A0 and u2000 are some kind of spaces
assert 42.75 == float(chr(0x00A0)+str("42.75")+chr(0x2000))
diff --git a/pypy/objspace/std/test/test_longobject.py b/pypy/objspace/std/test/test_longobject.py
--- a/pypy/objspace/std/test/test_longobject.py
+++ b/pypy/objspace/std/test/test_longobject.py
@@ -415,3 +415,6 @@
assert a is not b
b -= 1
assert a is b
+
+ def test_invalid_surrogate(self):
+ raises(ValueError, int, u"\ud800")  # lone surrogate: int() must give ValueError, not UnicodeEncodeError
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1264,8 +1264,12 @@
# using the same logic as PyUnicode_EncodeDecimal, as CPython 2.7 does.
#
# In CPython3 the call to PyUnicode_EncodeDecimal has been replaced to a call
-# to PyUnicode_TransformDecimalToASCII, which is much simpler. Here, we do the
-# equivalent plus the final step of encoding the result to utf-8.
+# to _PyUnicode_TransformDecimalAndSpaceToASCII, which is much simpler.
+# We do that here plus the final step of encoding the result to utf-8.
+# This final step corresponds to encode_utf8 *without* allow_surrogates.
+# In float.__new__() and complex.__new__(), a lone surrogate will throw
+# an app-level UnicodeEncodeError. In long.__new__(), though, CPython3
+# inconsistently raises a ValueError, so we handle that case in intobject.py.
def unicode_to_decimal_w(space, w_unistr):
if not isinstance(w_unistr, W_UnicodeObject):
raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
@@ -1282,7 +1286,8 @@
except KeyError:
pass
result[i] = unichr(uchr)
- return unicodehelper.encode_utf8(space, u''.join(result), allow_surrogates=True)
+ return unicodehelper.encode_utf8(space, u''.join(result),
+ allow_surrogates=False)
_repr_function, _ = make_unicode_escape_function(
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit