Author: Armin Rigo <[email protected]>
Branch: py3.5
Changeset: r91665:b86de9385d47
Date: 2017-07-02 16:12 +0200
http://bitbucket.org/pypy/pypy/changeset/b86de9385d47/

Log:    Issue #2598

        Try to interpret a byte string for '%s' like a utf-8 string. But
        don't crash if it is not valid UTF-8; instead use the "replace"
        error handler.

diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py
--- a/pypy/interpreter/error.py
+++ b/pypy/interpreter/error.py
@@ -10,6 +10,7 @@
 from rpython.rlib.objectmodel import dont_inline
 from rpython.rlib import rstack, rstackovf
 from rpython.rlib import rwin32
+from rpython.rlib import runicode
 
 from pypy.interpreter import debug
 
@@ -468,6 +469,14 @@
     assert len(formats) > 0, "unsupported: no % command found"
     return tuple(parts), tuple(formats)
 
+def _decode_utf8(string):
+    # when building the error message, don't crash if the byte string
+    # provided is not valid UTF-8
+    assert isinstance(string, str)
+    result, consumed = runicode.str_decode_utf_8(
+        string, len(string), "replace", final=True)
+    return result
+
 def get_operrcls2(valuefmt):
     valuefmt = valuefmt.decode('ascii')
     strings, formats = decompose_valuefmt(valuefmt)
@@ -499,13 +508,16 @@
                     elif fmt == 'S':
                         result = space.unicode_w(space.str(value))
                     elif fmt == 'T':
-                        result = space.type(value).name.decode('utf-8')
+                        result = _decode_utf8(space.type(value).name)
                     elif fmt == 'N':
                         result = value.getname(space)
                     elif fmt == '8':
-                        result = value.decode('utf-8')
+                        result = _decode_utf8(value)
                     else:
-                        result = unicode(value)
+                        if isinstance(value, unicode):
+                            result = value
+                        else:
+                            result = _decode_utf8(str(value))
                     lst[i + i + 1] = result
                 lst[-1] = self.xstrings[-1]
                 return u''.join(lst)
diff --git a/pypy/interpreter/test/test_error.py b/pypy/interpreter/test/test_error.py
--- a/pypy/interpreter/test/test_error.py
+++ b/pypy/interpreter/test/test_error.py
@@ -92,6 +92,20 @@
     operr = oefmt("w_type", "abc %8", arg)
     val = operr._compute_value(space)
     assert val == u"abc àèìòù"
+    #
+    # if the arg is a byte string and we specify '%s', then we
+    # also get utf-8 encoding.  This should be the common case
+    # nowadays with utf-8 byte strings being common in the RPython
+    # sources of PyPy.
+    operr = oefmt("w_type", "abc %s", arg)
+    val = operr._compute_value(space)
+    assert val == u"abc àèìòù"
+    #
+    # if the byte string is not valid utf-8, then don't crash
+    arg = '\xe9'
+    operr = oefmt("w_type", "abc %8", arg)
+    val = operr._compute_value(space)
+
 
 def test_errorstr(space):
     operr = OperationError(space.w_ValueError, space.wrap("message"))
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to