Author: Brian Kearns <[email protected]>
Branch: stdlib-2.7.6
Changeset: r69607:a3e0260e2eea
Date: 2014-03-02 05:06 -0500
http://bitbucket.org/pypy/pypy/changeset/a3e0260e2eea/

Log:    fix unicode_decode_escape behavior

diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -315,6 +315,28 @@
         raises(ValueError, br"\x0".decode, 'string_escape')
         raises(ValueError, br"[\x0]".decode, 'string_escape')
 
+    def test_unicode_escape_decode_errors(self):
+        from _codecs import unicode_escape_decode, raw_unicode_escape_decode
+        for decode in [unicode_escape_decode, raw_unicode_escape_decode]:
+            for c, d in ('u', 4), ('U', 4):
+                for i in range(d):
+                    raises(UnicodeDecodeError, decode,
+                                      "\\" + c + "0"*i)
+                    raises(UnicodeDecodeError, decode,
+                                      "[\\" + c + "0"*i + "]")
+                    data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
+                    assert decode(data, "ignore") == (u"[]", len(data))
+                    assert decode(data, "replace") == (u"[\ufffd]\ufffd", len(data))
+            raises(UnicodeDecodeError, decode, r"\U00110000")
+            assert decode(r"\U00110000", "ignore") == (u"", 10)
+            assert decode(r"\U00110000", "replace") == (u"\ufffd", 10)
+        exc = raises(UnicodeDecodeError, unicode_escape_decode, "\u1z32z3", 'strict')
+        assert str(exc.value) == "'unicodeescape' codec can't decode bytes in position 0-2: truncated \uXXXX escape"
+        exc = raises(UnicodeDecodeError, raw_unicode_escape_decode, "\u1z32z3", 'strict')
+        assert str(exc.value) == "'rawunicodeescape' codec can't decode bytes in position 0-2: truncated \uXXXX"
+        exc = raises(UnicodeDecodeError, raw_unicode_escape_decode, "\U1z32z3", 'strict')
+        assert str(exc.value) == "'rawunicodeescape' codec can't decode bytes in position 0-2: truncated \uXXXX"
+
     def test_escape_encode(self):
         assert '"'.encode('string_escape') == '"'
         assert "'".encode('string_escape') == "\\'"
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -1126,9 +1126,11 @@
               encoding, errorhandler, message, errors):
     chr = 0
     if pos + digits > len(s):
-        message = "end of string in escape sequence"
-        res, pos = errorhandler(errors, "unicodeescape",
-                                message, s, pos-2, len(s))
+        endinpos = pos
+        while endinpos < len(s) and s[endinpos] in hexdigits:
+            endinpos += 1
+        res, pos = errorhandler(errors, encoding,
+                                message, s, pos-2, endinpos)
         builder.append(res)
     else:
         try:
@@ -1138,7 +1140,7 @@
             while s[endinpos] in hexdigits:
                 endinpos += 1
             res, pos = errorhandler(errors, encoding,
-                                    message, s, pos-2, endinpos+1)
+                                    message, s, pos-2, endinpos)
             builder.append(res)
         else:
             # when we get here, chr is a 32-bit unicode character
@@ -1443,12 +1445,8 @@
             pos += 1
             continue
 
-        if s[pos] == 'u':
-            digits = 4
-            message = "truncated \\uXXXX escape"
-        else:
-            digits = 8
-            message = "truncated \\UXXXXXXXX escape"
+        digits = 4 if s[pos] == 'u' else 8
+        message = "truncated \\uXXXX"
         pos += 1
         pos = hexescape(result, s, pos, digits,
                         "rawunicodeescape", errorhandler, message, errors)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to