Author: Brian Kearns <[email protected]>
Branch: stdlib-2.7.6
Changeset: r69607:a3e0260e2eea
Date: 2014-03-02 05:06 -0500
http://bitbucket.org/pypy/pypy/changeset/a3e0260e2eea/
Log: fix unicode_decode_escape behavior
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -315,6 +315,28 @@
raises(ValueError, br"\x0".decode, 'string_escape')
raises(ValueError, br"[\x0]".decode, 'string_escape')
+ def test_unicode_escape_decode_errors(self):
+ from _codecs import unicode_escape_decode, raw_unicode_escape_decode
+ for decode in [unicode_escape_decode, raw_unicode_escape_decode]:
+ for c, d in ('u', 4), ('U', 4):
+ for i in range(d):
+ raises(UnicodeDecodeError, decode,
+ "\\" + c + "0"*i)
+ raises(UnicodeDecodeError, decode,
+ "[\\" + c + "0"*i + "]")
+ data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
+ assert decode(data, "ignore") == (u"[]", len(data))
+ assert decode(data, "replace") == (u"[\ufffd]\ufffd", len(data))
+ raises(UnicodeDecodeError, decode, r"\U00110000")
+ assert decode(r"\U00110000", "ignore") == (u"", 10)
+ assert decode(r"\U00110000", "replace") == (u"\ufffd", 10)
+ exc = raises(UnicodeDecodeError, unicode_escape_decode, "\u1z32z3", 'strict')
+ assert str(exc.value) == "'unicodeescape' codec can't decode bytes in position 0-2: truncated \uXXXX escape"
+ exc = raises(UnicodeDecodeError, raw_unicode_escape_decode, "\u1z32z3", 'strict')
+ assert str(exc.value) == "'rawunicodeescape' codec can't decode bytes in position 0-2: truncated \uXXXX"
+ exc = raises(UnicodeDecodeError, raw_unicode_escape_decode, "\U1z32z3", 'strict')
+ assert str(exc.value) == "'rawunicodeescape' codec can't decode bytes in position 0-2: truncated \uXXXX"
+
def test_escape_encode(self):
assert '"'.encode('string_escape') == '"'
assert "'".encode('string_escape') == "\\'"
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -1126,9 +1126,11 @@
encoding, errorhandler, message, errors):
chr = 0
if pos + digits > len(s):
- message = "end of string in escape sequence"
- res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-2, len(s))
+ endinpos = pos
+ while endinpos < len(s) and s[endinpos] in hexdigits:
+ endinpos += 1
+ res, pos = errorhandler(errors, encoding,
+ message, s, pos-2, endinpos)
builder.append(res)
else:
try:
@@ -1138,7 +1140,7 @@
while s[endinpos] in hexdigits:
endinpos += 1
res, pos = errorhandler(errors, encoding,
- message, s, pos-2, endinpos+1)
+ message, s, pos-2, endinpos)
builder.append(res)
else:
# when we get here, chr is a 32-bit unicode character
@@ -1443,12 +1445,8 @@
pos += 1
continue
- if s[pos] == 'u':
- digits = 4
- message = "truncated \\uXXXX escape"
- else:
- digits = 8
- message = "truncated \\UXXXXXXXX escape"
+ digits = 4 if s[pos] == 'u' else 8
+ message = "truncated \\uXXXX"
pos += 1
pos = hexescape(result, s, pos, digits,
"rawunicodeescape", errorhandler, message, errors)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit