Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95692:d529e654cfb5
Date: 2019-01-22 12:36 +0200
http://bitbucket.org/pypy/pypy/changeset/d529e654cfb5/
Log: test, fix for suppressing multiple error output bytes
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -398,9 +398,9 @@
msg = "invalid continuation byte"
else:
msg = "unexpected end of data"
+ suppressing = True
r, pos, rettype = errorhandler(errors, "utf8", msg, s, pos - 1, pos)
res.append(r)
- suppressing = True
continue
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
@@ -437,9 +437,9 @@
msg = "invalid continuation byte"
else:
msg = "unexpected end of data"
+ suppressing = True
r, pos, rettype = errorhandler(errors, "utf8", msg, s, pos - 1, pos)
res.append(r)
- suppressing = True
continue
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -272,6 +272,73 @@
assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'ignore') ==
u'aaaabbbb')
+ def test_invalid_cb_for_3bytes_seq(self):
+ """
+ Test that an 'invalid continuation byte' error is raised when the
+ continuation byte(s) of a 3-bytes sequence are invalid. When
+ errors='replace', if the first continuation byte is valid, the first
+ two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
+ third byte is handled separately, otherwise only the start byte is
+ replaced with a U+FFFD and the other continuation bytes are handled
+ separately.
+ E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
+ sequence, 80 is a valid continuation byte, but 41 is not a valid cb
+ because it's the ASCII letter 'A'.
+ Note: when the start byte is E0 or ED, the valid ranges for the first
+ continuation byte are limited to A0..BF and 80..9F respectively.
+ Python 2 used to consider all the bytes in range 80..BF valid when the
+ start byte was ED. This is fixed in Python 3.
+ """
+ FFFD = '\ufffd'
+ FFFDx2 = FFFD * 2
+ sequences = [
+ ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
+ ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
+ ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
+ ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
+ ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
+ ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
+ ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
+ ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
+ ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
+ ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
+ ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
+ ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
+ ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
+ ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
+ ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
+ ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
+ ('ED 7F', FFFD+'\x7f'),
+ ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
+ ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
+ ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
+ ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
+ ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
+ ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
+ ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
+ ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
+ ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
+ ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
+ ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
+ ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
+ ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
+ ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
+ ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
+ ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
+ ]
+ err = 'invalid continuation byte'
+ for s, res in sequences:
+ seq = bytes(int(c, 16) for c in s.split())
+ print(seq, [hex(ord(c)) for c in res])
+ exc = raises(UnicodeDecodeError, seq.decode, 'utf-8')
+ assert err in str(exc.value)
+ assert seq.decode('utf-8', 'replace') == res
+ assert ((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace') ==
+ 'aaaa' + res + 'bbbb')
+ res = res.replace('\ufffd', '')
+ assert seq.decode('utf-8', 'ignore') == res
+ assert((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore') ==
+ 'aaaa' + res + 'bbbb')
class AppTestPartialEvaluation:
spaceconfig = dict(usemodules=['array',])
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit