[pypy-commit] pypy unicode-utf8-py3: test, fix for suppressing multiple error output bytes

mattip Tue, 22 Jan 2019 02:38:07 -0800

Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95692:d529e654cfb5
Date: 2019-01-22 12:36 +0200
http://bitbucket.org/pypy/pypy/changeset/d529e654cfb5/


Log:    test, fix for suppressing multiple error output bytes

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -398,9 +398,9 @@
                     msg = "invalid continuation byte"
                 else:
                     msg = "unexpected end of data"
+                    suppressing = True
                 r, pos, rettype = errorhandler(errors, "utf8", msg, s, pos - 1, pos)
                 res.append(r)
-                suppressing = True
                 continue
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
@@ -437,9 +437,9 @@
                     msg = "invalid continuation byte"
                 else:
                     msg = "unexpected end of data"
+                suppressing = True
                 r, pos, rettype = errorhandler(errors, "utf8", msg, s, pos - 1, pos)
                 res.append(r)
-                suppressing = True
                 continue
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -272,6 +272,73 @@
             assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'ignore') == 
                     u'aaaabbbb')
 
+    def test_invalid_cb_for_3bytes_seq(self):
+        """
+        Test that an 'invalid continuation byte' error is raised when the
+        continuation byte(s) of a 3-bytes sequence are invalid.  When
+        errors='replace', if the first continuation byte is valid, the first
+        two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
+        third byte is handled separately, otherwise only the start byte is
+        replaced with a U+FFFD and the other continuation bytes are handled
+        separately.
+        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
+        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
+        because it's the ASCII letter 'A'.
+        Note: when the start byte is E0 or ED, the valid ranges for the first
+        continuation byte are limited to A0..BF and 80..9F respectively.
+        Python 2 used to consider all the bytes in range 80..BF valid when the
+        start byte was ED.  This is fixed in Python 3.
+        """
+        FFFD = '\ufffd'
+        FFFDx2 = FFFD * 2
+        sequences = [
+            ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
+            ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
+            ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
+            ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
+            ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
+            ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
+            ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
+            ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
+            ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
+            ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
+            ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
+            ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
+            ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
+            ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
+            ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
+            ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
+            ('ED 7F', FFFD+'\x7f'),
+            ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
+            ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
+            ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
+            ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
+            ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
+            ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
+            ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
+            ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
+            ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
+            ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
+            ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
+            ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
+            ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
+            ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
+            ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
+            ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
+        ]
+        err = 'invalid continuation byte'
+        for s, res in sequences:
+            seq = bytes(int(c, 16) for c in s.split())
+            print(seq, [hex(ord(c)) for c in res])
+            exc = raises(UnicodeDecodeError, seq.decode, 'utf-8')
+            assert err in str(exc.value)
+            assert seq.decode('utf-8', 'replace') == res
+            assert ((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace') == 
+                         'aaaa' + res + 'bbbb')
+            res = res.replace('\ufffd', '')
+            assert seq.decode('utf-8', 'ignore') == res
+            assert((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore') ==
+                          'aaaa' + res + 'bbbb')
 
 class AppTestPartialEvaluation:
     spaceconfig = dict(usemodules=['array',])
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-py3: test, fix for suppressing multiple error output bytes

Reply via email to