Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95402:ce732b3c6929
Date: 2018-12-02 08:27 -0800
http://bitbucket.org/pypy/pypy/changeset/ce732b3c6929/

Log:    add failing test

diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -148,6 +148,40 @@
         lgt = 12
         assert unicode_escape_decode(b'\\x61\\x62\\x63') == ('abc', lgt)
 
+    def test_unexpected_end_of_data(self):
+        """
+        Test that an 'unexpected end of data' error is raised when the string
+        ends after a start byte of a 2-, 3-, or 4-byte sequence without having
+        enough continuation bytes.  The incomplete sequence is replaced with a
+        single U+FFFD when errors='replace'.
+        E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-byte
+        sequence, but it's followed by only 2 valid continuation bytes and the
+        last continuation byte is missing.
+        Note: the continuation bytes must be all valid, if one of them is
+        invalid another error will be raised.
+        """
+        sequences = [
+            'C2', 'DF',
+            'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
+            'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
+            'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
+            'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
+            'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
+            'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
+        ]
+        FFFD = '\ufffd'
+        for seq in sequences:
+            print(seq)
+            bseq = bytes(int(c, 16) for c in seq.split())
+            exc = raises(UnicodeDecodeError, bseq.decode, 'utf-8')
+            assert 'unexpected end of data' in str(exc.value)
+            assert bseq.decode('utf-8', 'replace') == u'\ufffd'
+            assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'replace') == 
+                    u'aaaa\ufffdbbbb')
+            assert bseq.decode('utf-8', 'ignore') == ''
+            assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'ignore') == 
+                    u'aaaabbbb')
+            
 
 class AppTestPartialEvaluation:
     spaceconfig = dict(usemodules=['array',])
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to