Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95443:0cca4bcffdbf
Date: 2018-12-02 20:56 -0800
http://bitbucket.org/pypy/pypy/changeset/0cca4bcffdbf/

Log:    Add a test and a fix for the 'replace' error handler on truncated (short) UTF-8 sequences

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -349,18 +349,21 @@
     res = StringBuilder(slen)
     pos = 0
     end = len(s)
+    suppressing = False # we are in a chain of "bad" unicode, only emit one fix
     while pos < end:
         ordch1 = ord(s[pos])
         # fast path for ASCII
         if ordch1 <= 0x7F:
             pos += 1
             res.append(chr(ordch1))
+            suppressing = False
             continue
 
         if ordch1 <= 0xC1:
             r, pos, rettype = errorhandler(errors, "utf8", "invalid start 
byte",
                     s, pos, pos + 1)
-            res.append(r)
+            if not suppressing:
+                res.append(r)
             continue
 
         pos += 1
@@ -372,14 +375,16 @@
                     break
                 r, pos, rettype = errorhandler(errors, "utf8", "unexpected end 
of data",
                     s, pos - 1, pos)
-                res.append(r)
+                if not suppressing:
+                    res.append(r)
                 continue
             ordch2 = ord(s[pos])
 
             if rutf8._invalid_byte_2_of_2(ordch2):
                 r, pos, rettype = errorhandler(errors, "utf8", "invalid 
continuation byte",
                     s, pos - 1, pos)
-                res.append(r)
+                if not suppressing:
+                    res.append(r)
                 continue
             # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
             pos += 1
@@ -393,8 +398,9 @@
                     pos -= 1
                     break
                 r, pos, rettype = errorhandler(errors, "utf8", "unexpected end 
of data",
-                    s, pos - 1, pos + 1)
+                    s, pos - 1, pos)
                 res.append(r)
+                suppressing = True
                 continue
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
@@ -402,12 +408,14 @@
             if rutf8._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
                 r, pos, rettype = errorhandler(errors, "utf8", "invalid 
continuation byte",
                     s, pos - 1, pos)
-                res.append(r)
+                if not suppressing:
+                    res.append(r)
                 continue
             elif rutf8._invalid_byte_3_of_3(ordch3):
                 r, pos, rettype = errorhandler(errors, "utf8", "invalid 
continuation byte",
                     s, pos - 1, pos + 1)
-                res.append(r)
+                if not suppressing:
+                    res.append(r)
                 continue
             pos += 2
 
@@ -415,6 +423,7 @@
             res.append(chr(ordch1))
             res.append(chr(ordch2))
             res.append(chr(ordch3))
+            suppressing = False
             continue
 
         if ordch1 <= 0xF4:
@@ -424,6 +433,8 @@
                     break
                 r, pos, rettype = errorhandler(errors, "utf8", "unexpected end 
of data",
                     s, pos - 1, pos)
+                res.append(r)
+                suppressing = True
                 continue
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
@@ -432,7 +443,8 @@
             if rutf8._invalid_byte_2_of_4(ordch1, ordch2):
                 r, pos, rettype = errorhandler(errors, "utf8", "invalid 
continuation byte",
                     s, pos - 1, pos)
-                res.append(r)
+                if not suppressing:
+                    res.append(r)
                 continue
             elif rutf8._invalid_byte_3_of_4(ordch3):
                 r, pos, rettype = errorhandler(errors, "utf8", "invalid 
continuation byte",
@@ -442,7 +454,8 @@
             elif rutf8._invalid_byte_4_of_4(ordch4):
                 r, pos, rettype = errorhandler(errors, "utf8", "invalid 
continuation byte",
                     s, pos - 1, pos + 2)
-                res.append(r)
+                if not suppressing:
+                    res.append(r)
                 continue
 
             pos += 3
@@ -451,11 +464,13 @@
             res.append(chr(ordch2))
             res.append(chr(ordch3))
             res.append(chr(ordch4))
+            suppressing = False
             continue
 
         r, pos, rettype = errorhandler(errors, "utf8", "invalid start byte",
                 s, pos - 1, pos)
-        res.append(r)
+        if not suppressing:
+            res.append(r)
 
     r = res.build()
     return r, rutf8.check_utf8(r, True), pos
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -261,12 +261,12 @@
             'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
             'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
         ]
-        FFFD = '\ufffd'
         for seq in sequences:
             bseq = bytes(int(c, 16) for c in seq.split())
             exc = raises(UnicodeDecodeError, bseq.decode, 'utf-8')
             assert 'unexpected end of data' in str(exc.value)
-            assert bseq.decode('utf-8', 'replace') == u'\ufffd'
+            useq = bseq.decode('utf-8', 'replace')
+            assert  useq == u'\ufffd', (bseq, useq)
             assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'replace') == 
                     u'aaaa\ufffdbbbb')
             assert bseq.decode('utf-8', 'ignore') == ''
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to