Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95443:0cca4bcffdbf
Date: 2018-12-02 20:56 -0800
http://bitbucket.org/pypy/pypy/changeset/0cca4bcffdbf/
Log: test, fix for 'replace' error handler and short sequences
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -349,18 +349,21 @@
res = StringBuilder(slen)
pos = 0
end = len(s)
+ suppressing = False # we are in a chain of "bad" unicode, only emit one fix
while pos < end:
ordch1 = ord(s[pos])
# fast path for ASCII
if ordch1 <= 0x7F:
pos += 1
res.append(chr(ordch1))
+ suppressing = False
continue
if ordch1 <= 0xC1:
r, pos, rettype = errorhandler(errors, "utf8", "invalid start byte",
s, pos, pos + 1)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
pos += 1
@@ -372,14 +375,16 @@
break
r, pos, rettype = errorhandler(errors, "utf8", "unexpected end of data",
s, pos - 1, pos)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
ordch2 = ord(s[pos])
if rutf8._invalid_byte_2_of_2(ordch2):
r, pos, rettype = errorhandler(errors, "utf8", "invalid continuation byte",
s, pos - 1, pos)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
# 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
pos += 1
@@ -393,8 +398,9 @@
pos -= 1
break
r, pos, rettype = errorhandler(errors, "utf8", "unexpected end of data",
- s, pos - 1, pos + 1)
+ s, pos - 1, pos)
res.append(r)
+ suppressing = True
continue
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
@@ -402,12 +408,14 @@
if rutf8._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
r, pos, rettype = errorhandler(errors, "utf8", "invalid continuation byte",
s, pos - 1, pos)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
elif rutf8._invalid_byte_3_of_3(ordch3):
r, pos, rettype = errorhandler(errors, "utf8", "invalid continuation byte",
s, pos - 1, pos + 1)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
pos += 2
@@ -415,6 +423,7 @@
res.append(chr(ordch1))
res.append(chr(ordch2))
res.append(chr(ordch3))
+ suppressing = False
continue
if ordch1 <= 0xF4:
@@ -424,6 +433,8 @@
break
r, pos, rettype = errorhandler(errors, "utf8", "unexpected end of data",
s, pos - 1, pos)
+ res.append(r)
+ suppressing = True
continue
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
@@ -432,7 +443,8 @@
if rutf8._invalid_byte_2_of_4(ordch1, ordch2):
r, pos, rettype = errorhandler(errors, "utf8", "invalid continuation byte",
s, pos - 1, pos)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
elif rutf8._invalid_byte_3_of_4(ordch3):
r, pos, rettype = errorhandler(errors, "utf8", "invalid continuation byte",
@@ -442,7 +454,8 @@
elif rutf8._invalid_byte_4_of_4(ordch4):
r, pos, rettype = errorhandler(errors, "utf8", "invalid continuation byte",
s, pos - 1, pos + 2)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
pos += 3
@@ -451,11 +464,13 @@
res.append(chr(ordch2))
res.append(chr(ordch3))
res.append(chr(ordch4))
+ suppressing = False
continue
r, pos, rettype = errorhandler(errors, "utf8", "invalid start byte",
s, pos - 1, pos)
- res.append(r)
+ if not suppressing:
+ res.append(r)
r = res.build()
return r, rutf8.check_utf8(r, True), pos
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -261,12 +261,12 @@
'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
]
- FFFD = '\ufffd'
for seq in sequences:
bseq = bytes(int(c, 16) for c in seq.split())
exc = raises(UnicodeDecodeError, bseq.decode, 'utf-8')
assert 'unexpected end of data' in str(exc.value)
- assert bseq.decode('utf-8', 'replace') == u'\ufffd'
+ useq = bseq.decode('utf-8', 'replace')
+ assert useq == u'\ufffd', (bseq, useq)
assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'replace') ==
u'aaaa\ufffdbbbb')
assert bseq.decode('utf-8', 'ignore') == ''
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit