Author: Matti Picus <[email protected]>
Branch: py3.6
Changeset: r96093:6090343709fa
Date: 2019-02-19 13:47 +0200
http://bitbucket.org/pypy/pypy/changeset/6090343709fa/
Log: test, fix for infinite encoding due to bad error handler
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,3 +1,5 @@
+import pytest
+
from pypy.interpreter.unicodehelper import (
utf8_encode_utf_8, decode_utf8sp,
)
@@ -16,13 +18,11 @@
def test_encode_utf_8_combine_surrogates():
"""
In the case of a surrogate pair, the error handler should
- return back a start and stop position of the full surrogate
- pair (new behavior inherited from python3.6)
+ be called with a start and stop position of the full surrogate
+ pair (new behavior in python3.6)
"""
u = u"\udc80\ud800\udfff"
- handler_num = 0
-
def errorhandler(errors, encoding, msg, s, start, end):
"""
This handler will be called twice, so asserting both times:
@@ -33,7 +33,7 @@
that is a valid surrogate pair.
"""
assert s[start:end] in [u'\udc80', u'\uD800\uDFFF']
- return '', 0, end
+ return '', end, 'b'
utf8_encode_utf_8(
u, 'strict',
@@ -41,6 +41,24 @@
allow_surrogates=False
)
+def test_bad_error_handler():
+ u = u"\udc80\ud800\udfff"
+
+ def errorhandler(errors, encoding, msg, s, start, end):
+ """
+ Deliberately broken handler: it returns `start` instead of `end` as
+ the position to resume from, so the encoder can make no progress.
+
+ Before this fix utf8_encode_utf_8 would loop forever on such a handler;
+ it is now expected to raise IndexError (checked by pytest.raises below),
+ so presumably only the first lone surrogate, 0xDC80, is ever passed in.
+ """
+ assert s[start:end] in [u'\udc80', u'\uD800\uDFFF']
+ return '', start, 'b'
+
+ assert pytest.raises(IndexError, utf8_encode_utf_8, u, 'strict',
+ errorhandler=errorhandler, allow_surrogates=False)
+
def test_decode_utf8sp():
space = FakeSpace()
assert decode_utf8sp(space, "\xed\xa0\x80") == ("\xed\xa0\x80", 1, 3)
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -254,6 +254,10 @@
else:
for ch in res:
result.append(ch)
+ if newindex <= upos:
+ raise IndexError(
+ "position %d from error handler invalid, already encoded %d",
+ newindex, upos)
upos = newindex
pos = rutf8._pos_at_index(s, upos)
return result.build()
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -732,8 +732,11 @@
if errors is None:
errors = 'strict'
state = space.fromcache(CodecState)
- result = unicodehelper.utf8_encode_utf_8(utf8, errors,
+ try:
+ result = unicodehelper.utf8_encode_utf_8(utf8, errors,
state.encode_error_handler, allow_surrogates=False)
+ except IndexError as e:
+ raise oefmt(space.w_IndexError, e.args[0])
return space.newtuple([space.newbytes(result), space.newint(lgt)])
@unwrap_spec(string='bufferstr', errors='text_or_none',
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit