Author: Matti Picus <[email protected]>
Branch: py3.6
Changeset: r96093:6090343709fa
Date: 2019-02-19 13:47 +0200
http://bitbucket.org/pypy/pypy/changeset/6090343709fa/

Log:    test, fix for infinite encoding due to bad error handler

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,3 +1,5 @@
+import pytest
+
 from pypy.interpreter.unicodehelper import (
     utf8_encode_utf_8, decode_utf8sp,
 )
@@ -16,13 +18,11 @@
 def test_encode_utf_8_combine_surrogates():
     """
     In the case of a surrogate pair, the error handler should
-    return back a start and stop position of the full surrogate
-    pair (new behavior inherited from python3.6)
+    be called with a start and stop position of the full surrogate
+    pair (new behavior in python3.6)
     """
     u = u"\udc80\ud800\udfff"
 
-    handler_num = 0
-
     def errorhandler(errors, encoding, msg, s, start, end):
         """
         This handler will be called twice, so asserting both times:
@@ -33,7 +33,7 @@
            that is a valid surrogate pair.
         """
         assert s[start:end] in [u'\udc80', u'\uD800\uDFFF']
-        return '', 0, end
+        return '', end, 'b'
 
     utf8_encode_utf_8(
         u, 'strict',
@@ -41,6 +41,24 @@
         allow_surrogates=False
     )
 
+def test_bad_error_handler():
+    u = u"\udc80\ud800\udfff"
+
+    def errorhandler(errors, encoding, msg, s, start, end):
+        """
+        This handler will be called twice, so asserting both times:
+
+        1. the first time, 0xDC80 will be handled as a single surrogate,
+           since it is a standalone character and an invalid surrogate.
+        2. the second time, the characters will be 0xD800 and 0xDFFF, since
+           that is a valid surrogate pair.
+        """
+        assert s[start:end] in [u'\udc80', u'\uD800\uDFFF']
+        return '', start, 'b'
+
+    assert pytest.raises(IndexError, utf8_encode_utf_8, u, 'strict',
+                  errorhandler=errorhandler, allow_surrogates=False)
+
 def test_decode_utf8sp():
     space = FakeSpace()
     assert decode_utf8sp(space, "\xed\xa0\x80") == ("\xed\xa0\x80", 1, 3)
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -254,6 +254,10 @@
             else:
                 for ch in res:
                     result.append(ch)
+            if newindex <= upos:
+                raise IndexError(
+                   "position %d from error handler invalid, already encoded %d",
+                   newindex, upos)
             upos = newindex
             pos = rutf8._pos_at_index(s, upos)
     return result.build()
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -732,8 +732,11 @@
     if errors is None:
         errors = 'strict'
     state = space.fromcache(CodecState)
-    result = unicodehelper.utf8_encode_utf_8(utf8, errors,
+    try:
+        result = unicodehelper.utf8_encode_utf_8(utf8, errors,
                      state.encode_error_handler, allow_surrogates=False)
+    except IndexError as e:
+        raise oefmt(space.w_IndexError, e.args[0])
     return space.newtuple([space.newbytes(result), space.newint(lgt)])
 
 @unwrap_spec(string='bufferstr', errors='text_or_none',
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to