Author: Yusuke Tsutsumi <yus...@tsutsumi.io> Branch: fix_test_codecs Changeset: r94702:c2a3d03741e2 Date: 2018-05-25 23:09 -0700 http://bitbucket.org/pypy/pypy/changeset/c2a3d03741e2/
Log: Adding unit test for new behavior in runicode. Fixing bug. Fixing a bug in the UTF-8 handling, which did not consider low surrogates below 0xDC80. Adding unit tests for the new behavior in runicode, which combines high and low surrogates into a single errorhandler call. diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -199,7 +199,7 @@ # check if the first character is a high surrogate, # and the second character is a low surrogate. If so, # they should be handled collectively. - if ch <= 0xDBFF and 0xDC80 <= ch2 <= 0xDFFFF: + if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFFF: # pos should be incremented regardless. # by doing so, it ensures the lower surrogate # is also included in the characters considered diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ b/rpython/rlib/runicode.py @@ -369,7 +369,7 @@ # check if the first character is a high surrogate, # and the second character is a low surrogate. If so, # they should be handled collectively. - if ch <= 0xDBFF and 0xDC80 <= ch2 <= 0xDFFFF: + if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFFF: # pos should be incremented regardless. 
# by doing so, it ensures the lower surrogate # is also included in the characters considered diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py --- a/rpython/rlib/test/test_runicode.py +++ b/rpython/rlib/test/test_runicode.py @@ -963,3 +963,32 @@ py.test.raises( UnicodeEncodeError, runicode.unicode_encode_utf_8, u, len(u), True, allow_surrogates=False) + + def test_encode_utf_8_combine_surrogates(self): + """ + In the case of a surrogate pair, the error handler should + return back a start and stop position of the full surrogate + pair (new behavior inherited from python3.6) + """ + u = runicode.UNICHR(0xDC80) + runicode.UNICHR(0xD800) + \ + runicode.UNICHR(0xDFFF) + + handler_num = 0 + + def errorhandler(errors, encoding, msg, s, start, end): + """ + This handler will be called twice, so asserting both times: + + 1. the first time, 0xDC80 will be handled as a single surrogate, + since it is a standalone character and an invalid surrogate. + 2. the second time, the characters will be 0xD800 and 0xDFFF, since + that is a valid surrogate pair. + """ + assert s[start:end] in [u'\udc80', u'\uD800\uDFFF'] + return [], None, end + + runicode.unicode_encode_utf_8( + u, len(u), True, + errorhandler=errorhandler, + allow_surrogates=False + ) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit