Author: Yusuke Tsutsumi <yus...@tsutsumi.io> Branch: fix_test_codecs Changeset: r94703:c6a20c1af5c0 Date: 2018-05-26 21:56 -0700 http://bitbucket.org/pypy/pypy/changeset/c6a20c1af5c0/
Log: Addressing code review feedback on #612 * removing all changes to rpython, as nothing needs to change there to ensure pypy3 is python3.6 compliant. * adding tests for new behavior introduced in pypy3, to satisfy python3.6 behavior diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -3,7 +3,10 @@ import struct import sys from pypy.interpreter.unicodehelper import ( - encode_utf8, decode_utf8, unicode_encode_utf_32_be, str_decode_utf_32_be) + encode_utf8, decode_utf8, + unicode_encode_utf_8, + unicode_encode_utf_32_be, str_decode_utf_32_be +) from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp @@ -28,6 +31,35 @@ c = u"\udc00" py.test.raises(Hit, encode_utf8, space, u"\ud800" + c) + +def test_encode_utf_8_combine_surrogates(): + """ + In the case of a surrogate pair, the error handler should + return back a start and stop position of the full surrogate + pair (new behavior inherited from python3.6) + """ + u = u"\udc80\ud800\udfff" + + handler_num = 0 + + def errorhandler(errors, encoding, msg, s, start, end): + """ + This handler will be called twice, so asserting both times: + + 1. the first time, 0xDC80 will be handled as a single surrogate, + since it is a standalone character and an invalid surrogate. + 2. the second time, the characters will be 0xD800 and 0xDFFF, since + that is a valid surrogate pair. 
+ """ + assert s[start:end] in [u'\udc80', u'\uD800\uDFFF'] + return [], None, end + + unicode_encode_utf_8( + u, len(u), True, + errorhandler=errorhandler, + allow_surrogates=False + ) + def test_encode_utf8_allow_surrogates(): sp = FakeSpace() assert encode_utf8(sp, u"\ud800", allow_surrogates=True) == "\xed\xa0\x80" diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -796,6 +796,14 @@ test_sequence = before_sequence + ill_surrogate + after_sequence raises(UnicodeDecodeError, test_sequence.decode, encoding) + def test_lone_surrogates_utf_8(self): + """ + utf-8 should not longer allow surrogates, + and should return back full surrogate pairs. + """ + e = raises(UnicodeEncodeError, u"\udc80\ud800\udfff".encode, "utf-8") + assert e.object[e.start:e.end] == u'\ud800\udfff' + def test_charmap_encode(self): assert 'xxx'.encode('charmap') == b'xxx' diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ b/rpython/rlib/runicode.py @@ -361,27 +361,20 @@ else: # Encode UCS2 Unicode ordinals if ch < 0x10000: - # Special case: check for surrogates + # Special case: check for high surrogate if 0xD800 <= ch <= 0xDFFF: - error_start_pos = pos - 1 if pos != size: ch2 = ord(s[pos]) - # check if the first character is a high surrogate, - # and the second character is a low surrogate. If so, - # they should be handled collectively. - if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFFF: - # pos should be incremented regardless. - # by doing so, it ensures the lower surrogate - # is also included in the characters considered - # in the errorhandler. 
+ # Check for low surrogate and combine the two to + # form a UCS4 value + if ((allow_surrogates or MAXUNICODE < 65536 + or is_narrow_host()) and + ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF): + ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000 + assert ch3 >= 0 pos += 1 - # if we allow surrogates, we should combine - # the two and form a UCS4 value - if allow_surrogates or MAXUNICODE < 65535 or is_narrow_host(): - ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000 - assert ch3 >= 0 - _encodeUCS4(result, ch3) - continue + _encodeUCS4(result, ch3) + continue # note: if the program only ever calls this with # allow_surrogates=True, then we'll never annotate # the following block of code, and errorhandler() @@ -390,7 +383,7 @@ if not allow_surrogates or nonconst.NonConstant(False): ru, rs, pos = errorhandler(errors, 'utf8', 'surrogates not allowed', - s, error_start_pos, pos) + s, pos-1, pos) if rs is not None: # py3k only result.append(rs) @@ -401,7 +394,7 @@ else: errorhandler('strict', 'utf8', 'surrogates not allowed', - s, pos - 1 , pos) + s, pos-1, pos) continue # else: Fall through and handles isolated high surrogates result.append((chr((0xe0 | (ch >> 12))))) @@ -1442,11 +1435,10 @@ errorhandler = default_unicode_error_decode if size == 0: - return u'', 0, None + return u'', 0 builder = UnicodeBuilder(size) pos = 0 - first_escape_error_char = None while pos < size: ch = s[pos] @@ -1549,11 +1541,10 @@ message, s, pos-1, look+1) builder.append(res) else: - first_escape_error_char = unichr(ord(ch)) builder.append(u'\\') builder.append(unichr(ord(ch))) - return builder.build(), pos, first_escape_error_char + return builder.build(), pos def make_unicode_escape_function(pass_printable=False, unicode_output=False, quotes=False, prefix=None): diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py --- a/rpython/rlib/test/test_runicode.py +++ b/rpython/rlib/test/test_runicode.py @@ -963,32 +963,3 @@ py.test.raises( UnicodeEncodeError, 
runicode.unicode_encode_utf_8, u, len(u), True, allow_surrogates=False) - - def test_encode_utf_8_combine_surrogates(self): - """ - In the case of a surrogate pair, the error handler should - return back a start and stop position of the full surrogate - pair (new behavior inherited from python3.6) - """ - u = runicode.UNICHR(0xDC80) + runicode.UNICHR(0xD800) + \ - runicode.UNICHR(0xDFFF) - - handler_num = 0 - - def errorhandler(errors, encoding, msg, s, start, end): - """ - This handler will be called twice, so asserting both times: - - 1. the first time, 0xDC80 will be handled as a single surrogate, - since it is a standalone character and an invalid surrogate. - 2. the second time, the characters will be 0xD800 and 0xDFFF, since - that is a valid surrogate pair. - """ - assert s[start:end] in [u'\udc80', u'\uD800\uDFFF'] - return [], None, end - - runicode.unicode_encode_utf_8( - u, len(u), True, - errorhandler=errorhandler, - allow_surrogates=False - ) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit