Author: Yusuke Tsutsumi <yus...@tsutsumi.io> Branch: fix_test_codecs Changeset: r94699:6d4fc7830371 Date: 2018-05-20 06:34 -0700 http://bitbucket.org/pypy/pypy/changeset/6d4fc7830371/
Log: Adding low surrogates into errorhandlers for utf-8 encoding. In CPython 3.6, behavior was defined that includes low surrogates when handling Unicode encoding errors. This change adds that behavior. References: https://hg.python.org/cpython/rev/2b5357b38366 https://bugs.python.org/issue25267 diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ b/rpython/rlib/runicode.py @@ -361,20 +361,27 @@ else: # Encode UCS2 Unicode ordinals if ch < 0x10000: - # Special case: check for high surrogate + # Special case: check for surrogates if 0xD800 <= ch <= 0xDFFF: + error_start_pos = pos - 1 if pos != size: ch2 = ord(s[pos]) - # Check for low surrogate and combine the two to - # form a UCS4 value - if ((allow_surrogates or MAXUNICODE < 65536 - or is_narrow_host()) and - ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF): - ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000 - assert ch3 >= 0 + # check if the first character is a high surrogate, + # and the second character is a low surrogate. If so, + # they should be handled collectively. + if ch <= 0xDBFF and 0xDC80 <= ch2 <= 0xDFFFF: + # pos should be incremented regardless. + # by doing so, it ensures the lower surrogate + # is also included in the characters considered + # in the errorhandler. 
pos += 1 - _encodeUCS4(result, ch3) - continue + # if we allow surrogates, we should combine + # the two and form a UCS4 value + if allow_surrogates or MAXUNICODE < 65535 or is_narrow_host(): + ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000 + assert ch3 >= 0 + _encodeUCS4(result, ch3) + continue # note: if the program only ever calls this with # allow_surrogates=True, then we'll never annotate # the following block of code, and errorhandler() @@ -383,7 +390,7 @@ if not allow_surrogates or nonconst.NonConstant(False): ru, rs, pos = errorhandler(errors, 'utf8', 'surrogates not allowed', - s, pos-1, pos) + s, error_start_pos, pos) if rs is not None: # py3k only result.append(rs) @@ -394,7 +401,7 @@ else: errorhandler('strict', 'utf8', 'surrogates not allowed', - s, pos-1, pos) + s, pos - 1 , pos) continue # else: Fall through and handles isolated high surrogates result.append((chr((0xe0 | (ch >> 12))))) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit