Author: Yusuke Tsutsumi <yus...@tsutsumi.io> Branch: fix_test_codecs Changeset: r94701:f9ba6a5a05cc Date: 2018-05-25 07:54 -0700 http://bitbucket.org/pypy/pypy/changeset/f9ba6a5a05cc/
Log: Ported fix for surrogate handling to copied unicodehelper diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -191,20 +191,27 @@ else: # Encode UCS2 Unicode ordinals if ch < 0x10000: - # Special case: check for high surrogate + # Special case: check for surrogates if 0xD800 <= ch <= 0xDFFF: + error_start_pos = pos - 1 if pos != size: ch2 = ord(s[pos]) - # Check for low surrogate and combine the two to - # form a UCS4 value - if ((allow_surrogates or MAXUNICODE < 65536 - or is_narrow_host()) and - ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF): - ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000 - assert ch3 >= 0 + # check if the first character is a high surrogate, + # and the second character is a low surrogate. If so, + # they should be handled collectively. + if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF: + # pos should be incremented regardless. + # by doing so, it ensures the lower surrogate + # is also included in the characters considered + # in the errorhandler. 
pos += 1 - _encodeUCS4(result, ch3) - continue + # if we allow surrogates, we should combine + # the two and form a UCS4 value + if allow_surrogates or MAXUNICODE < 65536 or is_narrow_host(): + ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000 + assert ch3 >= 0 + _encodeUCS4(result, ch3) + continue # note: if the program only ever calls this with # allow_surrogates=True, then we'll never annotate # the following block of code, and errorhandler() @@ -213,7 +220,7 @@ if not allow_surrogates or nonconst.NonConstant(False): ru, rs, pos = errorhandler(errors, 'utf8', 'surrogates not allowed', - s, pos-1, pos) + s, error_start_pos, pos) if rs is not None: # py3k only result.append(rs) @@ -224,7 +231,7 @@ else: errorhandler('strict', 'utf8', 'surrogates not allowed', - s, pos-1, pos) + s, pos - 1 , pos) continue # else: Fall through and handles isolated high surrogates result.append((chr((0xe0 | (ch >> 12))))) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit