Author: Philip Jenvey <pjen...@underboss.org>
Branch:
Changeset: r73938:db3e26419a95
Date: 2014-10-13 17:28 -0700
http://bitbucket.org/pypy/pypy/changeset/db3e26419a95/

Log: backout d4a4d951ddc2

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -336,7 +336,8 @@
                         ch2 = ord(s[pos])
                         # Check for low surrogate and combine the two to
                         # form a UCS4 value
-                        if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+                        if ((allow_surrogates or MAXUNICODE < 65536) and
+                            ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF):
                             ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
                             pos += 1
                             _encodeUCS4(result, ch3)
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -806,3 +806,20 @@
             u, len(u), True) == r'\ud800\udc00'
         assert runicode.unicode_encode_raw_unicode_escape(
             u, len(u), True) == r'\ud800\udc00'
+
+    def test_encode_surrogate_pair_utf8(self):
+        u = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)
+        if runicode.MAXUNICODE < 65536:
+            # Narrow unicode build, consider utf16 surrogate pairs
+            assert runicode.unicode_encode_utf_8(
+                u, len(u), True, allow_surrogates=True) == '\xf0\x90\x80\x80'
+            assert runicode.unicode_encode_utf_8(
+                u, len(u), True, allow_surrogates=False) == '\xf0\x90\x80\x80'
+        else:
+            # Wide unicode build, merge utf16 surrogate pairs only when allowed
+            assert runicode.unicode_encode_utf_8(
+                u, len(u), True, allow_surrogates=True) == '\xf0\x90\x80\x80'
+            # Surrogates not merged, encoding fails.
+            py.test.raises(
+                UnicodeEncodeError, runicode.unicode_encode_utf_8,
+                u, len(u), True, allow_surrogates=False)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit