Author: Philip Jenvey <pjen...@underboss.org>
Branch: 
Changeset: r73938:db3e26419a95
Date: 2014-10-13 17:28 -0700
http://bitbucket.org/pypy/pypy/changeset/db3e26419a95/

Log:    backout d4a4d951ddc2

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -336,7 +336,8 @@
                         ch2 = ord(s[pos])
                         # Check for low surrogate and combine the two to
                         # form a UCS4 value
-                        if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+                        if ((allow_surrogates or MAXUNICODE < 65536) and
+                            ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF):
                             ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
                             pos += 1
                             _encodeUCS4(result, ch3)
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -806,3 +806,20 @@
                 u, len(u), True) == r'\ud800\udc00'
             assert runicode.unicode_encode_raw_unicode_escape(
                 u, len(u), True) == r'\ud800\udc00'
+
+    def test_encode_surrogate_pair_utf8(self):
+        u = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)
+        if runicode.MAXUNICODE < 65536:
+            # Narrow unicode build, consider utf16 surrogate pairs
+            assert runicode.unicode_encode_utf_8(
+                u, len(u), True, allow_surrogates=True) == '\xf0\x90\x80\x80'
+            assert runicode.unicode_encode_utf_8(
+                u, len(u), True, allow_surrogates=False) == '\xf0\x90\x80\x80'
+        else:
+            # Wide unicode build, merge utf16 surrogate pairs only when allowed
+            assert runicode.unicode_encode_utf_8(
+                u, len(u), True, allow_surrogates=True) == '\xf0\x90\x80\x80'
+            # Surrogates not merged, encoding fails.
+            py.test.raises(
+                UnicodeEncodeError, runicode.unicode_encode_utf_8,
+                u, len(u), True, allow_surrogates=False)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to