Author: Amaury Forgeot d'Arc <[email protected]>
Branch:
Changeset: r71358:d492bd661190
Date: 2014-04-11 23:10 +0200
http://bitbucket.org/pypy/pypy/changeset/d492bd661190/
Log: On Unicode wide builds (=all except win32), don't merge utf16
surrogate pairs on encoding. This only affects python3 which sets
allow_surrogates=False. (grafted from
5494a374d576b41509aa34faef64465f38dbd117)
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -331,7 +331,8 @@
ch2 = ord(s[pos])
# Check for low surrogate and combine the two to
# form a UCS4 value
- if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+ if ((allow_surrogates or MAXUNICODE < 65536) and
+ ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF):
ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
pos += 1
_encodeUCS4(result, ch3)
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -803,3 +803,20 @@
u, len(u), True) == r'\ud800\udc00'
assert runicode.unicode_encode_raw_unicode_escape(
u, len(u), True) == r'\ud800\udc00'
+
+ def test_encode_surrogate_pair_utf8(self):
+ u = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)
+ if runicode.MAXUNICODE < 65536:
+ # Narrow unicode build, consider utf16 surrogate pairs
+ assert runicode.unicode_encode_utf_8(
+ u, len(u), True, allow_surrogates=True) == '\xf0\x90\x80\x80'
+ assert runicode.unicode_encode_utf_8(
+ u, len(u), True, allow_surrogates=False) == '\xf0\x90\x80\x80'
+ else:
+ # Wide unicode build, merge utf16 surrogate pairs only when allowed
+ assert runicode.unicode_encode_utf_8(
+ u, len(u), True, allow_surrogates=True) == '\xf0\x90\x80\x80'
+ # Surrogates not merged, encoding fails.
+ py.test.raises(
+ UnicodeEncodeError, runicode.unicode_encode_utf_8,
+ u, len(u), True, allow_surrogates=False)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit