Author: Ronan Lamy <[email protected]>
Branch:
Changeset: r92690:0aec7a27d5ba
Date: 2017-10-10 01:18 +0200
http://bitbucket.org/pypy/pypy/changeset/0aec7a27d5ba/
Log: Fix handling of low (trailing, U+DC00-U+DFFF) surrogates in unicode_encode_utf_32
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -877,32 +877,31 @@
ch = ord(s[pos])
pos += 1
ch2 = 0
- if 0xD800 <= ch < 0xDC00:
- if not allow_surrogates:
- ru, rs, pos = errorhandler(errors, public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
- if rs is not None:
- # py3k only
- if len(rs) % 4 != 0:
- errorhandler('strict', public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
- result.append(rs)
- continue
- for ch in ru:
- if ord(ch) < 0xD800:
- _STORECHAR32(result, ord(ch), byteorder)
- else:
- errorhandler('strict', public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
+ if not allow_surrogates and 0xD800 <= ch < 0xE000:
+ ru, rs, pos = errorhandler(errors, public_encoding_name,
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ if rs is not None:
+ # py3k only
+ if len(rs) % 4 != 0:
+ errorhandler('strict', public_encoding_name,
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ result.append(rs)
continue
- elif MAXUNICODE < 65536 and pos < size:
- ch2 = ord(s[pos])
- if 0xDC00 <= ch2 < 0xE000:
- ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
- pos += 1
+ for ch in ru:
+ if ord(ch) < 0xD800:
+ _STORECHAR32(result, ord(ch), byteorder)
+ else:
+ errorhandler('strict', public_encoding_name,
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ continue
+ if 0xD800 <= ch < 0xDC00 and MAXUNICODE < 65536 and pos < size:
+ ch2 = ord(s[pos])
+ if 0xDC00 <= ch2 < 0xE000:
+ ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+ pos += 1
_STORECHAR32(result, ch, byteorder)
return result.build()
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -2,6 +2,7 @@
import py
import sys, random
+import struct
from rpython.rlib import runicode
from hypothesis import given, settings, strategies
@@ -266,11 +267,12 @@
assert replace_with(u'rep', None) == '\x00<\x00r\x00e\x00p\x00>'
assert replace_with(None, '\xca\xfe') == '\x00<\xca\xfe\x00>'
- def test_utf32_surrogates(self):
+ @py.test.mark.parametrize('unich',[u"\ud800", u"\udc80"])
+ def test_utf32_surrogates(self, unich):
assert runicode.unicode_encode_utf_32_be(
- u"\ud800", 1, None) == '\x00\x00\xd8\x00'
+ unich, 1, None) == struct.pack('>i', ord(unich))
py.test.raises(UnicodeEncodeError, runicode.unicode_encode_utf_32_be,
- u"\ud800", 1, None, allow_surrogates=False)
+ unich, 1, None, allow_surrogates=False)
def replace_with(ru, rs):
def errorhandler(errors, enc, msg, u, startingpos, endingpos):
if errors == 'strict':
@@ -278,7 +280,7 @@
endingpos, msg)
return ru, rs, endingpos
return runicode.unicode_encode_utf_32_be(
- u"<\ud800>", 3, None,
+ u"<%s>" % unich, 3, None,
errorhandler, allow_surrogates=False)
assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be')
assert replace_with(None, '\xca\xfe\xca\xfe') ==
'\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>'
@@ -432,7 +434,7 @@
assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 'ignore',
final=True) == (u'aaaabbbb', len(seq) + 8))
assert (self.decoder(seq, len(seq), 'custom', final=True,
- errorhandler=self.custom_replace) ==
+ errorhandler=self.custom_replace) ==
(FOO * len(seq), len(seq)))
assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 'custom',
final=True, errorhandler=self.custom_replace) ==
@@ -628,7 +630,7 @@
msg='invalid continuation byte')
assert self.decoder(seq, len(seq), 'replace', final=True
) == (res, len(seq))
- assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8,
+ assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8,
'replace', final=True) ==
(u'aaaa' + res + u'bbbb', len(seq) + 8))
res = res.replace(FFFD, u'')
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit