[pypy-commit] pypy default: Fix handling of high surrogates in unicode_encode_utf_32

rlamy Mon, 09 Oct 2017 16:19:36 -0700

Author: Ronan Lamy <[email protected]>
Branch: 
Changeset: r92690:0aec7a27d5ba
Date: 2017-10-10 01:18 +0200
http://bitbucket.org/pypy/pypy/changeset/0aec7a27d5ba/


Log:    Fix handling of high surrogates in unicode_encode_utf_32

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -877,32 +877,31 @@
         ch = ord(s[pos])
         pos += 1
         ch2 = 0
-        if 0xD800 <= ch < 0xDC00:
-            if not allow_surrogates:
-                ru, rs, pos = errorhandler(errors, public_encoding_name,
-                                           'surrogates not allowed',
-                                           s, pos-1, pos)
-                if rs is not None:
-                    # py3k only
-                    if len(rs) % 4 != 0:
-                        errorhandler('strict', public_encoding_name,
-                                     'surrogates not allowed',
-                                     s, pos-1, pos)
-                    result.append(rs)
-                    continue
-                for ch in ru:
-                    if ord(ch) < 0xD800:
-                        _STORECHAR32(result, ord(ch), byteorder)
-                    else:
-                        errorhandler('strict', public_encoding_name,
-                                     'surrogates not allowed',
-                                     s, pos-1, pos)
+        if not allow_surrogates and 0xD800 <= ch < 0xE000:
+            ru, rs, pos = errorhandler(errors, public_encoding_name,
+                                        'surrogates not allowed',
+                                        s, pos-1, pos)
+            if rs is not None:
+                # py3k only
+                if len(rs) % 4 != 0:
+                    errorhandler('strict', public_encoding_name,
+                                    'surrogates not allowed',
+                                    s, pos-1, pos)
+                result.append(rs)
                 continue
-            elif MAXUNICODE < 65536 and pos < size:
-                ch2 = ord(s[pos])
-                if 0xDC00 <= ch2 < 0xE000:
-                    ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
-                    pos += 1
+            for ch in ru:
+                if ord(ch) < 0xD800:
+                    _STORECHAR32(result, ord(ch), byteorder)
+                else:
+                    errorhandler('strict', public_encoding_name,
+                                    'surrogates not allowed',
+                                    s, pos-1, pos)
+            continue
+        if 0xD800 <= ch < 0xDC00 and MAXUNICODE < 65536 and pos < size:
+            ch2 = ord(s[pos])
+            if 0xDC00 <= ch2 < 0xE000:
+                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+                pos += 1
         _STORECHAR32(result, ch, byteorder)
 
     return result.build()
diff --git a/rpython/rlib/test/test_runicode.py 
b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -2,6 +2,7 @@
 
 import py
 import sys, random
+import struct
 from rpython.rlib import runicode
 
 from hypothesis import given, settings, strategies
@@ -266,11 +267,12 @@
         assert replace_with(u'rep', None) == '\x00<\x00r\x00e\x00p\x00>'
         assert replace_with(None, '\xca\xfe') == '\x00<\xca\xfe\x00>'
 
-    def test_utf32_surrogates(self):
+    @py.test.mark.parametrize('unich',[u"\ud800", u"\udc80"])
+    def test_utf32_surrogates(self, unich):
         assert runicode.unicode_encode_utf_32_be(
-            u"\ud800", 1, None) == '\x00\x00\xd8\x00'
+            unich, 1, None) == struct.pack('>i', ord(unich))
         py.test.raises(UnicodeEncodeError, runicode.unicode_encode_utf_32_be,
-                       u"\ud800", 1, None, allow_surrogates=False)
+                       unich, 1, None, allow_surrogates=False)
         def replace_with(ru, rs):
             def errorhandler(errors, enc, msg, u, startingpos, endingpos):
                 if errors == 'strict':
@@ -278,7 +280,7 @@
                                              endingpos, msg)
                 return ru, rs, endingpos
             return runicode.unicode_encode_utf_32_be(
-                u"<\ud800>", 3, None,
+                u"<%s>" % unich, 3, None,
                 errorhandler, allow_surrogates=False)
         assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be')
         assert replace_with(None, '\xca\xfe\xca\xfe') == 
'\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>'
@@ -432,7 +434,7 @@
             assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 'ignore',
                         final=True) == (u'aaaabbbb', len(seq) + 8))
             assert (self.decoder(seq, len(seq), 'custom', final=True,
-                        errorhandler=self.custom_replace) == 
+                        errorhandler=self.custom_replace) ==
                         (FOO * len(seq), len(seq)))
             assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 'custom',
                         final=True, errorhandler=self.custom_replace) ==
@@ -628,7 +630,7 @@
                                   msg='invalid continuation byte')
             assert self.decoder(seq, len(seq), 'replace', final=True
                                 ) == (res, len(seq))
-            assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 
+            assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8,
                                  'replace', final=True) ==
                         (u'aaaa' + res + u'bbbb', len(seq) + 8))
             res = res.replace(FFFD, u'')
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy default: Fix handling of high surrogates in unicode_encode_utf_32

Reply via email to