Author: Ronan Lamy <[email protected]>
Branch: py3.5
Changeset: r93418:badb71ed332d
Date: 2017-12-14 02:22 +0000
http://bitbucket.org/pypy/pypy/changeset/badb71ed332d/

Log:    Port b0267eee69d8 to unicodehelper and fix it

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -2,7 +2,7 @@
 import pytest
 import struct
 from pypy.interpreter.unicodehelper import (
-    encode_utf8, decode_utf8, unicode_encode_utf_32_be)
+    encode_utf8, decode_utf8, unicode_encode_utf_32_be, str_decode_utf_32_be)
 from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp
 
 
@@ -90,3 +90,6 @@
     assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be')
     assert (replace_with(None, '\xca\xfe\xca\xfe') ==
             '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>')
+
+    with pytest.raises(UnicodeDecodeError):
+        str_decode_utf_32_be(b"\x00\x00\xdc\x80", 4, None)
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -172,19 +172,22 @@
 def str_decode_utf_32(s, size, errors, final=True,
                            errorhandler=None):
     result, length, byteorder = str_decode_utf_32_helper(
-        s, size, errors, final, errorhandler, "native", 'utf-32-' + BYTEORDER2)
+        s, size, errors, final, errorhandler, "native", 'utf-32-' + BYTEORDER2,
+        allow_surrogates=False)
     return result, length
 
 def str_decode_utf_32_be(s, size, errors, final=True,
                               errorhandler=None):
     result, length, byteorder = str_decode_utf_32_helper(
-        s, size, errors, final, errorhandler, "big", 'utf-32-be')
+        s, size, errors, final, errorhandler, "big", 'utf-32-be',
+        allow_surrogates=False)
     return result, length
 
 def str_decode_utf_32_le(s, size, errors, final=True,
                               errorhandler=None):
     result, length, byteorder = str_decode_utf_32_helper(
-        s, size, errors, final, errorhandler, "little", 'utf-32-le')
+        s, size, errors, final, errorhandler, "little", 'utf-32-le',
+        allow_surrogates=False)
     return result, length
 
 BOM32_DIRECT = intmask(0x0000FEFF)
@@ -193,7 +196,8 @@
 def str_decode_utf_32_helper(s, size, errors, final=True,
                              errorhandler=None,
                              byteorder="native",
-                             public_encoding_name='utf32'):
+                             public_encoding_name='utf32',
+                             allow_surrogates=True):
     if errorhandler is None:
         errorhandler = default_unicode_error_decode
     bo = 0
@@ -256,10 +260,17 @@
             continue
         ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
             (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
-        if ch >= 0x110000:
+        if not allow_surrogates and 0xD800 <= ch <= 0xDFFF:
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "code point in surrogate code point "
+                                  "range(0xd800, 0xe000)",
+                                  s, pos, pos + 4)
+            result.append(r)
+            continue
+        elif ch >= 0x110000:
             r, pos = errorhandler(errors, public_encoding_name,
                                   "codepoint not in range(0x110000)",
-                                  s, pos, len(s))
+                                  s, pos, pos + 4)
             result.append(r)
             continue
 
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to