Author: Amaury Forgeot d'Arc <[email protected]>
Branch: py3.5
Changeset: r91155:1ca1708d0606
Date: 2017-04-19 15:28 +0200
http://bitbucket.org/pypy/pypy/changeset/1ca1708d0606/

Log:    Disable lone surrogates in utf-16 and utf-32 encoders

diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -629,6 +629,21 @@
     wrap_encoder.func_name = rname
     globals()[name] = wrap_encoder
 
+def make_utf_encoder_wrapper(name):
+    rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
+    assert hasattr(runicode, rname)
+    @unwrap_spec(uni=unicode, errors='text_or_none')
+    def wrap_encoder(space, uni, errors="strict"):
+        if errors is None:
+            errors = 'strict'
+        state = space.fromcache(CodecState)
+        func = getattr(runicode, rname)
+        result = func(uni, len(uni), errors, state.encode_error_handler,
+                      allow_surrogates=False)
+        return space.newtuple([space.newbytes(result), space.newint(len(uni))])
+    wrap_encoder.func_name = rname
+    globals()[name] = wrap_encoder
+
 def make_decoder_wrapper(name):
     rname = "str_decode_%s" % (name.replace("_decode", ""), )
     assert hasattr(runicode, rname)
@@ -650,16 +665,20 @@
          "ascii_encode",
          "latin_1_encode",
          "utf_7_encode",
+         "unicode_escape_encode",
+         "raw_unicode_escape_encode",
+        ]:
+    make_encoder_wrapper(encoder)
+
+for encoder in [
          "utf_16_encode",
          "utf_16_be_encode",
          "utf_16_le_encode",
          "utf_32_encode",
          "utf_32_be_encode",
          "utf_32_le_encode",
-         "unicode_escape_encode",
-         "raw_unicode_escape_encode",
         ]:
-    make_encoder_wrapper(encoder)
+    make_utf_encoder_wrapper(encoder)
 
 for decoder in [
          "ascii_decode",
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -758,6 +758,11 @@
         s = "\u5678".encode("latin-1", "test.bad_handler")
         assert s == b'\xe9'
 
+    def test_lone_surrogates(self):
+        for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
+                         'utf-32', 'utf-32-le', 'utf-32-be'):
+            raises(UnicodeEncodeError, u'\ud800'.encode, encoding)
+
     def test_charmap_encode(self):
         assert 'xxx'.encode('charmap') == b'xxx'
 
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to