[pypy-commit] pypy unicode-utf8-py3: surrogate and illegal unicode handling

mattip Wed, 11 Jul 2018 06:55:05 -0700

Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r94846:9cf4fc74394c
Date: 2018-07-11 06:49 -0700
http://bitbucket.org/pypy/pypy/changeset/9cf4fc74394c/


Log:    surrogate and illegal unicode handling

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -29,11 +29,17 @@
                                              space.newtext(msg)]))
     return raise_unicode_exception_decode
 
+def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
+    ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
+    return ''.join(ux), endingpos
+
 @specialize.memo()
 def encode_error_handler(space):
     # Fast version of the "strict" errors handler.
     def raise_unicode_exception_encode(errors, encoding, msg, utf8,
                                        startingpos, endingpos):
+        if isinstance(utf8, unicode):
+            utf8 = utf8.encode('utf8')
         u_len = rutf8.get_utf8_length(utf8)
         raise OperationError(space.w_UnicodeEncodeError,
                              space.newtuple([space.newtext(encoding),
@@ -993,7 +999,7 @@
     # Surrogate-preserving utf-8 decoding.  Assuming there is no
     # encoding error, it should always be reversible, and the reverse is
     # encode_utf8sp().
-    return str_decode_utf8(string, "string", True, decode_error_handler(space),
+    return str_decode_utf8(string, "string", True, decode_never_raise,
                            allow_surrogates=True)
 
 
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1187,22 +1187,26 @@
 
 
 def encode_object(space, w_object, encoding, errors):
+    utf8 = space.utf8_w(w_object)
+    idx = rutf8.surrogate_in_utf8(utf8)
+    if idx >= 0:
+        eh = unicodehelper.encode_error_handler(space)
+        eh(None, "utf8", "surrogates not allowed", utf8,
+            idx, idx + 1)
     if errors is None or errors == 'strict':
         if encoding is None or encoding == 'utf-8':
-            utf8 = space.utf8_w(w_object)
-            if rutf8.has_surrogates(utf8):
-                utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
+            #if rutf8.has_surrogates(utf8):
+            #    utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
             return space.newbytes(utf8)
         elif encoding == 'ascii':
-            s = space.utf8_w(w_object)
             try:
-                rutf8.check_ascii(s)
+                rutf8.check_ascii(utf8)
             except rutf8.CheckError as a:
                 eh = unicodehelper.encode_error_handler(space)
-                eh(None, "ascii", "ordinal not in range(128)", s,
+                eh(None, "ascii", "ordinal not in range(128)", utf8,
                     a.pos, a.pos + 1)
                 assert False, "always raises"
-            return space.newbytes(s)
+            return space.newbytes(utf8)
 
     from pypy.module._codecs.interp_codecs import encode_text
     if encoding is None:
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-py3: surrogate and illegal unicode handling

Reply via email to