Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95230:9f83d1c579c4
Date: 2018-10-21 22:49 +0300
http://bitbucket.org/pypy/pypy/changeset/9f83d1c579c4/

Log:    use utf8 in further encoding if error handler "fixes" surrogates

diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1201,11 +1201,11 @@
 
 
 def encode_object(space, w_object, encoding, errors, allow_surrogates=False):
-    utf8 = space.utf8_w(w_object)
     # TODO: refactor unnatrual use of error hanlders here,
     # we should make a single pass over the utf8 str
     from pypy.module._codecs.interp_codecs import encode_text, CodecState
     if not allow_surrogates:
+        utf8 = space.utf8_w(w_object)
         if errors is None:
             errors = 'strict'
         pos = rutf8.surrogate_in_utf8(utf8)
@@ -1215,9 +1215,12 @@
             start = utf8[:pos]
             ru, pos = eh(errors, "utf8", "surrogates not allowed", utf8,
                 pos, pos + 1)
-            end = utf8[pos+1:]
+            upos = rutf8.next_codepoint_pos(utf8,pos)
+            end = utf8[upos+1:]
             utf8 = start + ru + end
+            w_object = space.newtext(utf8)
     if errors is None or errors == 'strict':
+        utf8 = space.utf8_w(w_object)
         if encoding is None or encoding == 'utf-8':
             #if rutf8.has_surrogates(utf8):
             #    utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to