Author: Matti Picus <matti.pi...@gmail.com>
Branch: unicode-utf8-py3
Changeset: r94994:215aee0a5be7
Date: 2018-08-11 23:36 -0700
http://bitbucket.org/pypy/pypy/changeset/215aee0a5be7/

Log:    add missing decode_surrogateescape, implement more of
        encode_error_handler

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -36,6 +36,26 @@
     ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
     return ''.join(ux), endingpos
 
+def decode_surrogateescape(errors, encoding, msg, obj, start, end):
+    consumed = 0
+    replace = u''
+    while consumed < 4 and consumed < end - start:
+        c = ord(obj[start+consumed])
+        if c < 128:
+            # Refuse to escape ASCII bytes.
+            break
+        replace += unichr(0xdc00 + c)
+        consumed += 1
+    if not consumed:
+        # codec complained about ASCII byte.
+        raise OperationError(space.w_UnicodeDecodeError,
+                         space.newtuple([space.newtext(encoding),
+                                         space.newbytes(obj),
+                                         space.newint(start),
+                                         space.newint(end),
+                                         space.newtext(msg)]))
+    return replace.encode('utf8'), start + consumed
+
 @specialize.memo()
 def encode_error_handler(space):
     # Fast version of the "strict" errors handler.
@@ -227,12 +247,16 @@
     return res.build(), len(s), len(s)
 
 def utf8_encode_utf_8(s, errors, errorhandler, allow_surrogates=False):
-    # XXX completly implement this
     try:
         lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates)
     except rutf8.CheckError as e:
-        s, lgt = errorhandler(errors, 'encoding',
+        # XXX change this to non-recursive
+        start = s[:e.pos]
+        ru, lgt = errorhandler(errors, 'utf8',
                     'surrogates not allowed', s, e.pos, e.pos + 1)
+        end = utf8_encode_utf_8(s[e.pos+3:], errors, errorhandler,
+                                allow_surrogates=allow_surrogates)
+        s = start + ru + end
     return s
 
 def utf8_encode_latin_1(s, errors, errorhandler):
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to