[pypy-commit] pypy unicode-utf8-py3: distinguish between errorhandler returning unicode and bytes

mattip Sun, 18 Nov 2018 19:37:50 -0800

Author: Matti Picus <matti.pi...@gmail.com>
Branch: unicode-utf8-py3
Changeset: r95339:9dc3de0b34d5
Date: 2018-11-18 18:36 -0800
http://bitbucket.org/pypy/pypy/changeset/9dc3de0b34d5/


Log:    distinguish between errorhandler returning unicode and bytes

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -27,7 +27,7 @@
 def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
     assert startingpos >= 0
     ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
-    return ''.join(ux), endingpos
+    return ''.join(ux), endingpos, 'b'
 
 @specialize.memo()
 def encode_error_handler(space):
@@ -199,7 +199,7 @@
     while i < len(s):
         ch = s[i]
         if ord(ch) > 0x7F:
-            r, i = errorhandler(errors, 'ascii', 'ordinal not in range(128)',
+            r, i, rettype = errorhandler(errors, 'ascii', 'ordinal not in 
range(128)',
                 s, i, i + 1)
             res.append(r)
         else:
@@ -242,7 +242,7 @@
         assert pos >= 0
         start = s[:pos]
         upos = rutf8.codepoints_in_utf8(s, end=pos)
-        ru, lgt = errorhandler(errors, 'utf8',
+        ru, lgt, rettype = errorhandler(errors, 'utf8',
                     'surrogates not allowed', s, upos, upos + 1)
         end = utf8_encode_utf_8(s[pos+3:], errors, errorhandler,
                                 allow_surrogates=allow_surrogates)
@@ -275,13 +275,20 @@
                 pos = rutf8.next_codepoint_pos(s, pos)
                 index += 1
             msg = "ordinal not in range(256)"
-            res_8, newindex = errorhandler(
+            res, newindex, rettype = errorhandler(
                 errors, 'latin1', msg, s, startindex, index)
-            for ch in res_8:
-                if ord(ch) > 0xFF:
-                    errorhandler("strict", 'latin1', msg, s, startindex, index)
-                    raise RuntimeError('error handler should not have 
returned')
-                result.append(ch)
+            if rettype == 'u':
+                for cp in rutf8.Utf8StringIterator(res):
+                    if cp > 0xFF:
+                        errorhandler("strict", 'latin1', msg, s, startindex, 
index)
+                        raise RuntimeError('error handler should not have 
returned')
+                    result.append(chr(cp))
+            else:
+                for ch in res:
+                    if ord(ch) > 0xFF:
+                        errorhandler("strict", 'latin1', msg, s, startindex, 
index)
+                        raise RuntimeError('error handler should not have 
returned')
+                    result.append(ch)
             if index != newindex:  # Should be uncommon
                 index = newindex
                 pos = rutf8._pos_at_index(s, newindex)
@@ -309,9 +316,20 @@
                 pos = rutf8.next_codepoint_pos(s, pos)
                 index += 1
             msg = "ordinal not in range(128)"
-            res_8, newindex = errorhandler(
+            res, newindex, rettype = errorhandler(
                 errors, 'ascii', msg, s, startindex, index)
-            result.append(res_8)
+            if rettype == 'u':
+                for cp in rutf8.Utf8StringIterator(res):
+                    if cp > 0x80:
+                        errorhandler("strict", 'ascii', msg, s, startindex, 
index)
+                        raise RuntimeError('error handler should not have 
returned')
+                    result.append(chr(cp))
+            else:
+                for ch in res:
+                    if ord(ch) > 0x80:
+                        errorhandler("strict", 'ascii', msg, s, startindex, 
index)
+                        raise RuntimeError('error handler should not have 
returned')
+                    result.append(ch)
             pos = rutf8._pos_at_index(s, newindex)
     return result.build()
 
@@ -346,7 +364,7 @@
             continue
 
         if ordch1 <= 0xC1:
-            r, pos = errorhandler(errors, "utf8", "invalid start byte",
+            r, pos, rettype = errorhandler(errors, "utf8", "invalid start 
byte",
                     s, pos, pos + 1)
             res.append(r)
             continue
@@ -358,14 +376,14 @@
                 if not final:
                     pos -= 1
                     break
-                r, pos = errorhandler(errors, "utf8", "unexpected end of data",
+                r, pos, rettype = errorhandler(errors, "utf8", "unexpected end 
of data",
                     s, pos - 1, pos)
                 res.append(r)
                 continue
             ordch2 = ord(s[pos])
 
             if rutf8._invalid_byte_2_of_2(ordch2):
-                r, pos = errorhandler(errors, "utf8", "invalid continuation 
byte",
+                r, pos, rettype = errorhandler(errors, "utf8", "invalid 
continuation byte",
                     s, pos - 1, pos)
                 res.append(r)
                 continue
@@ -380,7 +398,7 @@
                 if not final:
                     pos -= 1
                     break
-                r, pos = errorhandler(errors, "utf8", "unexpected end of data",
+                r, pos, rettype = errorhandler(errors, "utf8", "unexpected end 
of data",
                     s, pos - 1, pos + 1)
                 res.append(r)
                 continue
@@ -388,12 +406,12 @@
             ordch3 = ord(s[pos + 1])
 
             if rutf8._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
-                r, pos = errorhandler(errors, "utf8", "invalid continuation 
byte",
+                r, pos, rettype = errorhandler(errors, "utf8", "invalid 
continuation byte",
                     s, pos - 1, pos)
                 res.append(r)
                 continue
             elif rutf8._invalid_byte_3_of_3(ordch3):
-                r, pos = errorhandler(errors, "utf8", "invalid continuation 
byte",
+                r, pos, rettype = errorhandler(errors, "utf8", "invalid 
continuation byte",
                     s, pos - 1, pos + 1)
                 res.append(r)
                 continue
@@ -410,26 +428,25 @@
                 if not final:
                     pos -= 1
                     break
-                r, pos = errorhandler(errors, "utf8", "unexpected end of data",
+                r, pos, rettype = errorhandler(errors, "utf8", "unexpected end 
of data",
                     s, pos - 1, pos)
-                res.append(r)
                 continue
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
             ordch4 = ord(s[pos + 2])
 
             if rutf8._invalid_byte_2_of_4(ordch1, ordch2):
-                r, pos = errorhandler(errors, "utf8", "invalid continuation 
byte",
+                r, pos, rettype = errorhandler(errors, "utf8", "invalid 
continuation byte",
                     s, pos - 1, pos)
                 res.append(r)
                 continue
             elif rutf8._invalid_byte_3_of_4(ordch3):
-                r, pos = errorhandler(errors, "utf8", "invalid continuation 
byte",
+                r, pos, rettype = errorhandler(errors, "utf8", "invalid 
continuation byte",
                     s, pos - 1, pos + 1)
                 res.append(r)
                 continue
             elif rutf8._invalid_byte_4_of_4(ordch4):
-                r, pos = errorhandler(errors, "utf8", "invalid continuation 
byte",
+                r, pos, rettype = errorhandler(errors, "utf8", "invalid 
continuation byte",
                     s, pos - 1, pos + 2)
                 res.append(r)
                 continue
@@ -442,7 +459,7 @@
             res.append(chr(ordch4))
             continue
 
-        r, pos = errorhandler(errors, "utf8", "invalid start byte",
+        r, pos, rettype = errorhandler(errors, "utf8", "invalid start byte",
                 s, pos - 1, pos)
         res.append(r)
 
@@ -458,9 +475,9 @@
         endinpos = pos
         while endinpos < len(s) and s[endinpos] in hexdigits:
             endinpos += 1
-        res, pos = errorhandler(
+        r, pos, rettype = errorhandler(
             errors, encoding, message, s, pos - 2, endinpos)
-        builder.append(res)
+        builder.append(r)
     else:
         try:
             chr = int(s[pos:pos + digits], 16)
@@ -468,9 +485,9 @@
             endinpos = pos
             while s[endinpos] in hexdigits:
                 endinpos += 1
-            res, pos = errorhandler(
+            r, pos, rettype = errorhandler(
                 errors, encoding, message, s, pos - 2, endinpos)
-            builder.append(res)
+            builder.append(r)
         else:
             # when we get here, chr is a 32-bit unicode character
             try:
@@ -478,9 +495,9 @@
                 pos += digits
             except ValueError:
                 message = "illegal Unicode character"
-                res, pos = errorhandler(
+                r, pos, rettype = errorhandler(
                     errors, encoding, message, s, pos - 2, pos + digits)
-                builder.append(res)
+                builder.append(r)
     return pos
 
 def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler):
@@ -506,9 +523,9 @@
         pos += 1
         if pos >= size:
             message = "\\ at end of string"
-            res, pos = errorhandler(errors, "unicodeescape",
+            r, pos, rettype = errorhandler(errors, "unicodeescape",
                                     message, s, pos - 1, size)
-            builder.append(res)
+            builder.append(r)
             continue
 
         ch = s[pos]
@@ -586,21 +603,21 @@
                     name = s[pos + 1:look]
                     code = ud_handler.call(name)
                     if code < 0:
-                        res, pos = errorhandler(
+                        r, pos, rettype = errorhandler(
                             errors, "unicodeescape", message,
                             s, pos - 1, look + 1)
-                        builder.append(res)
+                        builder.append(r)
                         continue
                     pos = look + 1
                     builder.append_code(code)
                 else:
-                    res, pos = errorhandler(errors, "unicodeescape",
+                    r, pos, rettype = errorhandler(errors, "unicodeescape",
                                             message, s, pos - 1, look + 1)
-                    builder.append(res)
+                    builder.append(r)
             else:
-                res, pos = errorhandler(errors, "unicodeescape",
+                r, pos, rettype = errorhandler(errors, "unicodeescape",
                                         message, s, pos - 1, look + 1)
-                builder.append(res)
+                builder.append(r)
         else:
             builder.append_char('\\')
             builder.append_code(ord(ch))
@@ -867,22 +884,22 @@
                         # We've seen at least one base-64 character
                         pos += 1
                         msg = "partial character in shift sequence"
-                        res, pos = errorhandler(errors, 'utf7',
+                        r, pos, rettype = errorhandler(errors, 'utf7',
                                                 msg, s, pos-1, pos)
-                        reslen = rutf8.check_utf8(res, True)
+                        reslen = rutf8.check_utf8(r, True)
                         outsize += reslen
-                        result.append(res)
+                        result.append(r)
                         continue
                     else:
                         # Some bits remain; they should be zero
                         if base64buffer != 0:
                             pos += 1
                             msg = "non-zero padding bits in shift sequence"
-                            res, pos = errorhandler(errors, 'utf7',
+                            r, pos, rettype = errorhandler(errors, 'utf7',
                                                     msg, s, pos-1, pos)
-                            reslen = rutf8.check_utf8(res, True)
+                            reslen = rutf8.check_utf8(r, True)
                             outsize += reslen
-                            result.append(res)
+                            result.append(r)
                             continue
 
                 if surrogate and _utf7_DECODE_DIRECT(ord(ch)):
@@ -917,10 +934,10 @@
             startinpos = pos
             pos += 1
             msg = "unexpected special character"
-            res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
-            reslen = rutf8.check_utf8(res, True)
+            r, pos, rettype = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
+            reslen = rutf8.check_utf8(r, True)
             outsize += reslen
-            result.append(res)
+            result.append(r)
 
     # end of string
     final_length = result.getlength()
@@ -931,10 +948,10 @@
             base64bits >= 6 or
             (base64bits > 0 and base64buffer != 0)):
             msg = "unterminated shift sequence"
-            res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos, 
pos)
-            reslen = rutf8.check_utf8(res, True)
+            r, pos, rettype = errorhandler(errors, 'utf7', msg, s, 
shiftOutStartPos, pos)
+            reslen = rutf8.check_utf8(r, True)
             outsize += reslen
-            result.append(res)
+            result.append(r)
             final_length = result.getlength()
     elif inShift:
         pos = startinpos
@@ -1101,7 +1118,7 @@
         if len(s) - pos < 2:
             if not final:
                 break
-            r, pos = errorhandler(errors, public_encoding_name,
+            r, pos, rettype = errorhandler(errors, public_encoding_name,
                                   "truncated data",
                                   s, pos, len(s))
             result.append(r)
@@ -1118,7 +1135,7 @@
             if not final:
                 break
             errmsg = "unexpected end of data"
-            r, pos = errorhandler(errors, public_encoding_name,
+            r, pos, rettype = errorhandler(errors, public_encoding_name,
                                   errmsg, s, pos, len(s))
             result.append(r)
             if len(s) - pos < 2:
@@ -1131,12 +1148,12 @@
                 rutf8.unichr_as_utf8_append(result, ch)
                 continue
             else:
-                r, pos = errorhandler(errors, public_encoding_name,
+                r, pos, rettype = errorhandler(errors, public_encoding_name,
                                       "illegal UTF-16 surrogate",
                                       s, pos - 4, pos - 2)
                 result.append(r)
         else:
-            r, pos = errorhandler(errors, public_encoding_name,
+            r, pos, rettype = errorhandler(errors, public_encoding_name,
                                   "illegal encoding",
                                   s, pos - 2, pos)
             result.append(r)
@@ -1176,44 +1193,62 @@
     index = 0
     while pos < size:
         try:
-            ch = rutf8.codepoint_at_pos(s, pos)
+            cp = rutf8.codepoint_at_pos(s, pos)
         except IndexError:
             # malformed codepoint, blindly use ch
-            ch = ord(s[pos])
             pos += 1
             if errorhandler:
-                res_8, newindex = errorhandler(
+                r, newindex, rettype = errorhandler(
                     errors, public_encoding_name, 'malformed unicode',
                     s, pos - 1, pos)
-                for cp in rutf8.Utf8StringIterator(res_8):
-                    if cp < 0xD800:
+                if rettype == 'u':
+                    for cp in rutf8.Utf8StringIterator(r):
+                        if cp < 0xD800:
+                            _STORECHAR(result, cp, byteorder)
+                        else:
+                            errorhandler('strict', public_encoding_name,
+                                         'malformed unicode',
+                                     s, pos-1, pos)
+                else:
+                    for ch in r:
+                        cp = ord(ch)
+                        if cp < 0xD800:
+                            _STORECHAR(result, cp, byteorder)
+                        else:
+                            errorhandler('strict', public_encoding_name,
+                                         'malformed unicode',
+                                     s, pos-1, pos)
+            else:
+                cp = ord(s[pos])
+                _STORECHAR(result, cp, byteorder)
+            continue
+        if cp < 0xD800:
+            _STORECHAR(result, cp, byteorder)
+        elif cp >= 0x10000:
+            _STORECHAR(result, 0xD800 | ((cp-0x10000) >> 10), byteorder)
+            _STORECHAR(result, 0xDC00 | ((cp-0x10000) & 0x3FF), byteorder)
+        elif cp >= 0xE000 or allow_surrogates:
+            _STORECHAR(result, cp, byteorder)
+        else:
+            r, newindex, rettype = errorhandler(
+                errors, public_encoding_name, 'surrogates not allowed',
+                s, pos, pos+1)
+            if rettype == 'u':
+                for cp in rutf8.Utf8StringIterator(r):
+                    if cp < 0xD800 or allow_surrogates:
                         _STORECHAR(result, cp, byteorder)
                     else:
                         errorhandler('strict', public_encoding_name,
-                                     'malformed unicode',
-                                 s, pos-1, pos)
+                                     'surrogates not allowed',
+                                     s, pos, pos+1)
             else:
-                _STORECHAR(result, ch, byteorder)
-                continue
-        if ch < 0xD800:
-            _STORECHAR(result, ch, byteorder)
-        elif ch >= 0x10000:
-            _STORECHAR(result, 0xD800 | ((ch-0x10000) >> 10), byteorder)
-            _STORECHAR(result, 0xDC00 | ((ch-0x10000) & 0x3FF), byteorder)
-        elif ch >= 0xE000 or allow_surrogates:
-            _STORECHAR(result, ch, byteorder)
-        else:
-            res_8, newindex = errorhandler(
-                errors, public_encoding_name, 'surrogates not allowed',
-                s, pos, pos+1)
-            #for cp in rutf8.Utf8StringIterator(res_8):
-            for ch in res_8:
-                cp = ord(ch)
-                if cp < 0xD800 or allow_surrogates:
-                    _STORECHAR(result, cp, byteorder)
-                else:
-                    errorhandler('strict', public_encoding_name,
-                                 'surrogates not allowed',
+                for ch in r:
+                    cp = ord(ch)
+                    if cp < 0xD800 or allow_surrogates:
+                        _STORECHAR(result, cp, byteorder)
+                    else:
+                        errorhandler('strict', public_encoding_name,
+                                     'surrogates not allowed',
                                  s, pos, pos+1)
             if index != newindex:  # Should be uncommon
                 index = newindex
@@ -1329,7 +1364,7 @@
         if len(s) - pos < 4:
             if not final:
                 break
-            r, pos = errorhandler(errors, public_encoding_name,
+            r, pos, rettype = errorhandler(errors, public_encoding_name,
                                   "truncated data",
                                   s, pos, len(s))
             result.append(r)
@@ -1339,14 +1374,14 @@
         ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 
16) |
               (ord(s[pos + iorder[1]]) << 8)  | ord(s[pos + iorder[0]]))
         if not allow_surrogates and 0xD800 <= ch <= 0xDFFF:
-            r, pos = errorhandler(errors, public_encoding_name,
+            r, pos, rettype = errorhandler(errors, public_encoding_name,
                                   "code point in surrogate code point "
                                   "range(0xd800, 0xe000)",
                                   s, pos, pos + 4)
             result.append(r)
             continue
         elif ch >= 0x110000:
-            r, pos = errorhandler(errors, public_encoding_name,
+            r, pos, rettype = errorhandler(errors, public_encoding_name,
                                   "codepoint not in range(0x110000)",
                                   s, pos, len(s))
             result.append(r)
@@ -1404,11 +1439,20 @@
             ch = ord(s[pos])
             pos += 1
             if errorhandler:
-                res_8, newindex = errorhandler(
+                r, newindex, rettype = errorhandler(
                     errors, public_encoding_name, 'malformed unicode',
                     s, index, index+1)
-                if res_8:
-                    for cp in rutf8.Utf8StringIterator(res_8):
+                if rettype == 'u' and r:
+                    for cp in rutf8.Utf8StringIterator(r):
+                        if cp < 0xD800:
+                            _STORECHAR32(result, cp, byteorder)
+                        else:
+                            errorhandler('strict', public_encoding_name,
+                                     'malformed unicode',
+                                 s, index, index+1)
+                elif r:
+                    for ch in r:
+                        cp = ord(ch)
                         if cp < 0xD800:
                             _STORECHAR32(result, cp, byteorder)
                         else:
@@ -1422,16 +1466,26 @@
             index += 1
             continue
         if not allow_surrogates and 0xD800 <= ch < 0xE000:
-            res_8, newindex = errorhandler(
+            r, newindex, rettype = errorhandler(
                 errors, public_encoding_name, 'surrogates not allowed',
                 s, index, index+1)
-            for ch in rutf8.Utf8StringIterator(res_8):
-                if ch < 0xD800:
-                    _STORECHAR32(result, ch, byteorder)
-                else:
-                    errorhandler(
-                        'strict', public_encoding_name, 'surrogates not 
allowed',
-                        s, index, index+1)
+            if rettype == 'u':
+                for ch in rutf8.Utf8StringIterator(r):
+                    if ch < 0xD800:
+                        _STORECHAR32(result, ch, byteorder)
+                    else:
+                        errorhandler(
+                            'strict', public_encoding_name, 'surrogates not 
allowed',
+                            s, index, index+1)
+            else:
+                for ch in r:
+                    cp = ord(ch)
+                    if cp < 0xD800:
+                        _STORECHAR32(result, cp, byteorder)
+                    else:
+                        errorhandler(
+                            'strict', public_encoding_name, 'surrogates not 
allowed',
+                            s, index, index+1)
             if index != newindex:  # Should be uncommon
                 index = newindex
                 pos = rutf8._pos_at_index(s, newindex)
@@ -1471,11 +1525,20 @@
             ch = ord(s[pos])
             pos += 1
             if errorhandler:
-                res_8, newindex = errorhandler(
+                r, newindex, rettype = errorhandler(
                     errors, public_encoding_name, 'malformed unicode',
                     s, pos - 1, pos)
-                if res_8:
-                    for cp in rutf8.Utf8StringIterator(res_8):
+                if rettype == 'u' and r:
+                    for cp in rutf8.Utf8StringIterator(r):
+                        if cp < 0xD800:
+                            _STORECHAR32(result, cp, byteorder)
+                        else:
+                            errorhandler('strict', public_encoding_name,
+                                     'malformed unicode',
+                                 s, pos-1, pos)
+                elif r:
+                    for ch in r:
+                        cp = ord(ch)
                         if cp < 0xD800:
                             _STORECHAR32(result, cp, byteorder)
                         else:
@@ -1489,16 +1552,26 @@
             index += 1
             continue
         if not allow_surrogates and 0xD800 <= ch < 0xE000:
-            res_8, newindex = errorhandler(
+            r, newindex, rettype = errorhandler(
                 errors, public_encoding_name, 'surrogates not allowed',
                 s, pos - 1, pos)
-            for ch in rutf8.Utf8StringIterator(res_8):
-                if ch < 0xD800:
-                    _STORECHAR32(result, ch, byteorder)
-                else:
-                    errorhandler(
-                        'strict', public_encoding_name, 'surrogates not 
allowed',
-                        s, pos - 1, pos)
+            if rettype == 'u':
+                for ch in rutf8.Utf8StringIterator(res_8):
+                    if ch < 0xD800:
+                        _STORECHAR32(result, ch, byteorder)
+                    else:
+                        errorhandler(
+                            'strict', public_encoding_name, 'surrogates not 
allowed',
+                            s, pos - 1, pos)
+            else:
+                for ch in res_8:
+                    cp = ord(ch)
+                    if cp < 0xD800:
+                        _STORECHAR32(result, cp, byteorder)
+                    else:
+                        errorhandler(
+                            'strict', public_encoding_name, 'surrogates not 
allowed',
+                            s, pos - 1, pos)
             if index != newindex:  # Should be uncommon
                 index = newindex
                 pos = rutf8._pos_at_index(s, newindex)
@@ -1551,10 +1624,10 @@
     pos = 0
     while pos < size:
         if pos > size - unicode_bytes:
-            res, pos = errorhandler(errors, "unicode_internal",
+            r, pos, rettype = errorhandler(errors, "unicode_internal",
                                     "truncated input",
                                     s, pos, size)
-            result.append(res)
+            result.append(r)
             continue
         t = r_uint(0)
         h = 0
@@ -1562,10 +1635,10 @@
             t += r_uint(ord(s[pos + j])) << (h*8)
             h += 1
         if t > runicode.MAXUNICODE:
-            res, pos = errorhandler(errors, "unicode_internal",
+            r, pos, rettype = errorhandler(errors, "unicode_internal",
                                     "unichr(%d) not in range" % (t,),
                                     s, pos, pos + unicode_bytes)
-            result.append(res)
+            result.append(r)
             continue
         rutf8.unichr_as_utf8_append(result, intmask(t), allow_surrogates=True)
         pos += unicode_bytes
@@ -1627,7 +1700,7 @@
 
         c = mapping.get(ord(ch), ERROR_CHAR)
         if c == ERROR_CHAR:
-            r, pos = errorhandler(errors, "charmap",
+            r, pos, rettype = errorhandler(errors, "charmap",
                                   "character maps to <undefined>",
                                   s,  pos, pos + 1)
             result.append(r)
@@ -1659,10 +1732,10 @@
                    mapping.get(rutf8.codepoint_at_pos(s, pos), '') == ''):
                 pos = rutf8.next_codepoint_pos(s, pos)
                 index += 1
-            res_8, newindex = errorhandler(errors, "charmap",
+            r, newindex, rettype = errorhandler(errors, "charmap",
                                    "character maps to <undefined>",
                                    s, startindex, index)
-            for cp2 in rutf8.Utf8StringIterator(res_8):
+            for cp2 in rutf8.Utf8StringIterator(r):
                 ch2 = mapping.get(cp2, '')
                 if not ch2:
                     errorhandler(
@@ -1727,7 +1800,7 @@
             i += 1
         end_index = i
         msg = "invalid decimal Unicode string"
-        r, pos = errorhandler(
+        r, pos, retype = errorhandler(
             errors, 'decimal', msg, s, start_index, end_index)
         for ch in rutf8.Utf8StringIterator(r):
             if unicodedb.isspace(ch):
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -72,8 +72,11 @@
                 raise OperationError(space.w_TypeError, space.newtext(msg))
 
             w_replace, w_newpos = space.fixedview(w_res, 2)
-            if not (space.isinstance_w(w_replace, space.w_unicode) or
-                (not decode and space.isinstance_w(w_replace, space.w_bytes))):
+            if space.isinstance_w(w_replace, space.w_unicode):
+                rettype = 'u'
+            elif encode and space.isinstance_w(w_replace, space.w_bytes):
+                rettype = 'b'
+            else:
                 if decode:
                     msg = ("decoding error handler must return "
                            "(str, int) tuple")
@@ -94,7 +97,7 @@
                 raise oefmt(space.w_IndexError,
                             "position %d from error handler out of bounds",
                             newpos)
-            return space.utf8_w(w_replace), newpos
+            return space.utf8_w(w_replace), newpos, rettype
         return call_errorhandler
 
     def make_decode_errorhandler(self, space):
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -826,11 +826,14 @@
         repl = "\u00E9"
         s = "\u5678".encode("latin-1", "test.bad_handler")
         assert s == b'\xe9'
+        raises(UnicodeEncodeError, "\u5678".encode, "ascii",
+               "test.bad_handler")
 
     def test_lone_surrogates(self):
         encodings = ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
             'utf-32', 'utf-32-le', 'utf-32-be')
         for encoding in encodings:
+            print('encoding', encoding)
             raises(UnicodeEncodeError, u'\ud800'.encode, encoding)
             assert (u'[\udc80]'.encode(encoding, "backslashreplace") ==
                 '[\\udc80]'.encode(encoding))
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1200,14 +1200,16 @@
     if errors is None or errors == 'strict':
         utf8 = space.utf8_w(w_object)
         if encoding is None or encoding == 'utf-8':
-            #if rutf8.has_surrogates(utf8):
-            #    utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
+            if rutf8.has_surrogates(utf8):
+                # slow path
+                return encode_text(space, w_object, encoding, errors)
             return space.newbytes(utf8)
         elif encoding == 'ascii':
             try:
                 rutf8.check_ascii(utf8)
             except rutf8.CheckError as a:
-                eh = unicodehelper.encode_error_handler(space)
+                state = space.fromcache(CodecState)
+                eh = state.encode_error_handler
                 eh(None, "ascii", "ordinal not in range(128)", utf8,
                     a.pos, a.pos + 1)
                 assert False, "always raises"
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-py3: distinguish between errorhandler returning unicode and bytes

Reply via email to