Author: Matti Picus <matti.pi...@gmail.com>
Branch: unicode-utf8-py3
Changeset: r94846:9cf4fc74394c
Date: 2018-07-11 06:49 -0700
http://bitbucket.org/pypy/pypy/changeset/9cf4fc74394c/
Log: surrogate and illegal unicode handling diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -29,11 +29,17 @@ space.newtext(msg)])) return raise_unicode_exception_decode +def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos): + ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]] + return ''.join(ux), endingpos + @specialize.memo() def encode_error_handler(space): # Fast version of the "strict" errors handler. def raise_unicode_exception_encode(errors, encoding, msg, utf8, startingpos, endingpos): + if isinstance(utf8, unicode): + utf8 = utf8.encode('utf8') u_len = rutf8.get_utf8_length(utf8) raise OperationError(space.w_UnicodeEncodeError, space.newtuple([space.newtext(encoding), @@ -993,7 +999,7 @@ # Surrogate-preserving utf-8 decoding. Assuming there is no # encoding error, it should always be reversible, and the reverse is # encode_utf8sp(). 
- return str_decode_utf8(string, "string", True, decode_error_handler(space), + return str_decode_utf8(string, "string", True, decode_never_raise, allow_surrogates=True) diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -1187,22 +1187,26 @@ def encode_object(space, w_object, encoding, errors): + utf8 = space.utf8_w(w_object) + idx = rutf8.surrogate_in_utf8(utf8) + if idx >= 0: + eh = unicodehelper.encode_error_handler(space) + eh(None, "utf8", "surrogates not allowed", utf8, + idx, idx + 1) if errors is None or errors == 'strict': if encoding is None or encoding == 'utf-8': - utf8 = space.utf8_w(w_object) - if rutf8.has_surrogates(utf8): - utf8 = rutf8.reencode_utf8_with_surrogates(utf8) + #if rutf8.has_surrogates(utf8): + # utf8 = rutf8.reencode_utf8_with_surrogates(utf8) return space.newbytes(utf8) elif encoding == 'ascii': - s = space.utf8_w(w_object) try: - rutf8.check_ascii(s) + rutf8.check_ascii(utf8) except rutf8.CheckError as a: eh = unicodehelper.encode_error_handler(space) - eh(None, "ascii", "ordinal not in range(128)", s, + eh(None, "ascii", "ordinal not in range(128)", utf8, a.pos, a.pos + 1) assert False, "always raises" - return space.newbytes(s) + return space.newbytes(utf8) from pypy.module._codecs.interp_codecs import encode_text if encoding is None: _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit