Author: Matti Picus <matti.pi...@gmail.com> Branch: unicode-utf8-py3 Changeset: r95073:b040f44dc71b Date: 2018-09-02 10:18 +0200 http://bitbucket.org/pypy/pypy/changeset/b040f44dc71b/
Log: use encode_utf8, str_decode_utf8, and maybe handle surrogates in the latter diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -50,6 +50,23 @@ return u'', None, 0 return raise_unicode_exception_encode +@specialize.memo() +def encode_unicode_error_handler(space): + # Fast version of the "strict" errors handler. + def raise_unicode_exception_encode(errors, encoding, msg, uni, + startingpos, endingpos): + assert isinstance(uni, unicode) + u_len = len(uni) + utf8 = runicode.unicode_encode_utf8sp(uni, u_len) + raise OperationError(space.w_UnicodeEncodeError, + space.newtuple([space.newtext(encoding), + space.newtext(utf8, u_len), + space.newint(startingpos), + space.newint(endingpos), + space.newtext(msg)])) + return u'', None, 0 + return raise_unicode_exception_encode + def default_error_encode( errors, encoding, msg, u, startingpos, endingpos): """A default handler, for tests""" @@ -322,7 +339,6 @@ valid so we're trying to either raise or pack stuff with error handler. The key difference is that this is call_may_force """ - # XXX need to handle allow_surrogates slen = len(s) res = StringBuilder(slen) pos = 0 @@ -377,7 +393,7 @@ ordch2 = ord(s[pos]) ordch3 = ord(s[pos + 1]) - if rutf8._invalid_byte_2_of_3(ordch1, ordch2, True): + if rutf8._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates): r, pos = errorhandler(errors, "utf8", "invalid continuation byte", s, pos - 1, pos) res.append(r) @@ -994,7 +1010,7 @@ assert isinstance(uni, unicode) return runicode.unicode_encode_utf_8( uni, len(uni), "strict", - errorhandler=encode_error_handler(space), + errorhandler=encode_unicode_error_handler(space), allow_surrogates=allow_surrogates) def encode_utf8sp(space, uni): diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py --- a/pypy/objspace/std/stringmethods.py +++ b/pypy/objspace/std/stringmethods.py @@ -7,6 +7,7 @@ find, rfind, count, endswith, replace, rsplit, split, startswith) from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.gateway import WrappedDefault, unwrap_spec +from pypy.interpreter.unicodehelper import str_decode_utf8 from pypy.objspace.std.sliceobject import W_SliceObject, unwrap_start_stop @@ -197,6 +198,12 @@ errors = 'strict' if encoding is None: encoding = 'utf8' + if encoding == 'utf8' or encoding == 'utf-8': + from pypy.module._codecs.interp_codecs import CodecState + state = space.fromcache(CodecState) + eh = state.decode_error_handler + s = space.charbuf_w(self) + ret, lgt, pos = str_decode_utf8(s, errors, True, eh) return decode_object(space, self, encoding, errors) @unwrap_spec(tabsize=int) diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -1898,12 +1898,8 @@ raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr) value = _rpy_unicode_to_decimal_w(space, w_unistr.utf8_w(space).decode('utf8')) # XXX this is the only place in the code that this funcion is called. - # It does not translate, since it uses a pypy-level error handler - # to throw the UnicodeEncodeError not the rpython default handler - #return unicodehelper.encode_utf8(space, value, - # allow_surrogates=allow_surrogates) - assert isinstance(value, unicode) - return value.encode('utf8') + return unicodehelper.encode_utf8(space, value, + allow_surrogates=allow_surrogates) def _rpy_unicode_to_decimal_w(space, unistr): # XXX rewrite this to accept a utf8 string and use a StringBuilder _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit