Author: Matti Picus <mtti.pi...@gmail.com> Branch: unicode-utf8-py3 Changeset: r94753:40650baa7fd6 Date: 2018-06-10 22:20 -0700 http://bitbucket.org/pypy/pypy/changeset/40650baa7fd6/
Log: fix imports. Tests start to run. str_decode_utf8 replaces decode_utf8 but args have changed diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py --- a/pypy/interpreter/baseobjspace.py +++ b/pypy/interpreter/baseobjspace.py @@ -247,9 +247,7 @@ def unicode_w(self, space): self._typed_unwrap_error(space, "string") - - def text_w(self, space): - self._typed_unwrap_error(space, "string") + realunicode_w = unicode_w def utf8_w(self, space): self._typed_unwrap_error(space, "unicode") @@ -1732,7 +1730,6 @@ return rstring.assert_str0(result) realtext_w = text_w # Python 2 compatibility - realunicode_w = unicode_w def fsencode(space, w_obj): from pypy.interpreter.unicodehelper import fsencode diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -4,7 +4,7 @@ import struct import sys from pypy.interpreter.unicodehelper import ( - encode_utf8, decode_utf8, unicode_encode_utf_32_be, str_decode_utf_32_be) + encode_utf8, str_decode_utf8, utf8_encode_utf_32_be, str_decode_utf_32_be) from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -304,11 +304,12 @@ errorhandler=errorhandler) return res.encode('utf8'), size, len(res) -def str_decode_utf8(s, errors, final, errorhandler): +def str_decode_utf8(s, errors, final, errorhandler, allow_surrogates=False): """ Same as checking for the valid utf8, but we know the utf8 is not valid so we're trying to either raise or pack stuff with error handler. The key difference is that this is call_may_force """ + # XXX need to handle allow_surrogates slen = len(s) res = StringBuilder(slen) pos = 0 @@ -967,6 +968,32 @@ return result.build() +def encode_utf8(space, uni, allow_surrogates=False): + # Note that Python3 tends to forbid *all* surrogates in utf-8. + # If allow_surrogates=True, then revert to the Python 2 behavior + # which never raises UnicodeEncodeError. Surrogate pairs are then + # allowed, either paired or lone. A paired surrogate is considered + # like the non-BMP character it stands for. See also *_utf8sp(). + assert isinstance(uni, unicode) + return runicode.unicode_encode_utf_8( + uni, len(uni), "strict", + errorhandler=encode_error_handler(space), + allow_surrogates=allow_surrogates) + +def encode_utf8sp(space, uni): + # Surrogate-preserving utf-8 encoding. Any surrogate character + # turns into its 3-bytes encoding, whether it is paired or not. + # This should always be reversible, and the reverse is + # decode_utf8sp(). + return runicode.unicode_encode_utf8sp(uni, len(uni)) + +def decode_utf8sp(space, string): + # Surrogate-preserving utf-8 decoding. Assuming there is no + # encoding error, it should always be reversible, and the reverse is + # encode_utf8sp(). + return decode_utf8(space, string, allow_surrogates=True) + + # ____________________________________________________________ # utf-16 diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -86,7 +86,7 @@ newpos = -1 else: if newpos < 0: - newpos = length + newpos + newpos = length + newpos if newpos < 0 or newpos > length: raise oefmt(space.w_IndexError, "position %d from error handler out of bounds", diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py --- a/pypy/objspace/std/dictmultiobject.py +++ b/pypy/objspace/std/dictmultiobject.py @@ -12,7 +12,7 @@ from pypy.interpreter.mixedmodule import MixedModule from pypy.interpreter.signature import Signature from pypy.interpreter.typedef import TypeDef -from pypy.interpreter.unicodehelper import decode_utf8 +from pypy.interpreter.unicodehelper import str_decode_utf8 from pypy.objspace.std.util import negate @@ -1184,7 +1184,7 @@ # we should implement the same shortcuts as we do for BytesDictStrategy def decodekey_str(self, key): - return decode_utf8(self.space, key, allow_surrogates=True) + return str_decode_utf8(self.space, key, allow_surrogates=True) def setitem_str(self, w_dict, key, w_value): assert key is not None diff --git a/pypy/objspace/std/mapdict.py b/pypy/objspace/std/mapdict.py --- a/pypy/objspace/std/mapdict.py +++ b/pypy/objspace/std/mapdict.py @@ -4,7 +4,7 @@ from rpython.rlib.rarithmetic import intmask, r_uint from pypy.interpreter.baseobjspace import W_Root -from pypy.interpreter.unicodehelper import decode_utf8 +from pypy.interpreter.unicodehelper import str_decode_utf8 from pypy.objspace.std.dictmultiobject import ( W_DictMultiObject, DictStrategy, ObjectDictStrategy, BaseKeyIterator, BaseValueIterator, BaseItemIterator, _never_equal_to_string, @@ -433,7 +433,7 @@ def materialize_str_dict(self, space, obj, str_dict): new_obj = self.back.materialize_str_dict(space, obj, str_dict) if self.index == DICT: - uni_name = decode_utf8(space, self.name) + uni_name = str_decode_utf8(space, self.name) str_dict[uni_name] = obj._mapdict_read_storage(self.storageindex) else: self._copy_attr(obj, new_obj) diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -4,7 +4,7 @@ from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.function import Function, Method, FunctionWithFixedCode from pypy.interpreter.typedef import get_unique_interplevel_subclass -from pypy.interpreter.unicodehelper import decode_utf8 +from pypy.interpreter.unicodehelper import str_decode_utf8 from pypy.objspace.std import frame, transparent, callmethod from pypy.objspace.descroperation import ( DescrOperation, get_attribute_name, raiseattrerror) @@ -165,7 +165,7 @@ unicode_x = x.decode('ascii') except UnicodeDecodeError: return self._wrap_string_old(x) - return self.newunicode(unicode_x) + return self.newtext(unicode_x) if isinstance(x, unicode): x = x.encode('utf8') lgt = rutf8.check_utf8(x, True) @@ -192,7 +192,7 @@ else: lst.append(unichr(ch)) unicode_x = u''.join(lst) - return self.newunicode(unicode_x) + return self.newtext(unicode_x) @not_rpython # only for tests def _wrap_not_rpython(self, x): @@ -334,7 +334,7 @@ def newlist_text(self, list_t): return self.newlist_unicode([ - decode_utf8(self, s, allow_surrogates=True) for s in list_t]) + str_decode_utf8(self, s, allow_surrogates=True) for s in list_t]) def newlist_utf8(self, list_u, is_ascii): if is_ascii: @@ -388,7 +388,7 @@ return W_BytearrayObject(l) def newtext(self, s): - return self.newunicode(decode_utf8(self, s, allow_surrogates=True)) + return self.newtext(str_decode_utf8(self, s, allow_surrogates=True)) def newtext_or_none(self, s): if s is None: diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -6,8 +6,9 @@ from rpython.rlib.rarithmetic import ovfcheck from rpython.rlib.rstring import ( StringBuilder, split, rsplit, UnicodeBuilder, replace_count, startswith, - unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii, - unicode_encode_utf8_forbid_surrogates, SurrogateError, endswith) + endswith) +from rpython.rlib.runicode import ( + unicode_encode_utf8_forbid_surrogates, SurrogateError) from rpython.rlib import rutf8, jit from pypy.interpreter import unicodehelper @@ -1851,4 +1852,4 @@ return unicode_encode_utf8_forbid_surrogates(value, len(value)) _repr_function = rutf8.make_utf8_escape_function( - pass_printable=True, unicode_output=True, quotes=True, prefix='') + pass_printable=True, quotes=True, prefix='') _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit