Author: Matti Picus <matti.pi...@gmail.com>
Branch: unicode-utf8-py3
Changeset: r94984:07a4929a661d
Date: 2018-08-09 13:27 -0700
http://bitbucket.org/pypy/pypy/changeset/07a4929a661d/
Log: add a lgt arg to newtext, change error _compute_value accordingly diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py --- a/pypy/interpreter/error.py +++ b/pypy/interpreter/error.py @@ -9,7 +9,7 @@ from rpython.rlib.objectmodel import we_are_translated, specialize from rpython.rlib.objectmodel import dont_inline, not_rpython from rpython.rlib import rstack, rstackovf -from rpython.rlib import rwin32 +from rpython.rlib import rwin32, runicode from pypy.interpreter import debug @@ -71,7 +71,7 @@ space = getattr(self.w_type, 'space', None) if space is not None: if self.__class__ is not OperationError and s is None: - s = self._compute_value(space) + s, lgt = self._compute_value(space) try: s = space.text_w(s) except Exception: @@ -305,8 +305,8 @@ def get_w_value(self, space): w_value = self._w_value if w_value is None: - value = self._compute_value(space) - self._w_value = w_value = space.newtext(value) + value, lgt = self._compute_value(space) + self._w_value = w_value = space.newtext(value, lgt) return w_value def _compute_value(self, space): @@ -477,10 +477,10 @@ if isinstance(string, unicode): return string assert isinstance(string, str) - return string.decode('utf8') - #result, consumed = runicode.str_decode_utf_8( - # string, len(string), "replace", final=True) - #return result + #return string.decode('utf8') + result, consumed = runicode.str_decode_utf_8( + string, len(string), "replace", final=True) + return result def get_operrcls2(valuefmt): valuefmt = valuefmt.decode('ascii') @@ -502,6 +502,7 @@ self.setup(w_type) def _compute_value(self, space): + # TODO: avoid utf8->unicode->utf8 dance lst = [None] * (len(formats) + len(formats) + 1) for i, fmt, attr in entries: lst[i + i] = self.xstrings[i] @@ -523,7 +524,8 @@ elif fmt == '8': # u'str\uxxxx' -> 'str\xXX\xXX' -> u"'str\xXX\xXX'" if isinstance(value, unicode): - result = value.encode('utf8') + result = runicode.unicode_encode_utf_8(value, + len(value), 'strict', allow_surrogates=True) 
else: from pypy.interpreter import unicodehelper result = _decode_utf8(unicodehelper.str_decode_utf8( @@ -536,7 +538,12 @@ result = _decode_utf8(str(value)) lst[i + i + 1] = result lst[-1] = self.xstrings[-1] - return u''.join(lst) + retval = u''.join(lst) + # We need to annotate both allow_surrogates=True,False + # since this function is used to replace uni.encode('utf8') + # deep in rpython + return runicode.unicode_encode_utf_8(retval, len(retval), + 'strict', allow_surrogates=False), len(retval) # _fmtcache2[formats] = OpErrFmt return OpErrFmt, strings @@ -547,7 +554,7 @@ self.setup(w_type) def _compute_value(self, space): - return self._value.decode('utf-8') + return self._value, len(self._value) def async(self, space): # also matches a RuntimeError("maximum rec.") if the stack is @@ -639,7 +646,7 @@ msg = u'Windows Error %d' % winerror w_errno = space.w_None w_winerror = space.newint(winerror) - w_msg = space.newtext(msg) + w_msg = space.newtext(msg.encode('utf8'), len(msg)) else: errno = e.errno if errno == EINTR: @@ -653,7 +660,7 @@ msg = u'error %d' % errno w_errno = space.newint(errno) w_winerror = space.w_None - w_msg = space.newtext(msg) + w_msg = space.newtext(msg.encode('utf8'), len(msg)) if w_filename is None: w_filename = space.w_None diff --git a/pypy/interpreter/gateway.py b/pypy/interpreter/gateway.py --- a/pypy/interpreter/gateway.py +++ b/pypy/interpreter/gateway.py @@ -1122,7 +1122,7 @@ kw_defs_w = [] for name, w_def in sorted(alldefs_w.items()): assert name in sig.kwonlyargnames - w_name = space.newtext(name.decode('utf-8')) + w_name = space.newtext(name) kw_defs_w.append((w_name, w_def)) return defs_w, kw_defs_w diff --git a/pypy/interpreter/pyparser/error.py b/pypy/interpreter/pyparser/error.py --- a/pypy/interpreter/pyparser/error.py +++ b/pypy/interpreter/pyparser/error.py @@ -46,7 +46,7 @@ if len(self.text) != offset: text, _ = str_decode_utf_8_impl(self.text, len(self.text), 'replace', False, replace_error_handler, True) - w_text = 
space.newtext(text) + w_text = space.newtext(text.encode('utf8'), len(text)) return space.newtuple([ space.newtext(self.msg), space.newtuple([ diff --git a/pypy/interpreter/test/test_argument.py b/pypy/interpreter/test/test_argument.py --- a/pypy/interpreter/test/test_argument.py +++ b/pypy/interpreter/test/test_argument.py @@ -92,7 +92,7 @@ def getitem(self, obj, key): return obj[key] - def wrap(self, obj): + def wrap(self, obj, lgt=-1): return obj newtext = wrap diff --git a/pypy/interpreter/test/test_error.py b/pypy/interpreter/test/test_error.py --- a/pypy/interpreter/test/test_error.py +++ b/pypy/interpreter/test/test_error.py @@ -133,7 +133,7 @@ w_OSError = [OSError] w_EnvironmentError = [EnvironmentError] w_None = None - def wrap(self, obj): + def wrap(self, obj, lgt=-1): return [obj] newint = newtext = newfilename = wrap def call_function(self, exc, w_errno, w_msg, w_filename=None, *args): diff --git a/pypy/interpreter/test/test_fsencode.py b/pypy/interpreter/test/test_fsencode.py --- a/pypy/interpreter/test/test_fsencode.py +++ b/pypy/interpreter/test/test_fsencode.py @@ -70,7 +70,7 @@ strs.append(self.special_char) for st in strs: # check roundtrip - w_st = space.newtext(st) + w_st = space.newtext(st.encode('utf8'), len(st)) w_enc = space.fsencode(w_st) w_st2 = space.fsdecode(w_enc) assert space.eq_w(w_st, w_st2) @@ -81,7 +81,8 @@ def test_null_byte(self): space = self.space - w_u = space.newtext(u'abc\x00def') + uni = u'abc\x00def' + w_u = space.newtext(uni.encode('utf8'), len(uni)) # this can behave in two different ways depending on how # much initialized the space is: space.fsencode() can raise # ValueError directly, or return a wrapped bytes with the 0 @@ -94,7 +95,7 @@ if self.special_char: strs.append(self.special_char) for st in strs: - w_st = space.newtext(st) + w_st = space.newtext(st.encode('utf8'), len(st)) w_enc = space.fsencode(w_st) space.appexec([w_st, w_enc], """(u, s): import __pypy__ diff --git a/pypy/interpreter/unicodehelper.py 
b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -95,7 +95,8 @@ return space.call_method(w_string, 'decode', getfilesystemencoding(space), space.newtext('surrogateescape')) - return space.newtext(uni) + return space.newtext(runicode.unicode_encode_utf_8(uni, + len(uni), 'strict', allow_surrogates=True), len(uni)) def fsencode(space, w_uni): from pypy.module._codecs import interp_codecs @@ -373,7 +374,7 @@ if not final: pos -= 1 break - r, pos, lgt = errorhandler(errors, "utf8", "unexpected end of data", + r, pos = errorhandler(errors, "utf8", "unexpected end of data", s, pos - 1, pos + 1) res.append(r) continue diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -50,9 +50,9 @@ length = len(input) else: w_cls = space.w_UnicodeEncodeError - length = len(input) - assert isinstance(input, unicode) - w_input = space.newtext((input.encode('utf8'), length, length)) + assert isinstance(input, str) + length = rutf8.codepoints_in_utf8(input) + w_input = space.newtext(input, length) w_exc = space.call_function( w_cls, space.newtext(encoding), @@ -441,7 +441,7 @@ ch = 0 if ch == 0: raise OperationError(space.type(w_exc), w_exc) - return space.newtuple([space.newtext(unichr(ch)), + return space.newtuple([space.newtext(unichr(ch).encode('utf8'), 1), space.newint(start + bytelength)]) else: raise oefmt(space.w_TypeError, @@ -480,7 +480,7 @@ if not consumed: # codec complained about ASCII byte. 
raise OperationError(space.type(w_exc), w_exc) - return space.newtuple([space.newtext(replace), + return space.newtuple([space.newtext(replace.encode('utf8'), len(replace)), space.newint(start + consumed)]) else: raise oefmt(space.w_TypeError, @@ -723,9 +723,6 @@ if errors is None: errors = 'strict' state = space.fromcache(CodecState) - #result = runicode.unicode_encode_utf_8_impl( - # utf8, lgt, errors, state.encode_error_handler, - # allow_surrogates=False) result = unicodehelper.utf8_encode_utf_8(utf8, errors, state.encode_error_handler, allow_surrogates=False) return space.newtuple([space.newbytes(result), space.newint(lgt)]) diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py --- a/pypy/module/_sre/interp_sre.py +++ b/pypy/module/_sre/interp_sre.py @@ -41,7 +41,8 @@ if isinstance(ctx, rsre_core.StrMatchContext): return space.newbytes(ctx._string[start:end]) elif isinstance(ctx, rsre_core.UnicodeMatchContext): - return space.newtext(ctx._unicodestr[start:end]) + uni = ctx._unicodestr[start:end] + return space.newtext(uni.encode('utf8'), len(uni)) else: # unreachable raise SystemError diff --git a/pypy/objspace/fake/objspace.py b/pypy/objspace/fake/objspace.py --- a/pypy/objspace/fake/objspace.py +++ b/pypy/objspace/fake/objspace.py @@ -218,8 +218,7 @@ def newutf8(self, x, l): return w_some_obj() - @specialize.argtype(1) - def newtext(self, x): + def newtext(self, x, lgt=-1): return w_some_obj() newtext_or_none = newtext newfilename = newtext diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -381,27 +381,22 @@ return W_BytearrayObject(l) @specialize.argtype(1) - def newtext(self, s): + def newtext(self, s, lgt=-1): if isinstance(s, unicode): s, lgt = s.encode('utf8'), len(s) - elif isinstance(s, str): - s, lgt, codepoints = decode_utf8sp(self, s) + elif isinstance(s, str) and lgt < 0: + lgt = rutf8.codepoints_in_utf8(s) elif isinstance(s, 
tuple): # result of decode_utf8 s, lgt, codepoints = s - else: - # XXX what is s ? - lgt = rutf8.check_utf8(s, True) assert isinstance(s, str) return W_UnicodeObject(s, lgt) - def newtext_or_none(self, s): + def newtext_or_none(self, s, lgt=-1): if s is None: return self.w_None - return self.newtext(s) + return self.newtext(s, lgt) - # XXX find where length is annotated as negative int - #@signature(types.any(), types.str(), types.int_nonneg(), returns=types.any()) def newutf8(self, utf8s, length): assert isinstance(utf8s, str) return W_UnicodeObject(utf8s, length) diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -271,13 +271,13 @@ return w_new def descr_repr(self, space): - return space.newtext(_repr_function(self._utf8)) + return space.newtext(_repr_function(self._utf8)) # quotes=True def descr_str(self, space): if space.is_w(space.type(self), space.w_unicode): return self # Subtype -- return genuine unicode string with the same value. - return space.newtext(space.utf8_w(self)) + return space.newtext(space.utf8_w(self), space.len_w(self)) def descr_hash(self, space): x = compute_hash(self._utf8) @@ -343,7 +343,7 @@ def _parse_format_arg(self, space, w_kwds, __args__): for i in range(len(__args__.keywords)): try: # pff - arg = __args__.keywords[i].decode('utf-8') + arg = __args__.keywords[i] except UnicodeDecodeError: continue # uh, just skip that space.setitem(w_kwds, space.newtext(arg), _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit