Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r94984:07a4929a661d
Date: 2018-08-09 13:27 -0700
http://bitbucket.org/pypy/pypy/changeset/07a4929a661d/
Log: add a lgt arg to newtext, change error _compute_value accordingly
diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py
--- a/pypy/interpreter/error.py
+++ b/pypy/interpreter/error.py
@@ -9,7 +9,7 @@
from rpython.rlib.objectmodel import we_are_translated, specialize
from rpython.rlib.objectmodel import dont_inline, not_rpython
from rpython.rlib import rstack, rstackovf
-from rpython.rlib import rwin32
+from rpython.rlib import rwin32, runicode
from pypy.interpreter import debug
@@ -71,7 +71,7 @@
space = getattr(self.w_type, 'space', None)
if space is not None:
if self.__class__ is not OperationError and s is None:
- s = self._compute_value(space)
+ s, lgt = self._compute_value(space)
try:
s = space.text_w(s)
except Exception:
@@ -305,8 +305,8 @@
def get_w_value(self, space):
w_value = self._w_value
if w_value is None:
- value = self._compute_value(space)
- self._w_value = w_value = space.newtext(value)
+ value, lgt = self._compute_value(space)
+ self._w_value = w_value = space.newtext(value, lgt)
return w_value
def _compute_value(self, space):
@@ -477,10 +477,10 @@
if isinstance(string, unicode):
return string
assert isinstance(string, str)
- return string.decode('utf8')
- #result, consumed = runicode.str_decode_utf_8(
- # string, len(string), "replace", final=True)
- #return result
+ #return string.decode('utf8')
+ result, consumed = runicode.str_decode_utf_8(
+ string, len(string), "replace", final=True)
+ return result
def get_operrcls2(valuefmt):
valuefmt = valuefmt.decode('ascii')
@@ -502,6 +502,7 @@
self.setup(w_type)
def _compute_value(self, space):
+ # TODO: avoid utf8->unicode->utf8 dance
lst = [None] * (len(formats) + len(formats) + 1)
for i, fmt, attr in entries:
lst[i + i] = self.xstrings[i]
@@ -523,7 +524,8 @@
elif fmt == '8':
# u'str\uxxxx' -> 'str\xXX\xXX' -> u"'str\xXX\xXX'"
if isinstance(value, unicode):
- result = value.encode('utf8')
+ result = runicode.unicode_encode_utf_8(value,
+ len(value), 'strict', allow_surrogates=True)
else:
from pypy.interpreter import unicodehelper
result = _decode_utf8(unicodehelper.str_decode_utf8(
@@ -536,7 +538,12 @@
result = _decode_utf8(str(value))
lst[i + i + 1] = result
lst[-1] = self.xstrings[-1]
- return u''.join(lst)
+ retval = u''.join(lst)
+ # We need to annotate both allow_surrogates=True,False
+ # since this function is used to replace uni.encode('utf8')
+ # deep in rpython
+ return runicode.unicode_encode_utf_8(retval, len(retval),
+ 'strict', allow_surrogates=False), len(retval)
#
_fmtcache2[formats] = OpErrFmt
return OpErrFmt, strings
@@ -547,7 +554,7 @@
self.setup(w_type)
def _compute_value(self, space):
- return self._value.decode('utf-8')
+ return self._value, len(self._value)
def async(self, space):
# also matches a RuntimeError("maximum rec.") if the stack is
@@ -639,7 +646,7 @@
msg = u'Windows Error %d' % winerror
w_errno = space.w_None
w_winerror = space.newint(winerror)
- w_msg = space.newtext(msg)
+ w_msg = space.newtext(msg.encode('utf8'), len(msg))
else:
errno = e.errno
if errno == EINTR:
@@ -653,7 +660,7 @@
msg = u'error %d' % errno
w_errno = space.newint(errno)
w_winerror = space.w_None
- w_msg = space.newtext(msg)
+ w_msg = space.newtext(msg.encode('utf8'), len(msg))
if w_filename is None:
w_filename = space.w_None
diff --git a/pypy/interpreter/gateway.py b/pypy/interpreter/gateway.py
--- a/pypy/interpreter/gateway.py
+++ b/pypy/interpreter/gateway.py
@@ -1122,7 +1122,7 @@
kw_defs_w = []
for name, w_def in sorted(alldefs_w.items()):
assert name in sig.kwonlyargnames
- w_name = space.newtext(name.decode('utf-8'))
+ w_name = space.newtext(name)
kw_defs_w.append((w_name, w_def))
return defs_w, kw_defs_w
diff --git a/pypy/interpreter/pyparser/error.py b/pypy/interpreter/pyparser/error.py
--- a/pypy/interpreter/pyparser/error.py
+++ b/pypy/interpreter/pyparser/error.py
@@ -46,7 +46,7 @@
if len(self.text) != offset:
text, _ = str_decode_utf_8_impl(self.text, len(self.text),
'replace', False, replace_error_handler, True)
- w_text = space.newtext(text)
+ w_text = space.newtext(text.encode('utf8'), len(text))
return space.newtuple([
space.newtext(self.msg),
space.newtuple([
diff --git a/pypy/interpreter/test/test_argument.py b/pypy/interpreter/test/test_argument.py
--- a/pypy/interpreter/test/test_argument.py
+++ b/pypy/interpreter/test/test_argument.py
@@ -92,7 +92,7 @@
def getitem(self, obj, key):
return obj[key]
- def wrap(self, obj):
+ def wrap(self, obj, lgt=-1):
return obj
newtext = wrap
diff --git a/pypy/interpreter/test/test_error.py b/pypy/interpreter/test/test_error.py
--- a/pypy/interpreter/test/test_error.py
+++ b/pypy/interpreter/test/test_error.py
@@ -133,7 +133,7 @@
w_OSError = [OSError]
w_EnvironmentError = [EnvironmentError]
w_None = None
- def wrap(self, obj):
+ def wrap(self, obj, lgt=-1):
return [obj]
newint = newtext = newfilename = wrap
def call_function(self, exc, w_errno, w_msg, w_filename=None, *args):
diff --git a/pypy/interpreter/test/test_fsencode.py b/pypy/interpreter/test/test_fsencode.py
--- a/pypy/interpreter/test/test_fsencode.py
+++ b/pypy/interpreter/test/test_fsencode.py
@@ -70,7 +70,7 @@
strs.append(self.special_char)
for st in strs:
# check roundtrip
- w_st = space.newtext(st)
+ w_st = space.newtext(st.encode('utf8'), len(st))
w_enc = space.fsencode(w_st)
w_st2 = space.fsdecode(w_enc)
assert space.eq_w(w_st, w_st2)
@@ -81,7 +81,8 @@
def test_null_byte(self):
space = self.space
- w_u = space.newtext(u'abc\x00def')
+ uni = u'abc\x00def'
+ w_u = space.newtext(uni.encode('utf8'), len(uni))
# this can behave in two different ways depending on how
# much initialized the space is: space.fsencode() can raise
# ValueError directly, or return a wrapped bytes with the 0
@@ -94,7 +95,7 @@
if self.special_char:
strs.append(self.special_char)
for st in strs:
- w_st = space.newtext(st)
+ w_st = space.newtext(st.encode('utf8'), len(st))
w_enc = space.fsencode(w_st)
space.appexec([w_st, w_enc], """(u, s):
import __pypy__
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -95,7 +95,8 @@
return space.call_method(w_string, 'decode',
getfilesystemencoding(space),
space.newtext('surrogateescape'))
- return space.newtext(uni)
+ return space.newtext(runicode.unicode_encode_utf_8(uni,
+ len(uni), 'strict', allow_surrogates=True), len(uni))
def fsencode(space, w_uni):
from pypy.module._codecs import interp_codecs
@@ -373,7 +374,7 @@
if not final:
pos -= 1
break
- r, pos, lgt = errorhandler(errors, "utf8", "unexpected end of data",
+ r, pos = errorhandler(errors, "utf8", "unexpected end of data",
s, pos - 1, pos + 1)
res.append(r)
continue
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -50,9 +50,9 @@
length = len(input)
else:
w_cls = space.w_UnicodeEncodeError
- length = len(input)
- assert isinstance(input, unicode)
- w_input = space.newtext((input.encode('utf8'), length, length))
+ assert isinstance(input, str)
+ length = rutf8.codepoints_in_utf8(input)
+ w_input = space.newtext(input, length)
w_exc = space.call_function(
w_cls,
space.newtext(encoding),
@@ -441,7 +441,7 @@
ch = 0
if ch == 0:
raise OperationError(space.type(w_exc), w_exc)
- return space.newtuple([space.newtext(unichr(ch)),
+ return space.newtuple([space.newtext(unichr(ch).encode('utf8'), 1),
space.newint(start + bytelength)])
else:
raise oefmt(space.w_TypeError,
@@ -480,7 +480,7 @@
if not consumed:
# codec complained about ASCII byte.
raise OperationError(space.type(w_exc), w_exc)
- return space.newtuple([space.newtext(replace),
+ return space.newtuple([space.newtext(replace.encode('utf8'), len(replace)),
space.newint(start + consumed)])
else:
raise oefmt(space.w_TypeError,
@@ -723,9 +723,6 @@
if errors is None:
errors = 'strict'
state = space.fromcache(CodecState)
- #result = runicode.unicode_encode_utf_8_impl(
- # utf8, lgt, errors, state.encode_error_handler,
- # allow_surrogates=False)
result = unicodehelper.utf8_encode_utf_8(utf8, errors,
state.encode_error_handler, allow_surrogates=False)
return space.newtuple([space.newbytes(result), space.newint(lgt)])
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -41,7 +41,8 @@
if isinstance(ctx, rsre_core.StrMatchContext):
return space.newbytes(ctx._string[start:end])
elif isinstance(ctx, rsre_core.UnicodeMatchContext):
- return space.newtext(ctx._unicodestr[start:end])
+ uni = ctx._unicodestr[start:end]
+ return space.newtext(uni.encode('utf8'), len(uni))
else:
# unreachable
raise SystemError
diff --git a/pypy/objspace/fake/objspace.py b/pypy/objspace/fake/objspace.py
--- a/pypy/objspace/fake/objspace.py
+++ b/pypy/objspace/fake/objspace.py
@@ -218,8 +218,7 @@
def newutf8(self, x, l):
return w_some_obj()
- @specialize.argtype(1)
- def newtext(self, x):
+ def newtext(self, x, lgt=-1):
return w_some_obj()
newtext_or_none = newtext
newfilename = newtext
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -381,27 +381,22 @@
return W_BytearrayObject(l)
@specialize.argtype(1)
- def newtext(self, s):
+ def newtext(self, s, lgt=-1):
if isinstance(s, unicode):
s, lgt = s.encode('utf8'), len(s)
- elif isinstance(s, str):
- s, lgt, codepoints = decode_utf8sp(self, s)
+ elif isinstance(s, str) and lgt < 0:
+ lgt = rutf8.codepoints_in_utf8(s)
elif isinstance(s, tuple):
# result of decode_utf8
s, lgt, codepoints = s
- else:
- # XXX what is s ?
- lgt = rutf8.check_utf8(s, True)
assert isinstance(s, str)
return W_UnicodeObject(s, lgt)
- def newtext_or_none(self, s):
+ def newtext_or_none(self, s, lgt=-1):
if s is None:
return self.w_None
- return self.newtext(s)
+ return self.newtext(s, lgt)
- # XXX find where length is annotated as negative int
- #@signature(types.any(), types.str(), types.int_nonneg(), returns=types.any())
def newutf8(self, utf8s, length):
assert isinstance(utf8s, str)
return W_UnicodeObject(utf8s, length)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -271,13 +271,13 @@
return w_new
def descr_repr(self, space):
- return space.newtext(_repr_function(self._utf8))
+ return space.newtext(_repr_function(self._utf8)) # quotes=True
def descr_str(self, space):
if space.is_w(space.type(self), space.w_unicode):
return self
# Subtype -- return genuine unicode string with the same value.
- return space.newtext(space.utf8_w(self))
+ return space.newtext(space.utf8_w(self), space.len_w(self))
def descr_hash(self, space):
x = compute_hash(self._utf8)
@@ -343,7 +343,7 @@
def _parse_format_arg(self, space, w_kwds, __args__):
for i in range(len(__args__.keywords)):
try: # pff
- arg = __args__.keywords[i].decode('utf-8')
+ arg = __args__.keywords[i]
except UnicodeDecodeError:
continue # uh, just skip that
space.setitem(w_kwds, space.newtext(arg),
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit