Author: fijal
Branch: unicode-utf8
Changeset: r90350:ba2214055259
Date: 2017-02-25 14:05 +0100
http://bitbucket.org/pypy/pypy/changeset/ba2214055259/
Log: Implement the 'utf8' gateway unwrap_spec: the argument is passed as
its UTF-8 string, followed by its unicode length as a second parameter.
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -264,6 +264,9 @@
def utf8_w(self, space):
self._typed_unwrap_error(space, "unicode")
+ def convert_to_w_unicode(self, space):
+ self._typed_unwrap_error(space, "unicode")
+
def bytearray_list_of_chars_w(self, space):
self._typed_unwrap_error(space, "bytearray")
@@ -1676,8 +1679,12 @@
def utf8_w(self, w_obj):
return w_obj.utf8_w(self)
+ def convert_to_w_unicode(self, w_obj):
+ return w_obj.convert_to_w_unicode(self)
+
def unicode0_w(self, w_obj):
"Like unicode_w, but rejects strings with NUL bytes."
+ xxxx
from rpython.rlib import rstring
result = w_obj.unicode_w(self)
if u'\x00' in result:
@@ -1689,6 +1696,7 @@
def realunicode_w(self, w_obj):
# Like unicode_w, but only works if w_obj is really of type
# 'unicode'.
+ xxx
if not self.isinstance_w(w_obj, self.w_unicode):
raise oefmt(self.w_TypeError, "argument must be a unicode")
return self.unicode_w(w_obj)
diff --git a/pypy/interpreter/gateway.py b/pypy/interpreter/gateway.py
--- a/pypy/interpreter/gateway.py
+++ b/pypy/interpreter/gateway.py
@@ -157,6 +157,9 @@
def visit_text(self, el, app_sig):
self.checked_space_method(el, app_sig)
+ def visit_utf8(self, el, app_sig):
+ self.checked_space_method(el, app_sig)
+
def visit_nonnegint(self, el, app_sig):
self.checked_space_method(el, app_sig)
@@ -235,6 +238,7 @@
def __init__(self):
UnwrapSpecEmit.__init__(self)
self.run_args = []
+ self.extracode = []
def scopenext(self):
return "scope_w[%d]" % self.succ()
@@ -292,6 +296,13 @@
def visit_text(self, typ):
self.run_args.append("space.text_w(%s)" % (self.scopenext(),))
+ def visit_utf8(self, typ):
+ name = 'w_uni%d' % self.n
+ self.extracode.append('%s = space.convert_to_w_unicode(%s)' %
+ (name, self.scopenext()))
+ self.run_args.append("space.utf8_w(%s)" % (name,))
+ self.run_args.append("%s._length" % (name,))
+
def visit_nonnegint(self, typ):
self.run_args.append("space.gateway_nonnegint_w(%s)" % (
self.scopenext(),))
@@ -340,8 +351,9 @@
d = {}
source = """if 1:
def _run(self, space, scope_w):
+ %s
return self.behavior(%s)
- \n""" % (', '.join(self.run_args),)
+ \n""" % ("\n".join(self.extracode), ', '.join(self.run_args))
exec compile2(source) in self.miniglobals, d
activation_cls = type("BuiltinActivation_UwS_%s" % label,
@@ -382,6 +394,7 @@
UnwrapSpecEmit.__init__(self)
self.args = []
self.unwrap = []
+ self.extracode = []
self.finger = 0
def dispatch(self, el, *args):
@@ -448,6 +461,13 @@
def visit_text(self, typ):
self.unwrap.append("space.text_w(%s)" % (self.nextarg(),))
+ def visit_utf8(self, typ):
+ name = 'w_uni%d' % self.n
+ self.extracode.append('%s = space.convert_to_w_unicode(%s)' %
+ (name, self.nextarg()))
+ self.unwrap.append("space.utf8_w(%s)" % (name,))
+ self.unwrap.append("%s._length" % (name,))
+
def visit_nonnegint(self, typ):
self.unwrap.append("space.gateway_nonnegint_w(%s)" % (self.nextarg(),))
@@ -472,6 +492,7 @@
def visit_truncatedint_w(self, typ):
self.unwrap.append("space.truncatedint_w(%s)" % (self.nextarg(),))
+ @staticmethod
def make_fastfunc(unwrap_spec, func):
unwrap_info = UnwrapSpec_FastFunc_Unwrap()
unwrap_info.apply_over(unwrap_spec)
@@ -495,21 +516,21 @@
unwrap_info.miniglobals['func'] = func
source = """if 1:
def fastfunc_%s_%d(%s):
+ %s
return func(%s)
\n""" % (func.__name__.replace('-', '_'), narg,
- ', '.join(args),
+ ', '.join(args), '\n'.join(unwrap_info.extracode),
', '.join(unwrap_info.unwrap))
exec compile2(source) in unwrap_info.miniglobals, d
fastfunc = d['fastfunc_%s_%d' % (func.__name__.replace('-', '_'),
narg)]
return narg, fastfunc
- make_fastfunc = staticmethod(make_fastfunc)
def int_unwrapping_space_method(typ):
- assert typ in (int, str, float, unicode, r_longlong, r_uint, r_ulonglong,
bool)
+ assert typ in (int, str, float, r_longlong, r_uint, r_ulonglong, bool)
if typ is r_int is r_longlong:
return 'gateway_r_longlong_w'
- elif typ in (str, unicode, bool):
+ elif typ in (str, bool):
return typ.__name__ + '_w'
else:
return 'gateway_' + typ.__name__ + '_w'
@@ -533,6 +554,13 @@
return func
return decorator
+def unwrap_count_len(spec):
+ lgt = len(spec)
+ for item in spec:
+ if item == 'utf8':
+ lgt += 1
+ return lgt
+
class WrappedDefault(object):
""" Can be used inside unwrap_spec as WrappedDefault(3) which means
it'll be treated as W_Root, but fed with default which will be a wrapped
@@ -1004,7 +1032,7 @@
code = self._code
assert isinstance(code._unwrap_spec, (list, tuple))
assert isinstance(code._argnames, list)
- assert len(code._unwrap_spec) == len(code._argnames)
+ assert unwrap_count_len(code._unwrap_spec) == len(code._argnames)
for i in range(len(code._unwrap_spec)-1, -1, -1):
spec = code._unwrap_spec[i]
argname = code._argnames[i]
diff --git a/pypy/interpreter/test/test_gateway.py b/pypy/interpreter/test/test_gateway.py
--- a/pypy/interpreter/test/test_gateway.py
+++ b/pypy/interpreter/test/test_gateway.py
@@ -535,21 +535,22 @@
w_app_g3_r = space.wrap(app_g3_r)
raises(gateway.OperationError,space.call_function,w_app_g3_r,w(1.0))
- def test_interp2app_unwrap_spec_unicode(self):
+ def test_interp2app_unwrap_spec_utf8(self):
space = self.space
w = space.wrap
- def g3_u(space, uni):
- return space.wrap(len(uni))
+ def g3_u(space, utf8, utf8len):
+ return space.newtuple([space.wrap(len(utf8)), space.wrap(utf8len)])
app_g3_u = gateway.interp2app_temp(g3_u,
unwrap_spec=[gateway.ObjSpace,
- unicode])
+ 'utf8'])
w_app_g3_u = space.wrap(app_g3_u)
+ encoded = u"gęść".encode('utf8')
assert self.space.eq_w(
- space.call_function(w_app_g3_u, w(u"foo")),
- w(3))
+ space.call_function(w_app_g3_u, w(u"gęść")),
+ space.newtuple([w(len(encoded)), w(4)]))
assert self.space.eq_w(
- space.call_function(w_app_g3_u, w("baz")),
- w(3))
+ space.call_function(w_app_g3_u, w("foo")),
+ space.newtuple([w(3), w(3)]))
raises(gateway.OperationError, space.call_function, w_app_g3_u,
w(None))
raises(gateway.OperationError, space.call_function, w_app_g3_u,
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -373,14 +373,15 @@
def make_encoder_wrapper(name):
rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
assert hasattr(runicode, rname)
- @unwrap_spec(uni=unicode, errors='str_or_None')
- def wrap_encoder(space, uni, errors="strict"):
+ @unwrap_spec(uni='utf8', errors='str_or_None')
+ def wrap_encoder(space, utf8, utf8len, errors="strict"):
if errors is None:
errors = 'strict'
state = space.fromcache(CodecState)
func = getattr(runicode, rname)
- result = func(uni, len(uni), errors, state.encode_error_handler)
- return space.newtuple([space.newbytes(result), space.newint(len(uni))])
+ result = func(utf8, len(utf8), utf8len,
+ errors, state.encode_error_handler)
+ return space.newtuple([space.newbytes(result), space.newint(utf8len)])
wrap_encoder.func_name = rname
globals()[name] = wrap_encoder
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -1,6 +1,6 @@
"""The builtin str implementation"""
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
from rpython.rlib.objectmodel import (
compute_hash, compute_unique_id, import_from_mixin)
from rpython.rlib.buffer import StringBuffer
@@ -52,9 +52,17 @@
uid = (base << IDTAG_SHIFT) | IDTAG_SPECIAL
return space.newint(uid)
- def unicode_w(self, space):
+ def convert_to_w_unicode(self, space):
# Use the default encoding.
encoding = getdefaultencoding(space)
+ if encoding == 'ascii':
+ try:
+ rutf8.check_ascii(self._value)
+ return space.newutf8(self._value, len(self._value))
+ except rutf8.AsciiCheckError:
+ xxx
+ else:
+ xxx
return space.unicode_w(decode_object(space, self, encoding, None))
def descr_add(self, space, w_other):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -136,16 +136,19 @@
@staticmethod
def _op_val(space, w_other, strict=None):
- return W_UnicodeObject._convert_to_unicode(space,
w_other)._utf8.decode('utf8')
+ return W_UnicodeObject.convert_arg_to_w_unicode(space,
w_other)._utf8.decode('utf8')
@staticmethod
- def _convert_to_unicode(space, w_other):
+ def convert_arg_to_w_unicode(space, w_other):
if isinstance(w_other, W_UnicodeObject):
return w_other
if space.isinstance_w(w_other, space.w_bytes):
return unicode_from_string(space, w_other)
return unicode_from_encoded_object(space, w_other, None, "strict")
+ def convert_to_w_unicode(self, space):
+ return self
+
def _chr(self, char):
assert len(char) == 1
return char[0]
@@ -249,7 +252,7 @@
def descr_eq(self, space, w_other):
try:
- res = self._utf8 == self._convert_to_unicode(space, w_other)._utf8
+ res = self._utf8 == self.convert_arg_to_w_unicode(space,
w_other)._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -265,7 +268,7 @@
def descr_ne(self, space, w_other):
try:
- res = self._utf8 != self._convert_to_unicode(space, w_other)._utf8
+ res = self._utf8 != self.convert_arg_to_w_unicode(space,
w_other)._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -281,7 +284,7 @@
def descr_lt(self, space, w_other):
try:
- res = self._utf8 < self._convert_to_unicode(space, w_other)._utf8
+ res = self._utf8 < self.convert_arg_to_w_unicode(space,
w_other)._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -290,7 +293,7 @@
def descr_le(self, space, w_other):
try:
- res = self._utf8 <= self._convert_to_unicode(space, w_other)._utf8
+ res = self._utf8 <= self.convert_arg_to_w_unicode(space,
w_other)._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -299,7 +302,7 @@
def descr_gt(self, space, w_other):
try:
- res = self._utf8 > self._convert_to_unicode(space, w_other)._utf8
+ res = self._utf8 > self.convert_arg_to_w_unicode(space,
w_other)._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -308,7 +311,7 @@
def descr_ge(self, space, w_other):
try:
- res = self._utf8 >= self._convert_to_unicode(space, w_other)._utf8
+ res = self._utf8 >= self.convert_arg_to_w_unicode(space,
w_other)._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -431,7 +434,7 @@
def descr_add(self, space, w_other):
try:
- w_other = self._convert_to_unicode(space, w_other)
+ w_other = self.convert_arg_to_w_unicode(space, w_other)
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -458,7 +461,7 @@
return self._join_autoconvert(space, list_w)
# XXX Maybe the extra copy here is okay? It was basically going to
# happen anyway, what with being placed into the builder
- w_u = self._convert_to_unicode(space, w_s)
+ w_u = self.convert_arg_to_w_unicode(space, w_s)
unwrapped.append(w_u._utf8)
lgt += w_u._length
prealloc_size += len(unwrapped[i])
@@ -525,7 +528,7 @@
res = split(value, maxsplit=maxsplit)
return space.newlist([W_UnicodeObject(s, -1) for s in res])
- by = self._convert_to_unicode(space, w_sep)._utf8
+ by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
if len(by) == 0:
raise oefmt(space.w_ValueError, "empty separator")
res = split(value, by, maxsplit)
@@ -540,7 +543,7 @@
res = rsplit(value, maxsplit=maxsplit)
return space.newlist([W_UnicodeObject(s, -1) for s in res])
- by = self._convert_to_unicode(space, w_sep)._utf8
+ by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
if len(by) == 0:
raise oefmt(space.w_ValueError, "empty separator")
res = rsplit(value, by, maxsplit)
@@ -550,7 +553,7 @@
@unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
def descr_center(self, space, width, w_fillchar):
value = self._utf8
- fillchar = self._convert_to_unicode(space, w_fillchar)._utf8
+ fillchar = self.convert_arg_to_w_unicode(space, w_fillchar)._utf8
if len(fillchar) != 1:
raise oefmt(space.w_TypeError,
"center() argument 2 must be a single character")
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit