Author: fijal
Branch: unicode-utf8
Changeset: r90350:ba2214055259
Date: 2017-02-25 14:05 +0100
http://bitbucket.org/pypy/pypy/changeset/ba2214055259/

Log:    Implement the 'utf8' gateway unwrap spec, which passes the UTF-8
        string followed by its unicode length as a second parameter.

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -264,6 +264,9 @@
     def utf8_w(self, space):
         self._typed_unwrap_error(space, "unicode")
 
+    def convert_to_w_unicode(self, space):
+        self._typed_unwrap_error(space, "unicode")        
+
     def bytearray_list_of_chars_w(self, space):
         self._typed_unwrap_error(space, "bytearray")
 
@@ -1676,8 +1679,12 @@
     def utf8_w(self, w_obj):
         return w_obj.utf8_w(self)
 
+    def convert_to_w_unicode(self, w_obj):
+        return w_obj.convert_to_w_unicode(self)
+
     def unicode0_w(self, w_obj):
         "Like unicode_w, but rejects strings with NUL bytes."
+        xxxx
         from rpython.rlib import rstring
         result = w_obj.unicode_w(self)
         if u'\x00' in result:
@@ -1689,6 +1696,7 @@
     def realunicode_w(self, w_obj):
         # Like unicode_w, but only works if w_obj is really of type
         # 'unicode'.
+        xxx
         if not self.isinstance_w(w_obj, self.w_unicode):
             raise oefmt(self.w_TypeError, "argument must be a unicode")
         return self.unicode_w(w_obj)
diff --git a/pypy/interpreter/gateway.py b/pypy/interpreter/gateway.py
--- a/pypy/interpreter/gateway.py
+++ b/pypy/interpreter/gateway.py
@@ -157,6 +157,9 @@
     def visit_text(self, el, app_sig):
         self.checked_space_method(el, app_sig)
 
+    def visit_utf8(self, el, app_sig):
+        self.checked_space_method(el, app_sig)
+
     def visit_nonnegint(self, el, app_sig):
         self.checked_space_method(el, app_sig)
 
@@ -235,6 +238,7 @@
     def __init__(self):
         UnwrapSpecEmit.__init__(self)
         self.run_args = []
+        self.extracode = []
 
     def scopenext(self):
         return "scope_w[%d]" % self.succ()
@@ -292,6 +296,13 @@
     def visit_text(self, typ):
         self.run_args.append("space.text_w(%s)" % (self.scopenext(),))
 
+    def visit_utf8(self, typ):
+        name = 'w_uni%d' % self.n
+        self.extracode.append('%s = space.convert_to_w_unicode(%s)' %
+                              (name, self.scopenext()))
+        self.run_args.append("space.utf8_w(%s)" % (name,))
+        self.run_args.append("%s._length" % (name,))
+
     def visit_nonnegint(self, typ):
         self.run_args.append("space.gateway_nonnegint_w(%s)" % (
             self.scopenext(),))
@@ -340,8 +351,9 @@
             d = {}
             source = """if 1:
                 def _run(self, space, scope_w):
+                    %s
                     return self.behavior(%s)
-                \n""" % (', '.join(self.run_args),)
+                \n""" % ("\n".join(self.extracode), ', '.join(self.run_args))
             exec compile2(source) in self.miniglobals, d
 
             activation_cls = type("BuiltinActivation_UwS_%s" % label,
@@ -382,6 +394,7 @@
         UnwrapSpecEmit.__init__(self)
         self.args = []
         self.unwrap = []
+        self.extracode = []
         self.finger = 0
 
     def dispatch(self, el, *args):
@@ -448,6 +461,13 @@
     def visit_text(self, typ):
         self.unwrap.append("space.text_w(%s)" % (self.nextarg(),))
 
+    def visit_utf8(self, typ):
+        name = 'w_uni%d' % self.n
+        self.extracode.append('%s = space.convert_to_w_unicode(%s)' %
+                              (name, self.nextarg()))
+        self.unwrap.append("space.utf8_w(%s)" % (name,))
+        self.unwrap.append("%s._length" % (name,))
+
     def visit_nonnegint(self, typ):
         self.unwrap.append("space.gateway_nonnegint_w(%s)" % (self.nextarg(),))
 
@@ -472,6 +492,7 @@
     def visit_truncatedint_w(self, typ):
         self.unwrap.append("space.truncatedint_w(%s)" % (self.nextarg(),))
 
+    @staticmethod
     def make_fastfunc(unwrap_spec, func):
         unwrap_info = UnwrapSpec_FastFunc_Unwrap()
         unwrap_info.apply_over(unwrap_spec)
@@ -495,21 +516,21 @@
             unwrap_info.miniglobals['func'] = func
             source = """if 1:
                 def fastfunc_%s_%d(%s):
+                    %s
                     return func(%s)
                 \n""" % (func.__name__.replace('-', '_'), narg,
-                         ', '.join(args),
+                         ', '.join(args), '\n'.join(unwrap_info.extracode),
                          ', '.join(unwrap_info.unwrap))
             exec compile2(source) in unwrap_info.miniglobals, d
             fastfunc = d['fastfunc_%s_%d' % (func.__name__.replace('-', '_'), 
narg)]
         return narg, fastfunc
-    make_fastfunc = staticmethod(make_fastfunc)
 
 
 def int_unwrapping_space_method(typ):
-    assert typ in (int, str, float, unicode, r_longlong, r_uint, r_ulonglong, 
bool)
+    assert typ in (int, str, float, r_longlong, r_uint, r_ulonglong, bool)
     if typ is r_int is r_longlong:
         return 'gateway_r_longlong_w'
-    elif typ in (str, unicode, bool):
+    elif typ in (str, bool):
         return typ.__name__ + '_w'
     else:
         return 'gateway_' + typ.__name__ + '_w'
@@ -533,6 +554,13 @@
         return func
     return decorator
 
+def unwrap_count_len(spec):
+    lgt = len(spec)
+    for item in spec:
+        if item == 'utf8':
+            lgt += 1
+    return lgt
+
 class WrappedDefault(object):
     """ Can be used inside unwrap_spec as WrappedDefault(3) which means
     it'll be treated as W_Root, but fed with default which will be a wrapped
@@ -1004,7 +1032,7 @@
             code = self._code
             assert isinstance(code._unwrap_spec, (list, tuple))
             assert isinstance(code._argnames, list)
-            assert len(code._unwrap_spec) == len(code._argnames)
+            assert unwrap_count_len(code._unwrap_spec) == len(code._argnames)
             for i in range(len(code._unwrap_spec)-1, -1, -1):
                 spec = code._unwrap_spec[i]
                 argname = code._argnames[i]
diff --git a/pypy/interpreter/test/test_gateway.py b/pypy/interpreter/test/test_gateway.py
--- a/pypy/interpreter/test/test_gateway.py
+++ b/pypy/interpreter/test/test_gateway.py
@@ -535,21 +535,22 @@
         w_app_g3_r = space.wrap(app_g3_r)
         raises(gateway.OperationError,space.call_function,w_app_g3_r,w(1.0))
 
-    def test_interp2app_unwrap_spec_unicode(self):
+    def test_interp2app_unwrap_spec_utf8(self):
         space = self.space
         w = space.wrap
-        def g3_u(space, uni):
-            return space.wrap(len(uni))
+        def g3_u(space, utf8, utf8len):
+            return space.newtuple([space.wrap(len(utf8)), space.wrap(utf8len)])
         app_g3_u = gateway.interp2app_temp(g3_u,
                                          unwrap_spec=[gateway.ObjSpace,
-                                                      unicode])
+                                                      'utf8'])
         w_app_g3_u = space.wrap(app_g3_u)
+        encoded = u"gęść".encode('utf8')
         assert self.space.eq_w(
-            space.call_function(w_app_g3_u, w(u"foo")),
-            w(3))
+            space.call_function(w_app_g3_u, w(u"gęść")),
+            space.newtuple([w(len(encoded)), w(4)]))
         assert self.space.eq_w(
-            space.call_function(w_app_g3_u, w("baz")),
-            w(3))
+            space.call_function(w_app_g3_u, w("foo")),
+            space.newtuple([w(3), w(3)]))
         raises(gateway.OperationError, space.call_function, w_app_g3_u,
                w(None))
         raises(gateway.OperationError, space.call_function, w_app_g3_u,
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -373,14 +373,15 @@
 def make_encoder_wrapper(name):
     rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
     assert hasattr(runicode, rname)
-    @unwrap_spec(uni=unicode, errors='str_or_None')
-    def wrap_encoder(space, uni, errors="strict"):
+    @unwrap_spec(uni='utf8', errors='str_or_None')
+    def wrap_encoder(space, utf8, utf8len, errors="strict"):
         if errors is None:
             errors = 'strict'
         state = space.fromcache(CodecState)
         func = getattr(runicode, rname)
-        result = func(uni, len(uni), errors, state.encode_error_handler)
-        return space.newtuple([space.newbytes(result), space.newint(len(uni))])
+        result = func(utf8, len(utf8), utf8len,
+            errors, state.encode_error_handler)
+        return space.newtuple([space.newbytes(result), space.newint(utf8len)])
     wrap_encoder.func_name = rname
     globals()[name] = wrap_encoder
 
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -1,6 +1,6 @@
 """The builtin str implementation"""
 
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
 from rpython.rlib.objectmodel import (
     compute_hash, compute_unique_id, import_from_mixin)
 from rpython.rlib.buffer import StringBuffer
@@ -52,9 +52,17 @@
             uid = (base << IDTAG_SHIFT) | IDTAG_SPECIAL
         return space.newint(uid)
 
-    def unicode_w(self, space):
+    def convert_to_w_unicode(self, space):
         # Use the default encoding.
         encoding = getdefaultencoding(space)
+        if encoding == 'ascii':
+            try:
+                rutf8.check_ascii(self._value)
+                return space.newutf8(self._value, len(self._value))
+            except rutf8.AsciiCheckError:
+                xxx
+        else:
+            xxx
         return space.unicode_w(decode_object(space, self, encoding, None))
 
     def descr_add(self, space, w_other):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -136,16 +136,19 @@
 
     @staticmethod
     def _op_val(space, w_other, strict=None):
-        return W_UnicodeObject._convert_to_unicode(space, 
w_other)._utf8.decode('utf8')
+        return W_UnicodeObject.convert_arg_to_w_unicode(space, 
w_other)._utf8.decode('utf8')
 
     @staticmethod
-    def _convert_to_unicode(space, w_other):
+    def convert_arg_to_w_unicode(space, w_other):
         if isinstance(w_other, W_UnicodeObject):
             return w_other
         if space.isinstance_w(w_other, space.w_bytes):
             return unicode_from_string(space, w_other)
         return unicode_from_encoded_object(space, w_other, None, "strict")
 
+    def convert_to_w_unicode(self, space):
+        return self
+
     def _chr(self, char):
         assert len(char) == 1
         return char[0]
@@ -249,7 +252,7 @@
 
     def descr_eq(self, space, w_other):
         try:
-            res = self._utf8 == self._convert_to_unicode(space, w_other)._utf8
+            res = self._utf8 == self.convert_arg_to_w_unicode(space, 
w_other)._utf8
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -265,7 +268,7 @@
 
     def descr_ne(self, space, w_other):
         try:
-            res = self._utf8 != self._convert_to_unicode(space, w_other)._utf8
+            res = self._utf8 != self.convert_arg_to_w_unicode(space, 
w_other)._utf8
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -281,7 +284,7 @@
 
     def descr_lt(self, space, w_other):
         try:
-            res = self._utf8 < self._convert_to_unicode(space, w_other)._utf8
+            res = self._utf8 < self.convert_arg_to_w_unicode(space, 
w_other)._utf8
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -290,7 +293,7 @@
 
     def descr_le(self, space, w_other):
         try:
-            res = self._utf8 <= self._convert_to_unicode(space, w_other)._utf8
+            res = self._utf8 <= self.convert_arg_to_w_unicode(space, 
w_other)._utf8
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -299,7 +302,7 @@
 
     def descr_gt(self, space, w_other):
         try:
-            res = self._utf8 > self._convert_to_unicode(space, w_other)._utf8
+            res = self._utf8 > self.convert_arg_to_w_unicode(space, 
w_other)._utf8
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -308,7 +311,7 @@
 
     def descr_ge(self, space, w_other):
         try:
-            res = self._utf8 >= self._convert_to_unicode(space, w_other)._utf8
+            res = self._utf8 >= self.convert_arg_to_w_unicode(space, 
w_other)._utf8
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -431,7 +434,7 @@
 
     def descr_add(self, space, w_other):
         try:
-            w_other = self._convert_to_unicode(space, w_other)
+            w_other = self.convert_arg_to_w_unicode(space, w_other)
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -458,7 +461,7 @@
                 return self._join_autoconvert(space, list_w)
             # XXX Maybe the extra copy here is okay? It was basically going to
             #     happen anyway, what with being placed into the builder
-            w_u = self._convert_to_unicode(space, w_s)
+            w_u = self.convert_arg_to_w_unicode(space, w_s)
             unwrapped.append(w_u._utf8)
             lgt += w_u._length
             prealloc_size += len(unwrapped[i])
@@ -525,7 +528,7 @@
             res = split(value, maxsplit=maxsplit)
             return space.newlist([W_UnicodeObject(s, -1) for s in res])
 
-        by = self._convert_to_unicode(space, w_sep)._utf8
+        by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
         if len(by) == 0:
             raise oefmt(space.w_ValueError, "empty separator")
         res = split(value, by, maxsplit)
@@ -540,7 +543,7 @@
             res = rsplit(value, maxsplit=maxsplit)
             return space.newlist([W_UnicodeObject(s, -1) for s in res])
 
-        by = self._convert_to_unicode(space, w_sep)._utf8
+        by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
         if len(by) == 0:
             raise oefmt(space.w_ValueError, "empty separator")
         res = rsplit(value, by, maxsplit)
@@ -550,7 +553,7 @@
     @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
     def descr_center(self, space, width, w_fillchar):
         value = self._utf8
-        fillchar = self._convert_to_unicode(space, w_fillchar)._utf8
+        fillchar = self.convert_arg_to_w_unicode(space, w_fillchar)._utf8
         if len(fillchar) != 1:
             raise oefmt(space.w_TypeError,
                         "center() argument 2 must be a single character")
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to