[pypy-commit] pypy utf8-unicode: break the world, and implement W_UnicodeObject as utf8 rpython strings

antocuni Sat, 18 Jan 2014 02:14:57 -0800

Author: Antonio Cuni <anto.c...@gmail.com>
Branch: utf8-unicode
Changeset: r68759:eb1500901ddf
Date: 2014-01-17 22:54 +0100
http://bitbucket.org/pypy/pypy/changeset/eb1500901ddf/


Log:    break the world, and implement W_UnicodeObject as utf8 rpython
        strings

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -199,7 +199,7 @@
     def str_w(self, space):
         self._typed_unwrap_error(space, "string")
 
-    def unicode_w(self, space):
+    def utf8_w(self, space):
         self._typed_unwrap_error(space, "unicode")
 
     def int_w(self, space):
@@ -1376,11 +1376,11 @@
                                  self.wrap('argument must be a string'))
         return self.str_w(w_obj)
 
-    def unicode_w(self, w_obj):
-        return w_obj.unicode_w(self)
+    def utf8_w(self, w_obj):
+        return w_obj.utf8_w(self)
 
-    def unicode0_w(self, w_obj):
-        "Like unicode_w, but rejects strings with NUL bytes."
+    def utf8_0_w(self, w_obj):
+        "Like utf8_w, but rejects strings with NUL bytes."
         from rpython.rlib import rstring
         result = w_obj.unicode_w(self)
         if u'\x00' in result:
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -61,3 +61,20 @@
         uni, len(uni), "strict",
         errorhandler=encode_error_handler(space),
         allow_surrogates=True)
+
+def ensure_ascii(space, s, errors='strict'):
+    # ASCII is equivalent to the first 128 ordinals in Unicode.
+    eh = decode_error_handler(space)
+    pos = 0
+    size = len(s)
+    while pos < size:
+        c = s[pos]
+        if ord(c) >= 128:
+            r, pos = eh(errors, "ascii", "ordinal not in range(128)",
+                        s,  pos, pos + 1)
+        pos += 1
+    return s
+
+def ensure_utf8(space, s, errors='strict'):
+    # XXXY implement me!
+    return s
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -658,8 +658,8 @@
         if space.isinstance_w(w_sub, space.w_unicode):
             from pypy.objspace.std.unicodeobject import W_UnicodeObject
             assert isinstance(w_sub, W_UnicodeObject)
-            self_as_unicode = unicode_from_encoded_object(space, self, None, None)
-            return space.newbool(self_as_unicode._value.find(w_sub._value) >= 0)
+            self_as_utf8 = unicode_from_encoded_object(space, self, None, None)
+            return space.newbool(self_as_utf8._utf8val.find(w_sub._utf8val) >= 0)
         return self._StringMethods_descr_contains(space, w_sub)
 
     _StringMethods_descr_replace = descr_replace
diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py
--- a/pypy/objspace/std/listobject.py
+++ b/pypy/objspace/std/listobject.py
@@ -1633,10 +1633,10 @@
     _applevel_repr = "unicode"
 
     def wrap(self, stringval):
-        return self.space.wrap(stringval)
+        return self.space.wrap_utf8(stringval)
 
     def unwrap(self, w_string):
-        return self.space.unicode_w(w_string)
+        return self.space.utf8_w(w_string)
 
     erase, unerase = rerased.new_erasing_pair("unicode")
     erase = staticmethod(erase)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -158,7 +158,8 @@
         if isinstance(x, str):
             return wrapstr(self, x)
         if isinstance(x, unicode):
-            return wrapunicode(self, x)
+            # we might want to kill support for wrap(u'...') eventually
+            return wrapunicode(self, x.encode('utf-8'))
         if isinstance(x, float):
             return W_FloatObject(x)
         if isinstance(x, W_Root):
@@ -181,6 +182,14 @@
         return self._wrap_not_rpython(x)
     wrap._annspecialcase_ = "specialize:wrap"
 
+    def wrap_utf8(self, utf8val):
+        """
+        Take an utf8-encoded RPython string an return an unicode applevel
+        object
+        """
+        # the constructor of W_UnicodeObject checks that it's valid UTF8
+        return wrapunicode(self, utf8val)
+
     def _wrap_not_rpython(self, x):
         "NOT_RPYTHON"
         # _____ this code is here to support testing only _____
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -11,7 +11,7 @@
 from pypy.objspace.std.stdtypedef import StdTypeDef
 from pypy.objspace.std.stringmethods import StringMethods
 from rpython.rlib.objectmodel import compute_hash, compute_unique_id, import_from_mixin
-from rpython.rlib.rstring import UnicodeBuilder
+from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.runicode import (str_decode_utf_8, str_decode_ascii,
     unicode_encode_utf_8, unicode_encode_ascii, make_unicode_escape_function)
 
@@ -22,24 +22,26 @@
 
 class W_UnicodeObject(W_Root):
     import_from_mixin(StringMethods)
-    _immutable_fields_ = ['_value']
+    _immutable_fields_ = ['_utf8val']
 
-    def __init__(w_self, unistr):
-        assert isinstance(unistr, unicode)
-        w_self._value = unistr
+    def __init__(w_self, utf8val):
+        assert isinstance(utf8val, str)
+        w_self._utf8val = utf8val
+        # XXXY: we want a more efficient way to compute this
+        w_self._length = len(utf8val.decode('utf-8'))
 
     def __repr__(w_self):
         """ representation for debugging purposes """
-        return "%s(%r)" % (w_self.__class__.__name__, w_self._value)
+        return "%s(%r)" % (w_self.__class__.__name__, w_self._utf8val.decode('utf8'))
 
     def unwrap(w_self, space):
         # for testing
-        return w_self._value
+        return w_self._utf8val.decode('utf-8')
 
     def create_if_subclassed(w_self):
         if type(w_self) is W_UnicodeObject:
             return w_self
-        return W_UnicodeObject(w_self._value)
+        return W_UnicodeObject(w_self._utf8val)
 
     def is_w(self, space, w_other):
         if not isinstance(w_other, W_UnicodeObject):
@@ -48,55 +50,58 @@
             return True
         if self.user_overridden_class or w_other.user_overridden_class:
             return False
-        return space.unicode_w(self) is space.unicode_w(w_other)
+        return space.utf8_w(self) is space.utf8_w(w_other)
 
     def immutable_unique_id(self, space):
         if self.user_overridden_class:
             return None
-        return space.wrap(compute_unique_id(space.unicode_w(self)))
+        return space.wrap(compute_unique_id(space.utf8_w(self)))
 
     def str_w(self, space):
         return space.str_w(space.str(self))
 
-    def unicode_w(self, space):
-        return self._value
+    def utf8_w(self, space):
+        return self._utf8val
 
     def listview_unicode(w_self):
-        return _create_list_from_unicode(w_self._value)
+        return _create_list_from_unicode(w_self._utf8val)
 
     def ord(self, space):
-        if len(self._value) != 1:
+        if self._len() != 1:
             msg = "ord() expected a character, but string of length %d found"
-            raise operationerrfmt(space.w_TypeError, msg, len(self._value))
+            raise operationerrfmt(space.w_TypeError, msg, self._len())
+        XXX
         return space.wrap(ord(self._value[0]))
 
-    def _new(self, value):
-        return W_UnicodeObject(value)
+    def _new(self, utf8val):
+        assert isinstance(utf8val, str)
+        return W_UnicodeObject(utf8val)
 
     def _new_from_list(self, value):
-        return W_UnicodeObject(u''.join(value))
+        # value is a RPython list of utf8-encoded strings
+        return W_UnicodeObject(''.join(value))
 
     def _empty(self):
         return W_UnicodeObject.EMPTY
 
     def _len(self):
-        return len(self._value)
+        return self._length
 
     def _val(self, space):
-        return self._value
+        return self._utf8val
 
     def _op_val(self, space, w_other):
         if isinstance(w_other, W_UnicodeObject):
-            return w_other._value
+            return w_other._utf8val
         if space.isinstance_w(w_other, space.w_str):
-            return unicode_from_string(space, w_other)._value
-        return unicode_from_encoded_object(space, w_other, None, "strict")._value
+            return unicode_from_string(space, w_other)._utf8val
+        return unicode_from_encoded_object(space, w_other, None, "strict")._utf8val
 
     def _chr(self, char):
         assert len(char) == 1
         return unicode(char)[0]
 
-    _builder = UnicodeBuilder
+    _builder = StringBuilder
 
     def _isupper(self, ch):
         return unicodedb.isupper(ord(ch))
@@ -189,7 +194,7 @@
         return encode_object(space, self, None, None)
 
     def descr_hash(self, space):
-        x = compute_hash(self._value)
+        x = compute_hash(self._utf8val)
         return space.wrap(x)
 
     def descr_eq(self, space, w_other):
@@ -350,8 +355,9 @@
         return space.newbool(cased)
 
 
-def wrapunicode(space, uni):
-    return W_UnicodeObject(uni)
+def wrapunicode(space, utf8val):
+    # XXXY: we should check that it's valid UTF8
+    return W_UnicodeObject(utf8val)
 
 def plain_str2unicode(space, s):
     try:
@@ -426,17 +432,17 @@
         encoding = getdefaultencoding(space)
     if errors is None or errors == 'strict':
         if encoding == 'ascii':
-            # XXX error handling
             s = space.bufferstr_w(w_obj)
-            eh = unicodehelper.decode_error_handler(space)
-            return space.wrap(str_decode_ascii(
-                    s, len(s), None, final=True, errorhandler=eh)[0])
+            s = unicodehelper.ensure_ascii(space, s)
+            return space.wrap_utf8(s)
         if encoding == 'utf-8':
             s = space.bufferstr_w(w_obj)
-            eh = unicodehelper.decode_error_handler(space)
-            return space.wrap(str_decode_utf_8(
-                    s, len(s), None, final=True, errorhandler=eh,
-                    allow_surrogates=True)[0])
+            s = unicodehelper.ensure_utf8(space, s)
+            return space.wrap_utf8(s)
+            ## eh = unicodehelper.decode_error_handler(space)
+            ## return space.wrap(str_decode_utf_8(
+            ##         s, len(s), None, final=True, errorhandler=eh,
+            ##         allow_surrogates=True)[0])
     w_codecs = space.getbuiltinmodule("_codecs")
     w_decode = space.getattr(w_codecs, space.wrap("decode"))
     if errors is None:
@@ -489,11 +495,8 @@
     if encoding != 'ascii':
         return unicode_from_encoded_object(space, w_str, encoding, "strict")
     s = space.str_w(w_str)
-    try:
-        return W_UnicodeObject(s.decode("ascii"))
-    except UnicodeDecodeError:
-        # raising UnicodeDecodeError is messy, "please crash for me"
-        return unicode_from_encoded_object(space, w_str, "ascii", "strict")
+    s = unicodehelper.ensure_ascii(space, s)
+    return W_UnicodeObject(s)
 
 
 class UnicodeDocstrings:
@@ -1034,7 +1037,7 @@
     return [s for s in value]
 
 
-W_UnicodeObject.EMPTY = W_UnicodeObject(u'')
+W_UnicodeObject.EMPTY = W_UnicodeObject('')
 
 # Helper for converting int/long
 def unicode_to_decimal_w(space, w_unistr):
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy utf8-unicode: break the world, and implement W_UnicodeObject as utf8 rpython strings

Reply via email to