Author: Antonio Cuni <[email protected]>
Branch: utf8-unicode
Changeset: r68759:eb1500901ddf
Date: 2014-01-17 22:54 +0100
http://bitbucket.org/pypy/pypy/changeset/eb1500901ddf/
Log: break the world, and implement W_UnicodeObject as utf8 rpython
strings
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -199,7 +199,7 @@
def str_w(self, space):
self._typed_unwrap_error(space, "string")
- def unicode_w(self, space):
+ def utf8_w(self, space):
self._typed_unwrap_error(space, "unicode")
def int_w(self, space):
@@ -1376,11 +1376,11 @@
self.wrap('argument must be a string'))
return self.str_w(w_obj)
- def unicode_w(self, w_obj):
- return w_obj.unicode_w(self)
+ def utf8_w(self, w_obj):
+ return w_obj.utf8_w(self)
- def unicode0_w(self, w_obj):
- "Like unicode_w, but rejects strings with NUL bytes."
+ def utf8_0_w(self, w_obj):
+ "Like utf8_w, but rejects strings with NUL bytes."
from rpython.rlib import rstring
result = w_obj.unicode_w(self)
if u'\x00' in result:
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -61,3 +61,20 @@
uni, len(uni), "strict",
errorhandler=encode_error_handler(space),
allow_surrogates=True)
+
+def ensure_ascii(space, s, errors='strict'):
+ # ASCII is equivalent to the first 128 ordinals in Unicode.
+ eh = decode_error_handler(space)
+ pos = 0
+ size = len(s)
+ while pos < size:
+ c = s[pos]
+ if ord(c) >= 128:
+ r, pos = eh(errors, "ascii", "ordinal not in range(128)",
+ s, pos, pos + 1)
+ pos += 1
+ return s
+
+def ensure_utf8(space, s, errors='strict'):
+ # XXXY implement me!
+ return s
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -658,8 +658,8 @@
if space.isinstance_w(w_sub, space.w_unicode):
from pypy.objspace.std.unicodeobject import W_UnicodeObject
assert isinstance(w_sub, W_UnicodeObject)
- self_as_unicode = unicode_from_encoded_object(space, self, None, None)
- return space.newbool(self_as_unicode._value.find(w_sub._value) >= 0)
+ self_as_utf8 = unicode_from_encoded_object(space, self, None, None)
+ return space.newbool(self_as_utf8._utf8val.find(w_sub._utf8val) >= 0)
return self._StringMethods_descr_contains(space, w_sub)
_StringMethods_descr_replace = descr_replace
diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py
--- a/pypy/objspace/std/listobject.py
+++ b/pypy/objspace/std/listobject.py
@@ -1633,10 +1633,10 @@
_applevel_repr = "unicode"
def wrap(self, stringval):
- return self.space.wrap(stringval)
+ return self.space.wrap_utf8(stringval)
def unwrap(self, w_string):
- return self.space.unicode_w(w_string)
+ return self.space.utf8_w(w_string)
erase, unerase = rerased.new_erasing_pair("unicode")
erase = staticmethod(erase)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -158,7 +158,8 @@
if isinstance(x, str):
return wrapstr(self, x)
if isinstance(x, unicode):
- return wrapunicode(self, x)
+ # we might want to kill support for wrap(u'...') eventually
+ return wrapunicode(self, x.encode('utf-8'))
if isinstance(x, float):
return W_FloatObject(x)
if isinstance(x, W_Root):
@@ -181,6 +182,14 @@
return self._wrap_not_rpython(x)
wrap._annspecialcase_ = "specialize:wrap"
+ def wrap_utf8(self, utf8val):
+ """
+ Take a utf8-encoded RPython string and return a unicode applevel object
+ """
+ # the constructor of W_UnicodeObject checks that it's valid UTF8
+ return wrapunicode(self, utf8val)
+
def _wrap_not_rpython(self, x):
"NOT_RPYTHON"
# _____ this code is here to support testing only _____
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -11,7 +11,7 @@
from pypy.objspace.std.stdtypedef import StdTypeDef
from pypy.objspace.std.stringmethods import StringMethods
from rpython.rlib.objectmodel import compute_hash, compute_unique_id, import_from_mixin
-from rpython.rlib.rstring import UnicodeBuilder
+from rpython.rlib.rstring import StringBuilder
from rpython.rlib.runicode import (str_decode_utf_8, str_decode_ascii,
unicode_encode_utf_8, unicode_encode_ascii, make_unicode_escape_function)
@@ -22,24 +22,26 @@
class W_UnicodeObject(W_Root):
import_from_mixin(StringMethods)
- _immutable_fields_ = ['_value']
+ _immutable_fields_ = ['_utf8val']
- def __init__(w_self, unistr):
- assert isinstance(unistr, unicode)
- w_self._value = unistr
+ def __init__(w_self, utf8val):
+ assert isinstance(utf8val, str)
+ w_self._utf8val = utf8val
+ # XXXY: we want a more efficient way to compute this
+ w_self._length = len(utf8val.decode('utf-8'))
def __repr__(w_self):
""" representation for debugging purposes """
- return "%s(%r)" % (w_self.__class__.__name__, w_self._value)
+ return "%s(%r)" % (w_self.__class__.__name__, w_self._utf8val.decode('utf8'))
def unwrap(w_self, space):
# for testing
- return w_self._value
+ return w_self._utf8val.decode('utf-8')
def create_if_subclassed(w_self):
if type(w_self) is W_UnicodeObject:
return w_self
- return W_UnicodeObject(w_self._value)
+ return W_UnicodeObject(w_self._utf8val)
def is_w(self, space, w_other):
if not isinstance(w_other, W_UnicodeObject):
@@ -48,55 +50,58 @@
return True
if self.user_overridden_class or w_other.user_overridden_class:
return False
- return space.unicode_w(self) is space.unicode_w(w_other)
+ return space.utf8_w(self) is space.utf8_w(w_other)
def immutable_unique_id(self, space):
if self.user_overridden_class:
return None
- return space.wrap(compute_unique_id(space.unicode_w(self)))
+ return space.wrap(compute_unique_id(space.utf8_w(self)))
def str_w(self, space):
return space.str_w(space.str(self))
- def unicode_w(self, space):
- return self._value
+ def utf8_w(self, space):
+ return self._utf8val
def listview_unicode(w_self):
- return _create_list_from_unicode(w_self._value)
+ return _create_list_from_unicode(w_self._utf8val)
def ord(self, space):
- if len(self._value) != 1:
+ if self._len() != 1:
msg = "ord() expected a character, but string of length %d found"
- raise operationerrfmt(space.w_TypeError, msg, len(self._value))
+ raise operationerrfmt(space.w_TypeError, msg, self._len())
+ XXX
return space.wrap(ord(self._value[0]))
- def _new(self, value):
- return W_UnicodeObject(value)
+ def _new(self, utf8val):
+ assert isinstance(utf8val, str)
+ return W_UnicodeObject(utf8val)
def _new_from_list(self, value):
- return W_UnicodeObject(u''.join(value))
+ # value is a RPython list of utf8-encoded strings
+ return W_UnicodeObject(''.join(value))
def _empty(self):
return W_UnicodeObject.EMPTY
def _len(self):
- return len(self._value)
+ return self._length
def _val(self, space):
- return self._value
+ return self._utf8val
def _op_val(self, space, w_other):
if isinstance(w_other, W_UnicodeObject):
- return w_other._value
+ return w_other._utf8val
if space.isinstance_w(w_other, space.w_str):
- return unicode_from_string(space, w_other)._value
- return unicode_from_encoded_object(space, w_other, None, "strict")._value
+ return unicode_from_string(space, w_other)._utf8val
+ return unicode_from_encoded_object(space, w_other, None, "strict")._utf8val
def _chr(self, char):
assert len(char) == 1
return unicode(char)[0]
- _builder = UnicodeBuilder
+ _builder = StringBuilder
def _isupper(self, ch):
return unicodedb.isupper(ord(ch))
@@ -189,7 +194,7 @@
return encode_object(space, self, None, None)
def descr_hash(self, space):
- x = compute_hash(self._value)
+ x = compute_hash(self._utf8val)
return space.wrap(x)
def descr_eq(self, space, w_other):
@@ -350,8 +355,9 @@
return space.newbool(cased)
-def wrapunicode(space, uni):
- return W_UnicodeObject(uni)
+def wrapunicode(space, utf8val):
+ # XXXY: we should check that it's valid UTF8
+ return W_UnicodeObject(utf8val)
def plain_str2unicode(space, s):
try:
@@ -426,17 +432,17 @@
encoding = getdefaultencoding(space)
if errors is None or errors == 'strict':
if encoding == 'ascii':
- # XXX error handling
s = space.bufferstr_w(w_obj)
- eh = unicodehelper.decode_error_handler(space)
- return space.wrap(str_decode_ascii(
- s, len(s), None, final=True, errorhandler=eh)[0])
+ s = unicodehelper.ensure_ascii(space, s)
+ return space.wrap_utf8(s)
if encoding == 'utf-8':
s = space.bufferstr_w(w_obj)
- eh = unicodehelper.decode_error_handler(space)
- return space.wrap(str_decode_utf_8(
- s, len(s), None, final=True, errorhandler=eh,
- allow_surrogates=True)[0])
+ s = unicodehelper.ensure_utf8(space, s)
+ return space.wrap_utf8(s)
+ ## eh = unicodehelper.decode_error_handler(space)
+ ## return space.wrap(str_decode_utf_8(
+ ## s, len(s), None, final=True, errorhandler=eh,
+ ## allow_surrogates=True)[0])
w_codecs = space.getbuiltinmodule("_codecs")
w_decode = space.getattr(w_codecs, space.wrap("decode"))
if errors is None:
@@ -489,11 +495,8 @@
if encoding != 'ascii':
return unicode_from_encoded_object(space, w_str, encoding, "strict")
s = space.str_w(w_str)
- try:
- return W_UnicodeObject(s.decode("ascii"))
- except UnicodeDecodeError:
- # raising UnicodeDecodeError is messy, "please crash for me"
- return unicode_from_encoded_object(space, w_str, "ascii", "strict")
+ s = unicodehelper.ensure_ascii(space, s)
+ return W_UnicodeObject(s)
class UnicodeDocstrings:
@@ -1034,7 +1037,7 @@
return [s for s in value]
-W_UnicodeObject.EMPTY = W_UnicodeObject(u'')
+W_UnicodeObject.EMPTY = W_UnicodeObject('')
# Helper for converting int/long
def unicode_to_decimal_w(space, w_unistr):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit