Author: Antonio Cuni <anto.c...@gmail.com> Branch: utf8-unicode Changeset: r68759:eb1500901ddf Date: 2014-01-17 22:54 +0100 http://bitbucket.org/pypy/pypy/changeset/eb1500901ddf/
Log: break the world, and implement W_UnicodeObject as utf8 rpython strings diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py --- a/pypy/interpreter/baseobjspace.py +++ b/pypy/interpreter/baseobjspace.py @@ -199,7 +199,7 @@ def str_w(self, space): self._typed_unwrap_error(space, "string") - def unicode_w(self, space): + def utf8_w(self, space): self._typed_unwrap_error(space, "unicode") def int_w(self, space): @@ -1376,11 +1376,11 @@ self.wrap('argument must be a string')) return self.str_w(w_obj) - def unicode_w(self, w_obj): - return w_obj.unicode_w(self) + def utf8_w(self, w_obj): + return w_obj.utf8_w(self) - def unicode0_w(self, w_obj): - "Like unicode_w, but rejects strings with NUL bytes." + def utf8_0_w(self, w_obj): + "Like utf8_w, but rejects strings with NUL bytes." from rpython.rlib import rstring result = w_obj.unicode_w(self) if u'\x00' in result: diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -61,3 +61,20 @@ uni, len(uni), "strict", errorhandler=encode_error_handler(space), allow_surrogates=True) + +def ensure_ascii(space, s, errors='strict'): + # ASCII is equivalent to the first 128 ordinals in Unicode. + eh = decode_error_handler(space) + pos = 0 + size = len(s) + while pos < size: + c = s[pos] + if ord(c) >= 128: + r, pos = eh(errors, "ascii", "ordinal not in range(128)", + s, pos, pos + 1) + pos += 1 + return s + +def ensure_utf8(space, s, errors='strict'): + # XXXY implement me! + return s diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py --- a/pypy/objspace/std/bytesobject.py +++ b/pypy/objspace/std/bytesobject.py @@ -658,8 +658,8 @@ if space.isinstance_w(w_sub, space.w_unicode): from pypy.objspace.std.unicodeobject import W_UnicodeObject assert isinstance(w_sub, W_UnicodeObject) - self_as_unicode = unicode_from_encoded_object(space, self, None, None) - return space.newbool(self_as_unicode._value.find(w_sub._value) >= 0) + self_as_utf8 = unicode_from_encoded_object(space, self, None, None) + return space.newbool(self_as_utf8._utf8val.find(w_sub._utf8val) >= 0) return self._StringMethods_descr_contains(space, w_sub) _StringMethods_descr_replace = descr_replace diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py --- a/pypy/objspace/std/listobject.py +++ b/pypy/objspace/std/listobject.py @@ -1633,10 +1633,10 @@ _applevel_repr = "unicode" def wrap(self, stringval): - return self.space.wrap(stringval) + return self.space.wrap_utf8(stringval) def unwrap(self, w_string): - return self.space.unicode_w(w_string) + return self.space.utf8_w(w_string) erase, unerase = rerased.new_erasing_pair("unicode") erase = staticmethod(erase) diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -158,7 +158,8 @@ if isinstance(x, str): return wrapstr(self, x) if isinstance(x, unicode): - return wrapunicode(self, x) + # we might want to kill support for wrap(u'...') eventually + return wrapunicode(self, x.encode('utf-8')) if isinstance(x, float): return W_FloatObject(x) if isinstance(x, W_Root): @@ -181,6 +182,14 @@ return self._wrap_not_rpython(x) wrap._annspecialcase_ = "specialize:wrap" + def wrap_utf8(self, utf8val): + """ + Take an utf8-encoded RPython string an return an unicode applevel + object + """ + # the constructor of W_UnicodeObject checks that it's valid UTF8 + return wrapunicode(self, utf8val) + def _wrap_not_rpython(self, x): "NOT_RPYTHON" # _____ this code is here to support testing only _____ diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -11,7 +11,7 @@ from pypy.objspace.std.stdtypedef import StdTypeDef from pypy.objspace.std.stringmethods import StringMethods from rpython.rlib.objectmodel import compute_hash, compute_unique_id, import_from_mixin -from rpython.rlib.rstring import UnicodeBuilder +from rpython.rlib.rstring import StringBuilder from rpython.rlib.runicode import (str_decode_utf_8, str_decode_ascii, unicode_encode_utf_8, unicode_encode_ascii, make_unicode_escape_function) @@ -22,24 +22,26 @@ class W_UnicodeObject(W_Root): import_from_mixin(StringMethods) - _immutable_fields_ = ['_value'] + _immutable_fields_ = ['_utf8val'] - def __init__(w_self, unistr): - assert isinstance(unistr, unicode) - w_self._value = unistr + def __init__(w_self, utf8val): + assert isinstance(utf8val, str) + w_self._utf8val = utf8val + # XXXY: we want a more efficient way to compute this + w_self._length = len(utf8val.decode('utf-8')) def __repr__(w_self): """ representation for debugging purposes """ - return "%s(%r)" % (w_self.__class__.__name__, w_self._value) + return "%s(%r)" % (w_self.__class__.__name__, w_self._utf8val.decode('utf8')) def unwrap(w_self, space): # for testing - return w_self._value + return w_self._utf8val.decode('utf-8') def create_if_subclassed(w_self): if type(w_self) is W_UnicodeObject: return w_self - return W_UnicodeObject(w_self._value) + return W_UnicodeObject(w_self._utf8val) def is_w(self, space, w_other): if not isinstance(w_other, W_UnicodeObject): @@ -48,55 +50,58 @@ return True if self.user_overridden_class or w_other.user_overridden_class: return False - return space.unicode_w(self) is space.unicode_w(w_other) + return space.utf8_w(self) is space.utf8_w(w_other) def immutable_unique_id(self, space): if self.user_overridden_class: return None - return space.wrap(compute_unique_id(space.unicode_w(self))) + return space.wrap(compute_unique_id(space.utf8_w(self))) def str_w(self, space): return space.str_w(space.str(self)) - def unicode_w(self, space): - return self._value + def utf8_w(self, space): + return self._utf8val def listview_unicode(w_self): - return _create_list_from_unicode(w_self._value) + return _create_list_from_unicode(w_self._utf8val) def ord(self, space): - if len(self._value) != 1: + if self._len() != 1: msg = "ord() expected a character, but string of length %d found" - raise operationerrfmt(space.w_TypeError, msg, len(self._value)) + raise operationerrfmt(space.w_TypeError, msg, self._len()) + XXX return space.wrap(ord(self._value[0])) - def _new(self, value): - return W_UnicodeObject(value) + def _new(self, utf8val): + assert isinstance(utf8val, str) + return W_UnicodeObject(utf8val) def _new_from_list(self, value): - return W_UnicodeObject(u''.join(value)) + # value is a RPython list of utf8-encoded strings + return W_UnicodeObject(''.join(value)) def _empty(self): return W_UnicodeObject.EMPTY def _len(self): - return len(self._value) + return self._length def _val(self, space): - return self._value + return self._utf8val def _op_val(self, space, w_other): if isinstance(w_other, W_UnicodeObject): - return w_other._value + return w_other._utf8val if space.isinstance_w(w_other, space.w_str): - return unicode_from_string(space, w_other)._value - return unicode_from_encoded_object(space, w_other, None, "strict")._value + return unicode_from_string(space, w_other)._utf8val + return unicode_from_encoded_object(space, w_other, None, "strict")._utf8val def _chr(self, char): assert len(char) == 1 return unicode(char)[0] - _builder = UnicodeBuilder + _builder = StringBuilder def _isupper(self, ch): return unicodedb.isupper(ord(ch)) @@ -189,7 +194,7 @@ return encode_object(space, self, None, None) def descr_hash(self, space): - x = compute_hash(self._value) + x = compute_hash(self._utf8val) return space.wrap(x) def descr_eq(self, space, w_other): @@ -350,8 +355,9 @@ return space.newbool(cased) -def wrapunicode(space, uni): - return W_UnicodeObject(uni) +def wrapunicode(space, utf8val): + # XXXY: we should check that it's valid UTF8 + return W_UnicodeObject(utf8val) def plain_str2unicode(space, s): try: @@ -426,17 +432,17 @@ encoding = getdefaultencoding(space) if errors is None or errors == 'strict': if encoding == 'ascii': - # XXX error handling s = space.bufferstr_w(w_obj) - eh = unicodehelper.decode_error_handler(space) - return space.wrap(str_decode_ascii( - s, len(s), None, final=True, errorhandler=eh)[0]) + s = unicodehelper.ensure_ascii(space, s) + return space.wrap_utf8(s) if encoding == 'utf-8': s = space.bufferstr_w(w_obj) - eh = unicodehelper.decode_error_handler(space) - return space.wrap(str_decode_utf_8( - s, len(s), None, final=True, errorhandler=eh, - allow_surrogates=True)[0]) + s = unicodehelper.ensure_utf8(space, s) + return space.wrap_utf8(s) + ## eh = unicodehelper.decode_error_handler(space) + ## return space.wrap(str_decode_utf_8( + ## s, len(s), None, final=True, errorhandler=eh, + ## allow_surrogates=True)[0]) w_codecs = space.getbuiltinmodule("_codecs") w_decode = space.getattr(w_codecs, space.wrap("decode")) if errors is None: @@ -489,11 +495,8 @@ if encoding != 'ascii': return unicode_from_encoded_object(space, w_str, encoding, "strict") s = space.str_w(w_str) - try: - return W_UnicodeObject(s.decode("ascii")) - except UnicodeDecodeError: - # raising UnicodeDecodeError is messy, "please crash for me" - return unicode_from_encoded_object(space, w_str, "ascii", "strict") + s = unicodehelper.ensure_ascii(space, s) + return W_UnicodeObject(s) class UnicodeDocstrings: @@ -1034,7 +1037,7 @@ return [s for s in value] -W_UnicodeObject.EMPTY = W_UnicodeObject(u'') +W_UnicodeObject.EMPTY = W_UnicodeObject('') # Helper for converting int/long def unicode_to_decimal_w(space, w_unistr): _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit