Author: Philip Jenvey <pjen...@underboss.org> Branch: py3k-refactor-str-types Changeset: r68869:9d4908e6605a Date: 2014-01-23 11:07 -0800 http://bitbucket.org/pypy/pypy/changeset/9d4908e6605a/
Log: reintegrate our unicode changes diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py --- a/pypy/objspace/std/bytesobject.py +++ b/pypy/objspace/std/bytesobject.py @@ -8,7 +8,7 @@ from pypy.objspace.std.formatting import mod_format from pypy.objspace.std.stdtypedef import StdTypeDef from pypy.objspace.std.stringmethods import StringMethods -from pypy.objspace.std.unicodeobject import (unicode_from_string, +from pypy.objspace.std.unicodeobject import ( decode_object, unicode_from_encoded_object, _get_encoding_and_errors) from rpython.rlib.jit import we_are_jitted from rpython.rlib.objectmodel import compute_hash, compute_unique_id, import_from_mixin diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py --- a/pypy/objspace/std/stringmethods.py +++ b/pypy/objspace/std/stringmethods.py @@ -513,7 +513,14 @@ if self._startswith(space, value, w_prefix, start, end): return space.w_True return space.w_False - return space.newbool(self._startswith(space, value, w_prefix, start, end)) + try: + return space.newbool(self._startswith(space, value, w_prefix, start, end)) + except OperationError as e: + if e.match(space, space.w_TypeError): + msg = ("startswith first arg must be str or a tuple of str, " + "not %T") + raise operationerrfmt(space.w_TypeError, msg, w_prefix) + raise def _startswith(self, space, value, w_prefix, start, end): return startswith(value, self._op_val(space, w_prefix), start, end) @@ -527,7 +534,15 @@ if self._endswith(space, value, w_suffix, start, end): return space.w_True return space.w_False - return space.newbool(self._endswith(space, value, w_suffix, start, end)) + try: + return space.newbool(self._endswith(space, value, w_suffix, start, + end)) + except OperationError as e: + if e.match(space, space.w_TypeError): + msg = ("endswith first arg must be str or a tuple of str, not " + "%T") + raise operationerrfmt(space.w_TypeError, msg, w_suffix) + raise def _endswith(self, space, value, w_prefix, start, end): return endswith(value, self._op_val(space, w_prefix), start, end) diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -16,7 +16,7 @@ __all__ = ['W_UnicodeObject', 'wrapunicode', 'plain_str2unicode', 'encode_object', 'decode_object', 'unicode_from_object', - 'unicode_from_string', 'unicode_to_decimal_w'] + 'unicode_to_decimal_w'] class W_UnicodeObject(W_Root): @@ -99,9 +99,9 @@ def _op_val(self, space, w_other): if isinstance(w_other, W_UnicodeObject): return w_other._value - if space.isinstance_w(w_other, space.w_str): - return unicode_from_string(space, w_other)._value - return unicode_from_encoded_object(space, w_other, None, "strict")._value + raise operationerrfmt(space.w_TypeError, + "Can't convert '%T' object to str implicitly", + w_other) def _chr(self, char): assert len(char) == 1 @@ -155,41 +155,101 @@ return space.newlist_unicode(lst) @staticmethod - @unwrap_spec(w_string = WrappedDefault("")) - def descr_new(space, w_unicodetype, w_string, w_encoding=None, + @unwrap_spec(w_object = WrappedDefault(u'')) + def descr_new(space, w_unicodetype, w_object=None, w_encoding=None, w_errors=None): # NB. the default value of w_obj is really a *wrapped* empty string: # there is gateway magic at work - w_obj = w_string + w_obj = w_object encoding, errors = _get_encoding_and_errors(space, w_encoding, w_errors) - # convoluted logic for the case when unicode subclass has a __unicode__ - # method, we need to call this method - is_precisely_unicode = space.is_w(space.type(w_obj), space.w_unicode) - if (is_precisely_unicode or - (space.isinstance_w(w_obj, space.w_unicode) and - space.findattr(w_obj, space.wrap('__unicode__')) is None)): - if encoding is not None or errors is not None: - raise OperationError(space.w_TypeError, space.wrap( - 'decoding Unicode is not supported')) - if (is_precisely_unicode and - space.is_w(w_unicodetype, space.w_unicode)): - return w_obj - w_value = w_obj + if encoding is None and errors is None: + w_value = unicode_from_object(space, w_obj) else: - if encoding is None and errors is None: - w_value = unicode_from_object(space, w_obj) - else: - w_value = unicode_from_encoded_object(space, w_obj, - encoding, errors) - if space.is_w(w_unicodetype, space.w_unicode): - return w_value + w_value = unicode_from_encoded_object(space, w_obj, + encoding, errors) + if space.is_w(w_unicodetype, space.w_unicode): + return w_value assert isinstance(w_value, W_UnicodeObject) w_newobj = space.allocate_instance(W_UnicodeObject, w_unicodetype) W_UnicodeObject.__init__(w_newobj, w_value._value) return w_newobj + @staticmethod + def descr_maketrans(space, w_type, w_x, w_y=None, w_z=None): + if space.is_none(w_y): + y = None + else: + y = space.unicode_w(w_y) + if space.is_none(w_z): + z = None + else: + z = space.unicode_w(w_z) + + w_new = space.newdict() + if y is not None: + # x must be a string too, of equal length + ylen = len(y) + try: + x = space.unicode_w(w_x) + except OperationError, e: + if not e.match(space, space.w_TypeError): + raise + raise OperationError(space.w_TypeError, space.wrap( + "first maketrans argument must " + "be a string if there is a second argument")) + if len(x) != ylen: + raise OperationError(space.w_ValueError, space.wrap( + "the first two maketrans " + "arguments must have equal length")) + # create entries for translating chars in x to those in y + for i in range(len(x)): + w_key = space.newint(ord(x[i])) + w_value = space.newint(ord(y[i])) + space.setitem(w_new, w_key, w_value) + # create entries for deleting chars in z + if z is not None: + for i in range(len(z)): + w_key = space.newint(ord(z[i])) + space.setitem(w_new, w_key, space.w_None) + else: + # x must be a dict + if not space.is_w(space.type(w_x), space.w_dict): + raise OperationError(space.w_TypeError, space.wrap( + "if you give only one argument " + "to maketrans it must be a dict")) + # copy entries into the new dict, converting string keys to int keys + w_iter = space.iter(space.call_method(w_x, "items")) + while True: + try: + w_item = space.next(w_iter) + except OperationError, e: + if not e.match(space, space.w_StopIteration): + raise + break + w_key, w_value = space.unpackiterable(w_item, 2) + if space.isinstance_w(w_key, space.w_unicode): + # convert string keys to integer keys + key = space.unicode_w(w_key) + if len(key) != 1: + raise OperationError(space.w_ValueError, space.wrap( + "string keys in translate " + "table must be of length 1")) + w_key = space.newint(ord(key[0])) + else: + # just keep integer keys + try: + space.int_w(w_key) + except OperationError, e: + if not e.match(space, space.w_TypeError): + raise + raise OperationError(space.w_TypeError, space.wrap( + "keys in translate table must " + "be strings or integers")) + space.setitem(w_new, w_key, w_value) + return w_new + def descr_repr(self, space): chars = self._value size = len(chars) @@ -197,7 +257,10 @@ return space.wrap(s) def descr_str(self, space): - return encode_object(space, self, None, None) + if space.is_w(space.type(self), space.w_unicode): + return self + # Subtype -- return genuine unicode string with the same value. + return space.wrap(space.unicode_w(self)) def descr_hash(self, space): x = compute_hash(self._value) @@ -209,13 +272,6 @@ except OperationError, e: if e.match(space, space.w_TypeError): return space.w_NotImplemented - if (e.match(space, space.w_UnicodeDecodeError) or - e.match(space, space.w_UnicodeEncodeError)): - msg = ("Unicode equal comparison failed to convert both " - "arguments to Unicode - interpreting them as being " - "unequal") - space.warn(space.wrap(msg), space.w_UnicodeWarning) - return space.w_False raise def descr_ne(self, space, w_other): @@ -224,13 +280,6 @@ except OperationError, e: if e.match(space, space.w_TypeError): return space.w_NotImplemented - if (e.match(space, space.w_UnicodeDecodeError) or - e.match(space, space.w_UnicodeEncodeError)): - msg = ("Unicode unequal comparison failed to convert both " - "arguments to Unicode - interpreting them as being " - "unequal") - space.warn(space.wrap(msg), space.w_UnicodeWarning) - return space.w_True raise def descr_lt(self, space, w_other): @@ -274,19 +323,16 @@ return newformat.format_method(space, self, __args__.arguments_w, w_kwds, True) + def descr_format_map(self, space, w_mapping): + return newformat.format_method(space, self, None, w_mapping, True) + def descr__format__(self, space, w_format_spec): - """ - if not space.isinstance_w(w_format_spec, space.w_unicode): - w_format_spec = space.call_function(space.w_unicode, w_format_spec) - spec = space.unicode_w(w_format_spec) - formatter = newformat.unicode_formatter(space, spec) - self2 = unicode_from_object(space, self) - assert isinstance(self2, W_UnicodeObject) - return formatter.format_string(self2._value) - """ return newformat.run_formatter(space, w_format_spec, "format_string", self) + def descr_iter(self, space): + return space.newseqiter(self) + def descr_mod(self, space, w_values): return mod_format(space, self, w_values, do_unicode=True) @@ -334,16 +380,6 @@ return 0 return 1 - def descr_formatter_parser(self, space): - from pypy.objspace.std.newformat import unicode_template_formatter - tformat = unicode_template_formatter(space, space.unicode_w(self)) - return tformat.formatter_parser() - - def descr_formatter_field_name_split(self, space): - from pypy.objspace.std.newformat import unicode_template_formatter - tformat = unicode_template_formatter(space, space.unicode_w(self)) - return tformat.formatter_field_name_split() - def descr_isdecimal(self, space): return self._is_generic(space, '_isdecimal') @@ -370,6 +406,15 @@ cased = True return space.newbool(cased) + def descr_isidentifier(self, space): + return space.newbool(_isidentifier(self._value)) + + def descr_isprintable(self, space): + for uchar in self._value: + if not unicodedb.isprintable(ord(uchar)): + return space.w_False + return space.w_True + def wrapunicode(space, uni): return W_UnicodeObject(uni) @@ -390,6 +435,25 @@ space.wrap("ordinal not in range(128)")])) assert False, "unreachable" +def _isidentifier(u): + if not u: + return False + + # PEP 3131 says that the first character must be in XID_Start and + # subsequent characters in XID_Continue, and for the ASCII range, + # the 2.x rules apply (i.e start with letters and underscore, + # continue with letters, digits, underscore). However, given the + # current definition of XID_Start and XID_Continue, it is sufficient + # to check just for these, except that _ must be allowed as starting + # an identifier. + first = u[0] + if not (unicodedb.isxidstart(ord(first)) or first == u'_'): + return False + + for i in range(1, len(u)): + if not unicodedb.isxidcontinue(ord(u[i])): + return False + return True # stuff imported from bytesobject for interoperability @@ -420,14 +484,13 @@ if encoding == 'ascii': u = space.unicode_w(w_object) eh = unicodehelper.encode_error_handler(space) - return space.wrap(unicode_encode_ascii( + return space.wrapbytes(unicode_encode_ascii( u, len(u), None, errorhandler=eh)) if encoding == 'utf-8': u = space.unicode_w(w_object) eh = unicodehelper.encode_error_handler(space) - return space.wrap(unicode_encode_utf_8( - u, len(u), None, errorhandler=eh, - allow_surrogates=True)) + return space.wrapbytes(unicode_encode_utf_8( + u, len(u), None, errorhandler=eh)) from pypy.module._codecs.interp_codecs import lookup_codec w_encoder = space.getitem(lookup_codec(space, encoding), space.wrap(0)) if errors is None: @@ -436,10 +499,9 @@ w_errors = space.wrap(errors) w_restuple = space.call_function(w_encoder, w_object, w_errors) w_retval = space.getitem(w_restuple, space.wrap(0)) - if not space.isinstance_w(w_retval, space.w_str): - raise operationerrfmt(space.w_TypeError, - "encoder did not return an string object (type '%s')", - space.type(w_retval).getname(space)) + if not space.isinstance_w(w_retval, space.w_bytes): + msg = "encoder did not return a bytes string (type '%T')" + raise operationerrfmt(space.w_TypeError, msg, w_retval) return w_retval def decode_object(space, w_obj, encoding, errors): @@ -456,8 +518,7 @@ s = space.bufferstr_w(w_obj) eh = unicodehelper.decode_error_handler(space) return space.wrap(str_decode_utf_8( - s, len(s), None, final=True, errorhandler=eh, - allow_surrogates=True)[0]) + s, len(s), None, final=True, errorhandler=eh)[0]) w_codecs = space.getbuiltinmodule("_codecs") w_decode = space.getattr(w_codecs, space.wrap("decode")) if errors is None: @@ -486,44 +547,29 @@ def unicode_from_object(space, w_obj): if space.is_w(space.type(w_obj), space.w_unicode): return w_obj - elif space.is_w(space.type(w_obj), space.w_str): - w_res = w_obj - else: - w_unicode_method = space.lookup(w_obj, "__unicode__") - # obscure workaround: for the next two lines see - # test_unicode_conversion_with__str__ - if w_unicode_method is None: - if space.isinstance_w(w_obj, space.w_unicode): - return space.wrap(space.unicode_w(w_obj)) - w_unicode_method = space.lookup(w_obj, "__str__") - if w_unicode_method is not None: - w_res = space.get_and_call_function(w_unicode_method, w_obj) - else: - w_res = space.str(w_obj) - if space.isinstance_w(w_res, space.w_unicode): - return w_res - return unicode_from_encoded_object(space, w_res, None, "strict") + if space.lookup(w_obj, "__str__") is not None: + return space.str(w_obj) + return space.repr(w_obj) -def unicode_from_string(space, w_str): - # this is a performance and bootstrapping hack - encoding = getdefaultencoding(space) - if encoding != 'ascii': - return unicode_from_encoded_object(space, w_str, encoding, "strict") - s = space.str_w(w_str) - try: - return W_UnicodeObject(s.decode("ascii")) - except UnicodeDecodeError: - # raising UnicodeDecodeError is messy, "please crash for me" - return unicode_from_encoded_object(space, w_str, "ascii", "strict") +def ascii_from_object(space, w_obj): + """Implements builtins.ascii()""" + # repr is guaranteed to be unicode + w_repr = space.repr(w_obj) + w_encoded = encode_object(space, w_repr, 'ascii', 'backslashreplace') + return decode_object(space, w_encoded, 'ascii', None) class UnicodeDocstrings: - """unicode(object='') -> unicode object - unicode(string[, encoding[, errors]]) -> unicode object + """str(object='') -> str + str(bytes_or_buffer[, encoding[, errors]]) -> str - Create a new Unicode object from the given encoded string. - encoding defaults to the current default string encoding. - errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'. + Create a new string object from the given object. If encoding or + errors is specified, then the object must expose a data buffer + that will be decoded using the given encoding and error handler. + Otherwise, returns the result of object.__str__() (if defined) + or repr(object). + encoding defaults to sys.getdefaultencoding(). + errors defaults to 'strict'. """ @@ -554,18 +600,15 @@ def __getnewargs__(): """""" - def __getslice__(): - """x.__getslice__(i, j) <==> x[i:j] - - Use of negative indices is not supported. - """ - def __gt__(): """x.__gt__(y) <==> x>y""" def __hash__(): """x.__hash__() <==> hash(x)""" + def __iter__(): + """x.__iter__() <==> iter(x)""" + def __le__(): """x.__le__(y) <==> x<=y""" @@ -676,6 +719,14 @@ The substitutions are identified by braces ('{' and '}'). """ + def format_map(): + """S.format_map(mapping) -> str + + Return a formatted version of S, using substitutions from + mapping. The substitutions are identified by braces ('{' and + '}'). + """ + def index(): """S.index(sub[, start[, end]]) -> int @@ -710,6 +761,13 @@ and there is at least one character in S, False otherwise. """ + def isidentifier(): + """S.isidentifier() -> bool + + Return True if S is a valid identifier according to the language + definition. + """ + def islower(): """S.islower() -> bool @@ -724,6 +782,13 @@ False otherwise. """ + def isprintable(): + """S.isprintable() -> bool + + Return True if all characters in S are considered printable in + repr() or S is empty, False otherwise. + """ + def isspace(): """S.isspace() -> bool @@ -775,6 +840,19 @@ If chars is a str, it will be converted to unicode before stripping """ + def maketrans(): + """str.maketrans(x[, y[, z]]) -> dict (static method) + + Return a translation table usable for str.translate(). + If there is only one argument, it must be a dictionary mapping Unicode + ordinals (integers) or characters to Unicode ordinals, strings or None. + Character keys will be then converted to ordinals. + If there are two arguments, they must be strings of equal length, and + in the resulting dictionary, each character in x will be mapped to the + character at the same position in y. If there is a third argument, it + must be a string, whose characters will be mapped to None in the result. + """ + def partition(): """S.partition(sep) -> (head, sep, tail) @@ -939,6 +1017,8 @@ __ge__ = interp2app(W_UnicodeObject.descr_ge, doc=UnicodeDocstrings.__ge__.__doc__), + __iter__ = interp2app(W_UnicodeObject.descr_iter, + doc=UnicodeDocstrings.__iter__.__doc__), __len__ = interp2app(W_UnicodeObject.descr_len, doc=UnicodeDocstrings.__len__.__doc__), __contains__ = interp2app(W_UnicodeObject.descr_contains, @@ -953,8 +1033,6 @@ __getitem__ = interp2app(W_UnicodeObject.descr_getitem, doc=UnicodeDocstrings.__getitem__.__doc__), - __getslice__ = interp2app(W_UnicodeObject.descr_getslice, - doc=UnicodeDocstrings.__getslice__.__doc__), capitalize = interp2app(W_UnicodeObject.descr_capitalize, doc=UnicodeDocstrings.capitalize.__doc__), @@ -962,8 +1040,6 @@ doc=UnicodeDocstrings.center.__doc__), count = interp2app(W_UnicodeObject.descr_count, doc=UnicodeDocstrings.count.__doc__), - decode = interp2app(W_UnicodeObject.descr_decode, - doc=UnicodeDocstrings.decode.__doc__), encode = interp2app(W_UnicodeObject.descr_encode, doc=UnicodeDocstrings.encode.__doc__), expandtabs = interp2app(W_UnicodeObject.descr_expandtabs, @@ -984,10 +1060,14 @@ doc=UnicodeDocstrings.isdecimal.__doc__), isdigit = interp2app(W_UnicodeObject.descr_isdigit, doc=UnicodeDocstrings.isdigit.__doc__), + isidentifier = interp2app(W_UnicodeObject.descr_isidentifier, + doc=UnicodeDocstrings.isidentifier.__doc__), islower = interp2app(W_UnicodeObject.descr_islower, doc=UnicodeDocstrings.islower.__doc__), isnumeric = interp2app(W_UnicodeObject.descr_isnumeric, doc=UnicodeDocstrings.isnumeric.__doc__), + isprintable = interp2app(W_UnicodeObject.descr_isprintable, + doc=UnicodeDocstrings.isprintable.__doc__), isspace = interp2app(W_UnicodeObject.descr_isspace, doc=UnicodeDocstrings.isspace.__doc__), istitle = interp2app(W_UnicodeObject.descr_istitle, @@ -1037,15 +1117,17 @@ format = interp2app(W_UnicodeObject.descr_format, doc=UnicodeDocstrings.format.__doc__), + format_map = interp2app(W_UnicodeObject.descr_format_map, + doc=UnicodeDocstrings.format_map.__doc__), __format__ = interp2app(W_UnicodeObject.descr__format__, doc=UnicodeDocstrings.__format__.__doc__), __mod__ = interp2app(W_UnicodeObject.descr_mod, doc=UnicodeDocstrings.__mod__.__doc__), __getnewargs__ = interp2app(W_UnicodeObject.descr_getnewargs, doc=UnicodeDocstrings.__getnewargs__.__doc__), - _formatter_parser = interp2app(W_UnicodeObject.descr_formatter_parser), - _formatter_field_name_split = - interp2app(W_UnicodeObject.descr_formatter_field_name_split), + maketrans = interp2app(W_UnicodeObject.descr_maketrans, + as_classmethod=True, + doc=UnicodeDocstrings.maketrans.__doc__) ) @@ -1057,7 +1139,15 @@ W_UnicodeObject.EMPTY = W_UnicodeObject(u'') -# Helper for converting int/long +# Helper for converting int/long this is called only from +# {int,long,float}type.descr__new__: in the default branch this is implemented +# using the same logic as PyUnicode_EncodeDecimal, as CPython 2.7 does. +# +# In CPython3 the call to PyUnicode_EncodeDecimal has been replaced to a call +# to PyUnicode_TransformDecimalToASCII, which is much simpler. Here, we do the +# equivalent. +# +# Note that, differently than default, we return an *unicode* RPython string def unicode_to_decimal_w(space, w_unistr): if not isinstance(w_unistr, W_UnicodeObject): raise operationerrfmt(space.w_TypeError, "expected unicode, got '%T'", @@ -1079,4 +1169,4 @@ _repr_function, _ = make_unicode_escape_function( - pass_printable=False, unicode_output=False, quotes=True, prefix='u') + pass_printable=True, unicode_output=True, quotes=True, prefix='') _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit