As anyone following the py3k checkins should have figured out by now, I'm on a mission to require all code to be consistent about bytes vs. str. For example binary files will soon refuse str arguments to write(), and vice versa.
I have a patch that turns on this enforcement, but I have about 14 failing unit tests that require a lot of attention. I'm hoping a few folks might have time to help out. Here are the unit tests that still need work: test_asynchat test_bsddb3 test_cgi test_cmd_line test_csv test_doctest test_gettext test_httplib test_shelve test_sqlite test_tarfile test_urllib test_urllib2 test_urllib2_localnet Attached is the patch that makes them fail. Note that it forces an error when you use PyBUF_CHARACTER when calling PyObject_GetBuffer on a str (PyUnicode) object. -- --Guido van Rossum (home page: http://www.python.org/~guido/)
Index: Objects/unicodeobject.c =================================================================== --- Objects/unicodeobject.c (revision 57587) +++ Objects/unicodeobject.c (working copy) @@ -965,31 +965,11 @@ return NULL; } -#if 0 - /* For b/w compatibility we also accept Unicode objects provided - that no encodings is given and then redirect to - PyObject_Unicode() which then applies the additional logic for - Unicode subclasses. - - NOTE: This API should really only be used for object which - represent *encoded* Unicode ! - - */ - if (PyUnicode_Check(obj)) { - if (encoding) { - PyErr_SetString(PyExc_TypeError, - "decoding Unicode is not supported"); - return NULL; - } - return PyObject_Unicode(obj); - } -#else if (PyUnicode_Check(obj)) { PyErr_SetString(PyExc_TypeError, "decoding Unicode is not supported"); return NULL; } -#endif /* Coerce object */ if (PyString_Check(obj)) { @@ -6440,26 +6420,7 @@ static PyObject * unicode_decode(PyUnicodeObject *self, PyObject *args) { - char *encoding = NULL; - char *errors = NULL; - PyObject *v; - - if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors)) - return NULL; - v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors); - if (v == NULL) - goto onError; - if (!PyString_Check(v) && !PyUnicode_Check(v)) { - PyErr_Format(PyExc_TypeError, - "decoder did not return a string/unicode object " - "(type=%.400s)", - Py_Type(v)->tp_name); - Py_DECREF(v); - return NULL; - } - return v; - - onError: + PyErr_Format(PyExc_TypeError, "decoding str is not supported"); return NULL; } @@ -8471,17 +8432,11 @@ { if (flags & PyBUF_CHARACTER) { - PyObject *str; - - str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); - if (str == NULL) return -1; - return PyBuffer_FillInfo(view, (void *)PyString_AS_STRING(str), - PyString_GET_SIZE(str), 1, flags); + PyErr_SetString(PyExc_SystemError, "can't use str as char buffer"); + return -1; } - else { - return PyBuffer_FillInfo(view, (void *)self->str, - 
PyUnicode_GET_DATA_SIZE(self), 1, flags); - } + return PyBuffer_FillInfo(view, (void *)self->str, + PyUnicode_GET_DATA_SIZE(self), 1, flags); } Index: Objects/bytesobject.c =================================================================== --- Objects/bytesobject.c (revision 57587) +++ Objects/bytesobject.c (working copy) @@ -82,7 +82,13 @@ if (buffer == NULL || PyUnicode_Check(obj) || - buffer->bf_getbuffer == NULL) return -1; + buffer->bf_getbuffer == NULL) + { + PyErr_Format(PyExc_TypeError, + "Type %.100s doesn't support the buffer API", + Py_Type(obj)->tp_name); + return -1; + } if (buffer->bf_getbuffer(obj, view, PyBUF_SIMPLE) < 0) return -1; @@ -167,7 +173,7 @@ else if (size < alloc) { /* Within allocated size; quick exit */ Py_Size(self) = size; - ((PyBytesObject *)self)->ob_bytes[size] = '\0'; /* Trailing null byte */ + ((PyBytesObject *)self)->ob_bytes[size] = '\0'; /* Trailing null */ return 0; } else if (size <= alloc * 1.125) { @@ -181,10 +187,11 @@ if (((PyBytesObject *)self)->ob_exports > 0) { /* - fprintf(stderr, "%d: %s", ((PyBytesObject *)self)->ob_exports, ((PyBytesObject *)self)->ob_bytes); + fprintf(stderr, "%d: %s", ((PyBytesObject *)self)->ob_exports, + ((PyBytesObject *)self)->ob_bytes); */ PyErr_SetString(PyExc_BufferError, - "Existing exports of data: object cannot be re-sized"); + "Existing exports of data: object cannot be re-sized"); return -1; } @@ -262,24 +269,24 @@ PyBuffer vo; if (_getbuffer(other, &vo) < 0) { - PyErr_Format(PyExc_TypeError, - "can't concat bytes to %.100s", Py_Type(self)->tp_name); - return NULL; + PyErr_Format(PyExc_TypeError, "can't concat bytes to %.100s", + Py_Type(self)->tp_name); + return NULL; } mysize = Py_Size(self); size = mysize + vo.len; if (size < 0) { - PyObject_ReleaseBuffer(other, &vo); - return PyErr_NoMemory(); + PyObject_ReleaseBuffer(other, &vo); + return PyErr_NoMemory(); } if (size < self->ob_alloc) { - Py_Size(self) = size; - self->ob_bytes[Py_Size(self)] = '\0'; /* Trailing null byte */ + 
Py_Size(self) = size; + self->ob_bytes[Py_Size(self)] = '\0'; /* Trailing null byte */ } else if (PyBytes_Resize((PyObject *)self, size) < 0) { - PyObject_ReleaseBuffer(other, &vo); - return NULL; + PyObject_ReleaseBuffer(other, &vo); + return NULL; } memcpy(self->ob_bytes + mysize, vo.buf, vo.len); PyObject_ReleaseBuffer(other, &vo); @@ -327,7 +334,7 @@ return PyErr_NoMemory(); if (size < self->ob_alloc) { Py_Size(self) = size; - self->ob_bytes[Py_Size(self)] = '\0'; /* Trailing null byte */ + self->ob_bytes[Py_Size(self)] = '\0'; /* Trailing null byte */ } else if (PyBytes_Resize((PyObject *)self, size) < 0) return NULL; @@ -507,7 +514,7 @@ memmove(self->ob_bytes + lo + needed, self->ob_bytes + hi, Py_Size(self) - hi); } - /* XXX(nnorwitz): need to verify this can't overflow! */ + /* XXX(nnorwitz): need to verify this can't overflow! */ if (PyBytes_Resize((PyObject *)self, Py_Size(self) + needed - avail) < 0) { res = -1; @@ -757,8 +764,11 @@ if (PyUnicode_Check(arg)) { /* Encode via the codec registry */ PyObject *encoded, *new; - if (encoding == NULL) - encoding = PyUnicode_GetDefaultEncoding(); + if (encoding == NULL) { + PyErr_SetString(PyExc_TypeError, + "string argument without an encoding"); + return -1; + } encoded = PyCodec_Encode(arg, encoding, errors); if (encoded == NULL) return -1; @@ -769,12 +779,12 @@ Py_DECREF(encoded); return -1; } - new = bytes_iconcat(self, encoded); - Py_DECREF(encoded); - if (new == NULL) - return -1; - Py_DECREF(new); - return 0; + new = bytes_iconcat(self, encoded); + Py_DECREF(encoded); + if (new == NULL) + return -1; + Py_DECREF(new); + return 0; } /* If it's not unicode, there can't be encoding or errors */ @@ -954,12 +964,14 @@ self_size = _getbuffer(self, &self_bytes); if (self_size < 0) { + PyErr_Clear(); Py_INCREF(Py_NotImplemented); return Py_NotImplemented; } other_size = _getbuffer(other, &other_bytes); if (other_size < 0) { + PyErr_Clear(); PyObject_ReleaseBuffer(self, &self_bytes); Py_INCREF(Py_NotImplemented); 
return Py_NotImplemented; @@ -1061,10 +1073,11 @@ sub_len = PyBytes_GET_SIZE(subobj); } /* XXX --> use the modern buffer interface */ - else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len)) + else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len)) { /* XXX - the "expected a character buffer object" is pretty confusing for a non-expert. remap to something else ? */ return -2; + } if (dir > 0) return stringlib_find_slice( @@ -2021,49 +2034,24 @@ { Py_ssize_t count = -1; PyObject *from, *to, *res; - const char *from_s, *to_s; - Py_ssize_t from_len, to_len; - int relfrom=0, relto=0; PyBuffer vfrom, vto; if (!PyArg_ParseTuple(args, "OO|n:replace", &from, &to, &count)) return NULL; - if (PyBytes_Check(from)) { - from_s = PyBytes_AS_STRING(from); - from_len = PyBytes_GET_SIZE(from); + if (_getbuffer(from, &vfrom) < 0) + return NULL; + if (_getbuffer(to, &vto) < 0) { + PyObject_ReleaseBuffer(from, &vfrom); + return NULL; } - else { - if (PyObject_GetBuffer(from, &vfrom, PyBUF_CHARACTER) < 0) - return NULL; - from_s = vfrom.buf; - from_len = vfrom.len; - relfrom = 1; - } - if (PyBytes_Check(to)) { - to_s = PyBytes_AS_STRING(to); - to_len = PyBytes_GET_SIZE(to); - } - else { - if (PyObject_GetBuffer(to, &vto, PyBUF_CHARACTER) < 0) { - if (relfrom) - PyObject_ReleaseBuffer(from, &vfrom); - return NULL; - } - to_s = vto.buf; - to_len = vto.len; - relto = 1; - } - res = (PyObject *)replace((PyBytesObject *) self, - from_s, from_len, - to_s, to_len, count); + vfrom.buf, vfrom.len, + vto.buf, vto.len, count); - if (relfrom) - PyObject_ReleaseBuffer(from, &vfrom); - if (relto) - PyObject_ReleaseBuffer(to, &vto); + PyObject_ReleaseBuffer(from, &vfrom); + PyObject_ReleaseBuffer(to, &vto); return res; } @@ -2799,10 +2787,10 @@ { PyObject *latin1; if (self->ob_bytes) - latin1 = PyUnicode_DecodeLatin1(self->ob_bytes, - Py_Size(self), NULL); + latin1 = PyUnicode_DecodeLatin1(self->ob_bytes, + Py_Size(self), NULL); else - latin1 = PyUnicode_FromString(""); + latin1 = 
PyUnicode_FromString(""); return Py_BuildValue("(O(Ns))", Py_Type(self), latin1, "latin-1"); } Index: Lib/io.py =================================================================== --- Lib/io.py (revision 57587) +++ Lib/io.py (working copy) @@ -659,12 +659,14 @@ def write(self, b): if self.closed: raise ValueError("write to closed file") + if isinstance(b, str): + raise TypeError("can't write str to binary stream") n = len(b) newpos = self._pos + n if newpos > len(self._buffer): # Inserts null bytes between the current end of the file # and the new write position. - padding = '\x00' * (newpos - len(self._buffer) - n) + padding = b'\x00' * (newpos - len(self._buffer) - n) self._buffer[self._pos:newpos - n] = padding self._buffer[self._pos:newpos] = b self._pos = newpos @@ -801,11 +803,8 @@ def write(self, b): if self.closed: raise ValueError("write to closed file") - if not isinstance(b, bytes): - if hasattr(b, "__index__"): - raise TypeError("Can't write object of type %s" % - type(b).__name__) - b = bytes(b) + if isinstance(b, str): + raise TypeError("can't write str to binary stream") # XXX we can implement some more tricks to try and avoid partial writes if len(self._write_buf) > self.buffer_size: # We're full, so let's pre-flush the buffer @@ -1099,8 +1098,6 @@ s = s.replace("\n", self._writenl) # XXX What if we were just reading? b = s.encode(self._encoding) - if isinstance(b, str): - b = bytes(b) self.buffer.write(b) if haslf and self.isatty(): self.flush()
_______________________________________________ Python-3000 mailing list Python-3000@python.org http://mail.python.org/mailman/listinfo/python-3000 Unsubscribe: http://mail.python.org/mailman/options/python-3000/archive%40mail-archive.com