Guido van Rossum added the comment:
Here's a patch that fixes bytes.split and bytes.rsplit. I'll hold off on
committing it for a while in case there's strong disagreement. I might add
a patch for bytes.strip later (it's simpler).
----------
keywords: +patch
__________________________________
Tracker <[EMAIL PROTECTED]>
<http://bugs.python.org/issue1125>
__________________________________
Index: Objects/bytesobject.c
===================================================================
--- Objects/bytesobject.c (revision 58048)
+++ Objects/bytesobject.c (working copy)
@@ -2104,7 +2104,7 @@
Py_LOCAL_INLINE(PyObject *)
split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
- register Py_ssize_t i, j, count=0;
+ register Py_ssize_t i, j, count = 0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
@@ -2113,7 +2113,7 @@
i = j = 0;
while ((j < len) && (maxcount-- > 0)) {
- for(; j<len; j++) {
+ for(; j < len; j++) {
/* I found that using memchr makes no difference */
if (s[j] == ch) {
SPLIT_ADD(s, i, j);
@@ -2133,28 +2133,72 @@
return NULL;
}
+#define ISSPACE(c) (isspace(Py_CHARMASK(c)) && ((c) & 0x80) == 0) /* whitespace test that rejects any byte with the high bit set; evaluates c twice */
+
+Py_LOCAL_INLINE(PyObject *)
+split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount) /* split s[0:len] on ISSPACE runs, left to right, at most maxcount splits */
+{
+ register Py_ssize_t i, j, count = 0; /* count is not touched directly here; presumably used by the SPLIT_ADD / FIX_PREALLOC_SIZE macros -- confirm */
+ PyObject *str; /* likewise presumably a scratch variable for SPLIT_ADD -- confirm */
+ PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
+
+ if (list == NULL)
+ return NULL;
+
+ for (i = j = 0; i < len; ) {
+ /* skip leading whitespace, then scan one token so that it spans s[j:i] */
+ while (i < len && ISSPACE(s[i]))
+ i++;
+ j = i;
+ while (i < len && !ISSPACE(s[i]))
+ i++;
+ if (j < i) {
+ if (maxcount-- <= 0)
+ break; /* split budget exhausted: j still marks the token start, so s[j:len] is emitted below */
+ SPLIT_ADD(s, j, i); /* presumably appends s[j:i] to list and jumps to onError on failure -- macro defined outside this hunk */
+ while (i < len && ISSPACE(s[i]))
+ i++;
+ j = i; /* j == len here means nothing is left over */
+ }
+ }
+ if (j < len) { /* unsplit remainder, trailing whitespace included */
+ SPLIT_ADD(s, j, len);
+ }
+ FIX_PREALLOC_SIZE(list);
+ return list;
+
+ onError:
+ Py_DECREF(list);
+ return NULL;
+}
+
PyDoc_STRVAR(split__doc__,
-"B.split(sep [,maxsplit]) -> list of bytes\n\
+"B.split([sep [, maxsplit]]) -> list of bytes\n\
\n\
Return a list of the bytes in the string B, using sep as the\n\
-delimiter. If maxsplit is given, at most maxsplit\n\
-splits are done.");
+delimiter. If sep is not given, B is split on ASCII whitespace\n\
+characters (space, tab, return, newline, formfeed, vertical tab).\n\
+If maxsplit is given, at most maxsplit splits are done.");
static PyObject *
bytes_split(PyBytesObject *self, PyObject *args)
{
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
- Py_ssize_t maxsplit = -1, count=0;
+ Py_ssize_t maxsplit = -1, count = 0;
const char *s = PyBytes_AS_STRING(self), *sub;
- PyObject *list, *str, *subobj;
+ PyObject *list, *str, *subobj = Py_None;
#ifdef USE_FAST
Py_ssize_t pos;
#endif
- if (!PyArg_ParseTuple(args, "O|n:split", &subobj, &maxsplit))
+ if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
return NULL;
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
+
+ if (subobj == Py_None)
+ return split_whitespace(s, len, maxsplit);
+
if (PyBytes_Check(subobj)) {
sub = PyBytes_AS_STRING(subobj);
n = PyBytes_GET_SIZE(subobj);
@@ -2167,7 +2211,7 @@
PyErr_SetString(PyExc_ValueError, "empty separator");
return NULL;
}
- else if (n == 1)
+ if (n == 1)
return split_char(s, len, sub[0], maxsplit);
list = PyList_New(PREALLOC_SIZE(maxsplit));
@@ -2293,26 +2337,71 @@
return NULL;
}
+Py_LOCAL_INLINE(PyObject *)
+rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount) /* split s[0:len] on ISSPACE runs, right to left, at most maxcount splits */
+{
+ register Py_ssize_t i, j, count = 0; /* count is not touched directly here; presumably used by the SPLIT_ADD / FIX_PREALLOC_SIZE macros -- confirm */
+ PyObject *str; /* likewise presumably a scratch variable for SPLIT_ADD -- confirm */
+ PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
+
+ if (list == NULL)
+ return NULL;
+
+ for (i = j = len - 1; i >= 0; ) {
+ /* skip trailing whitespace, then scan one token so that it spans s[i+1:j+1]; use the same byte-oriented ISSPACE test as split_whitespace, not the Unicode predicate */
+ while (i >= 0 && ISSPACE(s[i]))
+ i--;
+ j = i;
+ while (i >= 0 && !ISSPACE(s[i]))
+ i--;
+ if (j > i) {
+ if (maxcount-- <= 0)
+ break; /* split budget exhausted: j still marks the token end, so s[0:j+1] is emitted below */
+ SPLIT_ADD(s, i + 1, j + 1); /* presumably appends s[i+1:j+1] to list and jumps to onError on failure -- macro defined outside this hunk */
+ while (i >= 0 && ISSPACE(s[i]))
+ i--;
+ j = i; /* j == -1 here means nothing is left over */
+ }
+ }
+ if (j >= 0) { /* unsplit remainder, leading whitespace included */
+ SPLIT_ADD(s, 0, j + 1);
+ }
+ FIX_PREALLOC_SIZE(list);
+ if (PyList_Reverse(list) < 0) /* tokens were collected back to front */
+ goto onError;
+
+ return list;
+
+ onError:
+ Py_DECREF(list);
+ return NULL;
+}
+
PyDoc_STRVAR(rsplit__doc__,
-"B.rsplit(sep [,maxsplit]) -> list of bytes\n\
+"B.rsplit([sep [, maxsplit]]) -> list of bytes\n\
\n\
Return a list of the sections in the byte B, using sep as the\n\
delimiter, starting at the end of the bytes and working\n\
-to the front. If maxsplit is given, at most maxsplit splits are\n\
-done.");
+to the front. If sep is not given, B is split on ASCII whitespace\n\
+characters (space, tab, return, newline, formfeed, vertical tab).\n\
+If maxsplit is given, at most maxsplit splits are done.");
static PyObject *
bytes_rsplit(PyBytesObject *self, PyObject *args)
{
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
- Py_ssize_t maxsplit = -1, count=0;
+ Py_ssize_t maxsplit = -1, count = 0;
const char *s = PyBytes_AS_STRING(self), *sub;
- PyObject *list, *str, *subobj;
+ PyObject *list, *str, *subobj = Py_None;
- if (!PyArg_ParseTuple(args, "O|n:rsplit", &subobj, &maxsplit))
+ if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
return NULL;
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
+
+ if (subobj == Py_None)
+ return rsplit_whitespace(s, len, maxsplit);
+
if (PyBytes_Check(subobj)) {
sub = PyBytes_AS_STRING(subobj);
n = PyBytes_GET_SIZE(subobj);
Index: Lib/test/test_bytes.py
===================================================================
--- Lib/test/test_bytes.py (revision 58048)
+++ Lib/test/test_bytes.py (working copy)
@@ -617,17 +617,35 @@
self.assertEqual(b.split(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
self.assertEqual(b.split(b'ss'), [b'mi', b'i', b'ippi'])
self.assertEqual(b.split(b'w'), [b])
- # require an arg (no magic whitespace split)
- self.assertRaises(TypeError, b.split)
+ def test_split_whitespace(self):  # split() with no separator, or None, splits on whitespace runs
+ for b in (b' arf barf ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',  # one sample per separator: space, tab, newline, return,
+ b'arf\fbarf', b'arf\vbarf'):  # formfeed, vertical tab
+ self.assertEqual(b.split(), [b'arf', b'barf'])
+ self.assertEqual(b.split(None), [b'arf', b'barf'])
+ self.assertEqual(b.split(None, 2), [b'arf', b'barf'])
+ self.assertEqual(b' a bb c '.split(None, 0), [b'a bb c '])  # maxsplit=0: only the leading whitespace is dropped
+ self.assertEqual(b' a bb c '.split(None, 1), [b'a', b'bb c '])  # the unsplit tail keeps its trailing whitespace
+ self.assertEqual(b' a bb c '.split(None, 2), [b'a', b'bb', b'c '])
+ self.assertEqual(b' a bb c '.split(None, 3), [b'a', b'bb', b'c'])  # enough splits: no whitespace survives
+
def test_rsplit(self):
b = b'mississippi'
self.assertEqual(b.rsplit(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
self.assertEqual(b.rsplit(b'ss'), [b'mi', b'i', b'ippi'])
self.assertEqual(b.rsplit(b'w'), [b])
- # require an arg (no magic whitespace split)
- self.assertRaises(TypeError, b.rsplit)
+ def test_rsplit_whitespace(self):  # rsplit() with no separator, or None, splits on whitespace runs from the right
+ for b in (b' arf barf ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',  # one sample per separator: space, tab, newline, return,
+ b'arf\fbarf', b'arf\vbarf'):  # formfeed, vertical tab
+ self.assertEqual(b.rsplit(), [b'arf', b'barf'])
+ self.assertEqual(b.rsplit(None), [b'arf', b'barf'])
+ self.assertEqual(b.rsplit(None, 2), [b'arf', b'barf'])
+ self.assertEqual(b' a bb c '.rsplit(None, 0), [b' a bb c'])  # maxsplit=0: only the trailing whitespace is dropped
+ self.assertEqual(b' a bb c '.rsplit(None, 1), [b' a bb', b'c'])  # the unsplit head keeps its leading whitespace
+ self.assertEqual(b' a bb c '.rsplit(None,2), [b' a', b'bb', b'c'])
+ self.assertEqual(b' a bb c '.rsplit(None, 3), [b'a', b'bb', b'c'])  # enough splits: no whitespace survives
+
def test_partition(self):
b = b'mississippi'
self.assertEqual(b.partition(b'ss'), (b'mi', b'ss', b'issippi'))
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com