[issue1125] bytes.split should have same interface as str.split, or different name

Guido van Rossum Fri, 07 Sep 2007 17:37:27 -0700

Guido van Rossum added the comment:

Here's a patch that fixes bytes.split and .rsplit.  I'll hold off for a
while in case there's strong disagreement.  I might add a patch for
bytes.strip later (it's simpler).


----------
keywords: +patch

__________________________________
Tracker <[EMAIL PROTECTED]>
<http://bugs.python.org/issue1125>
__________________________________

Index: Objects/bytesobject.c
===================================================================
--- Objects/bytesobject.c	(revision 58048)
+++ Objects/bytesobject.c	(working copy)
@@ -2104,7 +2104,7 @@
 Py_LOCAL_INLINE(PyObject *)
 split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
 {
-    register Py_ssize_t i, j, count=0;
+    register Py_ssize_t i, j, count = 0;
     PyObject *str;
     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
 
@@ -2113,7 +2113,7 @@
 
     i = j = 0;
     while ((j < len) && (maxcount-- > 0)) {
-        for(; j<len; j++) {
+        for(; j < len; j++) {
             /* I found that using memchr makes no difference */
             if (s[j] == ch) {
                 SPLIT_ADD(s, i, j);
@@ -2133,28 +2133,72 @@
     return NULL;
 }
 
+#define ISSPACE(c) (isspace(Py_CHARMASK(c)) && ((c) & 0x80) == 0)
+
+Py_LOCAL_INLINE(PyObject *)
+split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
+{
+    register Py_ssize_t i, j, count = 0;
+    PyObject *str;
+    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
+
+    if (list == NULL)
+        return NULL;
+
+    for (i = j = 0; i < len; ) {
+	/* skip a whitespace run, then scan one token spanning s[j] .. s[i-1] */
+	while (i < len && ISSPACE(s[i]))
+	    i++;
+	j = i;
+	while (i < len && !ISSPACE(s[i]))
+	    i++;
+	if (j < i) {
+	    if (maxcount-- <= 0)  /* no splits left; j still marks this token */
+		break;
+	    SPLIT_ADD(s, j, i);  /* macro defined outside this hunk; presumably appends s[j:i] */
+	    while (i < len && ISSPACE(s[i]))
+		i++;
+	    j = i;
+	}
+    }
+    if (j < len) {  /* only after the break above: rest of input, unstripped */
+	SPLIT_ADD(s, j, len);
+    }
+    FIX_PREALLOC_SIZE(list);
+    return list;
+
+  onError:
+    Py_DECREF(list);
+    return NULL;
+}
+
 PyDoc_STRVAR(split__doc__,
-"B.split(sep [,maxsplit]) -> list of bytes\n\
+"B.split([sep [, maxsplit]]) -> list of bytes\n\
 \n\
 Return a list of the bytes in the string B, using sep as the\n\
-delimiter.  If maxsplit is given, at most maxsplit\n\
-splits are done.");
+delimiter.  If sep is not given, B is split on ASCII whitespace\n\
+characters (space, tab, return, newline, formfeed, vertical tab).\n\
+If maxsplit is given, at most maxsplit splits are done.");
 
 static PyObject *
 bytes_split(PyBytesObject *self, PyObject *args)
 {
     Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
-    Py_ssize_t maxsplit = -1, count=0;
+    Py_ssize_t maxsplit = -1, count = 0;
     const char *s = PyBytes_AS_STRING(self), *sub;
-    PyObject *list, *str, *subobj;
+    PyObject *list, *str, *subobj = Py_None;
 #ifdef USE_FAST
     Py_ssize_t pos;
 #endif
 
-    if (!PyArg_ParseTuple(args, "O|n:split", &subobj, &maxsplit))
+    if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
         return NULL;
     if (maxsplit < 0)
         maxsplit = PY_SSIZE_T_MAX;
+
+    if (subobj == Py_None)
+        return split_whitespace(s, len, maxsplit);
+
     if (PyBytes_Check(subobj)) {
         sub = PyBytes_AS_STRING(subobj);
         n = PyBytes_GET_SIZE(subobj);
@@ -2167,7 +2211,7 @@
         PyErr_SetString(PyExc_ValueError, "empty separator");
         return NULL;
     }
-    else if (n == 1)
+    if (n == 1)
         return split_char(s, len, sub[0], maxsplit);
 
     list = PyList_New(PREALLOC_SIZE(maxsplit));
@@ -2293,26 +2337,71 @@
     return NULL;
 }
 
+Py_LOCAL_INLINE(PyObject *)
+rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
+{
+    register Py_ssize_t i, j, count = 0;
+    PyObject *str;
+    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
+
+    if (list == NULL)
+        return NULL;
+
+    for (i = j = len - 1; i >= 0; ) {
+	/* find a token, scanning right to left (ASCII whitespace only) */
+	while (i >= 0 && ISSPACE(s[i]))
+	    i--;
+	j = i;
+	while (i >= 0 && !ISSPACE(s[i]))
+	    i--;
+	if (j > i) {
+	    if (maxcount-- <= 0)  /* no splits left; j still marks this token's end */
+		break;
+	    SPLIT_ADD(s, i + 1, j + 1);
+	    while (i >= 0 && ISSPACE(s[i]))
+		i--;
+	    j = i;
+	}
+    }
+    if (j >= 0) {  /* only after the break above: s[0..j], leading space kept */
+	SPLIT_ADD(s, 0, j + 1);
+    }
+    FIX_PREALLOC_SIZE(list);
+    if (PyList_Reverse(list) < 0)  /* items were collected back to front */
+        goto onError;
+
+    return list;
+
+  onError:
+    Py_DECREF(list);
+    return NULL;
+}
+
 PyDoc_STRVAR(rsplit__doc__,
-"B.rsplit(sep [,maxsplit]) -> list of bytes\n\
+"B.rsplit([sep [, maxsplit]]) -> list of bytes\n\
 \n\
 Return a list of the sections in the byte B, using sep as the\n\
 delimiter, starting at the end of the bytes and working\n\
-to the front.  If maxsplit is given, at most maxsplit splits are\n\
-done.");
+to the front.  If sep is not given, B is split on ASCII whitespace\n\
+characters (space, tab, return, newline, formfeed, vertical tab).\n\
+If maxsplit is given, at most maxsplit splits are done.");
 
 static PyObject *
 bytes_rsplit(PyBytesObject *self, PyObject *args)
 {
     Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
-    Py_ssize_t maxsplit = -1, count=0;
+    Py_ssize_t maxsplit = -1, count = 0;
     const char *s = PyBytes_AS_STRING(self), *sub;
-    PyObject *list, *str, *subobj;
+    PyObject *list, *str, *subobj = Py_None;
 
-    if (!PyArg_ParseTuple(args, "O|n:rsplit", &subobj, &maxsplit))
+    if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
         return NULL;
     if (maxsplit < 0)
         maxsplit = PY_SSIZE_T_MAX;
+
+    if (subobj == Py_None)
+        return rsplit_whitespace(s, len, maxsplit);
+
     if (PyBytes_Check(subobj)) {
         sub = PyBytes_AS_STRING(subobj);
         n = PyBytes_GET_SIZE(subobj);
Index: Lib/test/test_bytes.py
===================================================================
--- Lib/test/test_bytes.py	(revision 58048)
+++ Lib/test/test_bytes.py	(working copy)
@@ -617,17 +617,35 @@
         self.assertEqual(b.split(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
         self.assertEqual(b.split(b'ss'), [b'mi', b'i', b'ippi'])
         self.assertEqual(b.split(b'w'), [b])
-        # require an arg (no magic whitespace split)
-        self.assertRaises(TypeError, b.split)
 
+    def test_split_whitespace(self):
+        for b in (b'  arf  barf  ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',
+                  b'arf\fbarf', b'arf\vbarf'):
+            self.assertEqual(b.split(), [b'arf', b'barf'])
+            self.assertEqual(b.split(None), [b'arf', b'barf'])
+            self.assertEqual(b.split(None, 2), [b'arf', b'barf'])
+        self.assertEqual(b'  a  bb  c  '.split(None, 0), [b'a  bb  c  '])
+        self.assertEqual(b'  a  bb  c  '.split(None, 1), [b'a', b'bb  c  '])
+        self.assertEqual(b'  a  bb  c  '.split(None, 2), [b'a', b'bb', b'c  '])
+        self.assertEqual(b'  a  bb  c  '.split(None, 3), [b'a', b'bb', b'c'])
+
     def test_rsplit(self):
         b = b'mississippi'
         self.assertEqual(b.rsplit(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
         self.assertEqual(b.rsplit(b'ss'), [b'mi', b'i', b'ippi'])
         self.assertEqual(b.rsplit(b'w'), [b])
-        # require an arg (no magic whitespace split)
-        self.assertRaises(TypeError, b.rsplit)
 
+    def test_rsplit_whitespace(self):
+        for b in (b'  arf  barf  ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',
+                  b'arf\fbarf', b'arf\vbarf'):
+            self.assertEqual(b.rsplit(), [b'arf', b'barf'])
+            self.assertEqual(b.rsplit(None), [b'arf', b'barf'])
+            self.assertEqual(b.rsplit(None, 2), [b'arf', b'barf'])
+        self.assertEqual(b'  a  bb  c  '.rsplit(None, 0), [b'  a  bb  c'])
+        self.assertEqual(b'  a  bb  c  '.rsplit(None, 1), [b'  a  bb', b'c'])
+        self.assertEqual(b'  a  bb  c  '.rsplit(None,2), [b'  a', b'bb', b'c'])
+        self.assertEqual(b'  a  bb  c  '.rsplit(None, 3), [b'a', b'bb', b'c'])
+
     def test_partition(self):
         b = b'mississippi'
         self.assertEqual(b.partition(b'ss'), (b'mi', b'ss', b'issippi'))

_______________________________________________
Python-bugs-list mailing list 
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

[issue1125] bytes.split should have same interface as str.split, or different name

Reply via email to