Hi,

See attached patch: python3_bytes_filename.patch

With the patch applied, you get:
 - open() supports bytes filenames
 - listdir(unicode) -> returns only unicode names and *skips* invalid
   (undecodable) filenames, as asked by Guido
 - os.getcwdu() is removed
 - os.getcwdb() is added -> returns bytes
 - glob.glob() supports bytes
 - fnmatch.filter() supports bytes
 - posixpath.join() and posixpath.split() support bytes

Mixing bytes and str is invalid. Examples that raise a TypeError:
 - posixpath.join(b'x', 'y')
 - fnmatch.filter([b'x', 'y'], '*')
 - fnmatch.filter([b'x', b'y'], '*')
 - glob.glob1('.', b'*')
 - glob.glob1(b'.', '*')

$ diffstat ~/python3_bytes_filename.patch
 Lib/fnmatch.py        |    7 +++-
 Lib/glob.py           |   15 ++++++---
 Lib/io.py             |    2 -
 Lib/posixpath.py      |   20 ++++++++----
 Modules/posixmodule.c |   83 ++++++++++++++++++--------------------------------
 5 files changed, 62 insertions(+), 65 deletions(-)

TODO:
 - review this patch :-)
 - support non-ASCII bytes in fnmatch.filter()
 - fix other functions, e.g. posixpath.isabs() and fnmatch.fnmatchcase()
 - fix functions written in C: grep FileSystemDefaultEncoding
 - make sure that mixing bytes and str is rejected

-- 
Victor Stinner aka haypo
http://www.haypocalc.com/blog/
Index: Lib/posixpath.py
===================================================================
--- Lib/posixpath.py	(révision 66687)
+++ Lib/posixpath.py	(copie de travail)
@@ -59,14 +59,18 @@
     """Join two or more pathname components, inserting '/' as needed.
     If any component is an absolute path, all previous path components
     will be discarded."""
+    if isinstance(a, bytes):
+        sep = b'/'
+    else:
+        sep = '/'
     path = a
     for b in p:
-        if b.startswith('/'):
+        if b.startswith(sep):
             path = b
-        elif path == '' or path.endswith('/'):
+        elif not path or path.endswith(sep):
             path +=  b
         else:
-            path += '/' + b
+            path += sep + b
     return path
 
 
@@ -78,10 +82,14 @@
 def split(p):
     """Split a pathname.  Returns tuple "(head, tail)" where "tail" is
     everything after the final slash.  Either part may be empty."""
-    i = p.rfind('/') + 1
+    if isinstance(p, bytes):
+        sep = b'/'
+    else:
+        sep = '/'
+    i = p.rfind(sep) + 1
     head, tail = p[:i], p[i:]
-    if head and head != '/'*len(head):
-        head = head.rstrip('/')
+    if head and head != sep*len(head):
+        head = head.rstrip(sep)
     return head, tail
 
 
Index: Lib/glob.py
===================================================================
--- Lib/glob.py	(révision 66687)
+++ Lib/glob.py	(copie de travail)
@@ -27,7 +27,7 @@
         return
     dirname, basename = os.path.split(pathname)
     if not dirname:
-        for name in glob1(os.curdir, basename):
+        for name in glob1(None, basename):
             yield name
         return
     if has_magic(dirname):
@@ -49,9 +49,8 @@
 def glob1(dirname, pattern):
     if not dirname:
         dirname = os.curdir
-    if isinstance(pattern, str) and not isinstance(dirname, str):
-        dirname = str(dirname, sys.getfilesystemencoding() or
-                                   sys.getdefaultencoding())
+        if isinstance(pattern, bytes):
+            dirname = dirname.encode("ASCII")
     try:
         names = os.listdir(dirname)
     except os.error:
@@ -73,6 +72,12 @@
 
 
 magic_check = re.compile('[*?[]')
+magic_check_bytes = re.compile(b'[*?[]')
 
 def has_magic(s):
-    return magic_check.search(s) is not None
+    if isinstance(s, bytes):
+        match = magic_check_bytes.search(s)
+    else:
+        match = magic_check.search(s)
+    return match is not None
+
Index: Lib/fnmatch.py
===================================================================
--- Lib/fnmatch.py	(révision 66687)
+++ Lib/fnmatch.py	(copie de travail)
@@ -43,7 +43,12 @@
     result=[]
     pat=os.path.normcase(pat)
     if not pat in _cache:
-        res = translate(pat)
+        if isinstance(pat, bytes):
+            pat_str = str(pat, "ASCII")
+            res_str = translate(pat_str)
+            res = res_str.encode("ASCII")
+        else:
+            res = translate(pat)
         _cache[pat] = re.compile(res)
     match=_cache[pat].match
     if os.path is posixpath:
Index: Lib/io.py
===================================================================
--- Lib/io.py	(révision 66687)
+++ Lib/io.py	(copie de travail)
@@ -180,7 +180,7 @@
     opened in a text mode, and for bytes a BytesIO can be used like a file
     opened in a binary mode.
     """
-    if not isinstance(file, (str, int)):
+    if not isinstance(file, (str, bytes, int)):
         raise TypeError("invalid file: %r" % file)
     if not isinstance(mode, str):
         raise TypeError("invalid mode: %r" % mode)
Index: Modules/posixmodule.c
===================================================================
--- Modules/posixmodule.c	(révision 66687)
+++ Modules/posixmodule.c	(copie de travail)
@@ -1968,63 +1968,18 @@
 
 
 #ifdef HAVE_GETCWD
-PyDoc_STRVAR(posix_getcwd__doc__,
-"getcwd() -> path\n\n\
-Return a string representing the current working directory.");
-
 static PyObject *
-posix_getcwd(PyObject *self, PyObject *noargs)
+posix_getcwd(int use_bytes)
 {
-	int bufsize_incr = 1024;
-	int bufsize = 0;
-	char *tmpbuf = NULL;
-	char *res = NULL;
-	PyObject *dynamic_return;
-
-	Py_BEGIN_ALLOW_THREADS
-	do {
-		bufsize = bufsize + bufsize_incr;
-		tmpbuf = malloc(bufsize);
-		if (tmpbuf == NULL) {
-			break;
-		}
-#if defined(PYOS_OS2) && defined(PYCC_GCC)
-		res = _getcwd2(tmpbuf, bufsize);
-#else
-		res = getcwd(tmpbuf, bufsize);
-#endif
-
-		if (res == NULL) {
-			free(tmpbuf);
-		}
-	} while ((res == NULL) && (errno == ERANGE));
-	Py_END_ALLOW_THREADS
-
-	if (res == NULL)
-		return posix_error();
-
-	dynamic_return = PyUnicode_FromString(tmpbuf);
-	free(tmpbuf);
-
-	return dynamic_return;
-}
-
-PyDoc_STRVAR(posix_getcwdu__doc__,
-"getcwdu() -> path\n\n\
-Return a unicode string representing the current working directory.");
-
-static PyObject *
-posix_getcwdu(PyObject *self, PyObject *noargs)
-{
 	char buf[1026];
 	char *res;
 
 #ifdef Py_WIN_WIDE_FILENAMES
-	DWORD len;
-	if (unicode_file_names()) {
+	if (!use_bytes && unicode_file_names()) {
 		wchar_t wbuf[1026];
 		wchar_t *wbuf2 = wbuf;
 		PyObject *resobj;
+		DWORD len;
 		Py_BEGIN_ALLOW_THREADS
 		len = GetCurrentDirectoryW(sizeof wbuf/ sizeof wbuf[0], wbuf);
 		/* If the buffer is large enough, len does not include the
@@ -2059,8 +2014,30 @@
 	Py_END_ALLOW_THREADS
 	if (res == NULL)
 		return posix_error();
+	if (use_bytes)
+		return PyBytes_FromStringAndSize(buf, strlen(buf));
 	return PyUnicode_Decode(buf, strlen(buf), Py_FileSystemDefaultEncoding,"strict");
 }
+
+PyDoc_STRVAR(posix_getcwd__doc__,
+"getcwd() -> path\n\n\
+Return a unicode string representing the current working directory.");
+
+static PyObject *
+posix_getcwd_unicode(PyObject *self)
+{
+    return posix_getcwd(0);
+}
+
+PyDoc_STRVAR(posix_getcwdb__doc__,
+"getcwdb() -> path\n\n\
+Return a bytes string representing the current working directory.");
+
+static PyObject *
+posix_getcwd_bytes(PyObject *self)
+{
+    return posix_getcwd(1);
+}
 #endif
 
 
@@ -2378,9 +2355,9 @@
 				v = w;
 			}
 			else {
-				/* fall back to the original byte string, as
-				   discussed in patch #683592 */
 				PyErr_Clear();
+				Py_DECREF(v);
+				continue;
 			}
 		}
 		if (PyList_Append(d, v) != 0) {
@@ -6810,8 +6787,10 @@
 	{"ctermid",	posix_ctermid, METH_NOARGS, posix_ctermid__doc__},
 #endif
 #ifdef HAVE_GETCWD
-	{"getcwd",	posix_getcwd, METH_NOARGS, posix_getcwd__doc__},
-	{"getcwdu",	posix_getcwdu, METH_NOARGS, posix_getcwdu__doc__},
+	{"getcwd",	(PyCFunction)posix_getcwd_unicode,
+	METH_NOARGS, posix_getcwd__doc__},
+	{"getcwdb",	(PyCFunction)posix_getcwd_bytes,
+	METH_NOARGS, posix_getcwdb__doc__},
 #endif
 #ifdef HAVE_LINK
 	{"link",	posix_link, METH_VARARGS, posix_link__doc__},
_______________________________________________
Python-3000 mailing list
Python-3000@python.org
http://mail.python.org/mailman/listinfo/python-3000
Unsubscribe: 
http://mail.python.org/mailman/options/python-3000/archive%40mail-archive.com

Reply via email to