https://github.com/python/cpython/commit/ccad61e35d240972d14f993507566706fbf419f1
commit: ccad61e35d240972d14f993507566706fbf419f1
branch: main
author: Barney Gale <barney.g...@gmail.com>
committer: barneygale <barney.g...@gmail.com>
date: 2025-04-14T01:49:02+01:00
summary:
GH-125866: Support complete "file:" URLs in urllib (#132378) Add optional *add_scheme* argument to `urllib.request.pathname2url()`; when set to true, a complete URL is returned. Likewise add optional *require_scheme* argument to `url2pathname()`; when set to true, a complete URL is accepted. Co-authored-by: Bénédikt Tran <10796600+picn...@users.noreply.github.com> files: A Misc/NEWS.d/next/Library/2025-04-10-21-43-04.gh-issue-125866.EZ9X8D.rst M Doc/library/urllib.request.rst M Doc/whatsnew/3.14.rst M Lib/pathlib/__init__.py M Lib/test/test_pathlib/test_pathlib.py M Lib/test/test_urllib.py M Lib/test/test_urllib2.py M Lib/test/test_urllib2net.py M Lib/urllib/request.py diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index edfc249eb43c78..a5f1b9b292a85a 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -146,16 +146,19 @@ The :mod:`urllib.request` module defines the following functions: attribute to modify its position in the handlers list. -.. function:: pathname2url(path) +.. function:: pathname2url(path, *, add_scheme=False) Convert the given local path to a ``file:`` URL. This function uses - :func:`~urllib.parse.quote` function to encode the path. For historical - reasons, the return value omits the ``file:`` scheme prefix. This example - shows the function being used on Windows:: + :func:`~urllib.parse.quote` function to encode the path. + + If *add_scheme* is false (the default), the return value omits the + ``file:`` scheme prefix. Set *add_scheme* to true to return a complete URL. + + This example shows the function being used on Windows:: >>> from urllib.request import pathname2url >>> path = 'C:\\Program Files' - >>> 'file:' + pathname2url(path) + >>> pathname2url(path, add_scheme=True) 'file:///C:/Program%20Files' .. versionchanged:: 3.14 @@ -168,17 +171,25 @@ The :mod:`urllib.request` module defines the following functions: sections. 
For example, the path ``/etc/hosts`` is converted to the URL ``///etc/hosts``. + .. versionchanged:: next + The *add_scheme* argument was added. + -.. function:: url2pathname(url) +.. function:: url2pathname(url, *, require_scheme=False) Convert the given ``file:`` URL to a local path. This function uses - :func:`~urllib.parse.unquote` to decode the URL. For historical reasons, - the given value *must* omit the ``file:`` scheme prefix. This example shows - the function being used on Windows:: + :func:`~urllib.parse.unquote` to decode the URL. + + If *require_scheme* is false (the default), the given value should omit a + ``file:`` scheme prefix. If *require_scheme* is set to true, the given + value should include the prefix; a :exc:`~urllib.error.URLError` is raised + if it doesn't. + + This example shows the function being used on Windows:: >>> from urllib.request import url2pathname >>> url = 'file:///C:/Program%20Files' - >>> url2pathname(url.removeprefix('file:')) + >>> url2pathname(url, require_scheme=True) 'C:\\Program Files' .. versionchanged:: 3.14 @@ -193,6 +204,9 @@ The :mod:`urllib.request` module defines the following functions: returned (as before), and on other platforms a :exc:`~urllib.error.URLError` is raised. + .. versionchanged:: next + The *require_scheme* argument was added. + .. function:: getproxies() diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 421d12660b7956..0e30500fc9b997 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -1218,16 +1218,20 @@ urllib supporting SHA-256 digest authentication as specified in :rfc:`7616`. (Contributed by Calvin Bui in :gh:`128193`.) -* Improve standards compliance when parsing and emitting ``file:`` URLs. +* Improve ergonomics and standards compliance when parsing and emitting + ``file:`` URLs. In :func:`urllib.request.url2pathname`: + - Accept a complete URL when the new *require_scheme* argument is set to + true. 
- Discard URL authorities that resolve to a local IP address. - Raise :exc:`~urllib.error.URLError` if a URL authority doesn't resolve - to ``localhost``, except on Windows where we return a UNC path. + to a local IP address, except on Windows where we return a UNC path. In :func:`urllib.request.pathname2url`: + - Return a complete URL when the new *add_scheme* argument is set to true. - Include an empty URL authority when a path begins with a slash. For example, the path ``/etc/hosts`` is converted to the URL ``///etc/hosts``. diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index 43a5440e0132ff..12cf9f579cb32d 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -1271,17 +1271,15 @@ def as_uri(self): if not self.is_absolute(): raise ValueError("relative paths can't be expressed as file URIs") from urllib.request import pathname2url - return f'file:{pathname2url(str(self))}' + return pathname2url(str(self), add_scheme=True) @classmethod def from_uri(cls, uri): """Return a new path from the given 'file' URI.""" - if not uri.startswith('file:'): - raise ValueError(f"URI does not start with 'file:': {uri!r}") from urllib.error import URLError from urllib.request import url2pathname try: - path = cls(url2pathname(uri.removeprefix('file:'))) + path = cls(url2pathname(uri, require_scheme=True)) except URLError as exc: raise ValueError(exc.reason) from None if not path.is_absolute(): diff --git a/Lib/test/test_pathlib/test_pathlib.py b/Lib/test/test_pathlib/test_pathlib.py index 21bc2f9e68b811..41a79d0dceb0eb 100644 --- a/Lib/test/test_pathlib/test_pathlib.py +++ b/Lib/test/test_pathlib/test_pathlib.py @@ -3302,8 +3302,8 @@ def test_from_uri_posix(self): @needs_posix def test_from_uri_pathname2url_posix(self): P = self.cls - self.assertEqual(P.from_uri('file:' + pathname2url('/foo/bar')), P('/foo/bar')) - self.assertEqual(P.from_uri('file:' + pathname2url('//foo/bar')), P('//foo/bar')) + self.assertEqual(P.from_uri(pathname2url('/foo/bar', 
add_scheme=True)), P('/foo/bar')) + self.assertEqual(P.from_uri(pathname2url('//foo/bar', add_scheme=True)), P('//foo/bar')) @needs_windows def test_absolute_windows(self): diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index ecf429e17811a4..abfbed8840ca03 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -476,7 +476,7 @@ def test_missing_localfile(self): def test_file_notexists(self): fd, tmp_file = tempfile.mkstemp() - tmp_file_canon_url = 'file:' + urllib.request.pathname2url(tmp_file) + tmp_file_canon_url = urllib.request.pathname2url(tmp_file, add_scheme=True) parsed = urllib.parse.urlsplit(tmp_file_canon_url) tmp_fileurl = parsed._replace(netloc='localhost').geturl() try: @@ -620,7 +620,7 @@ def tearDown(self): def constructLocalFileUrl(self, filePath): filePath = os.path.abspath(filePath) - return "file:" + urllib.request.pathname2url(filePath) + return urllib.request.pathname2url(filePath, add_scheme=True) def createNewTempFile(self, data=b""): """Creates a new temporary file containing the specified data, @@ -1436,6 +1436,21 @@ def test_pathname2url(self): self.assertEqual(fn(f'{sep}a{sep}b.c'), '///a/b.c') self.assertEqual(fn(f'{sep}a{sep}b%#c'), '///a/b%25%23c') + def test_pathname2url_add_scheme(self): + sep = os.path.sep + subtests = [ + ('', 'file:'), + (sep, 'file:///'), + ('a', 'file:a'), + (f'a{sep}b.c', 'file:a/b.c'), + (f'{sep}a{sep}b.c', 'file:///a/b.c'), + (f'{sep}a{sep}b%#c', 'file:///a/b%25%23c'), + ] + for path, expected_url in subtests: + with self.subTest(path=path): + self.assertEqual( + urllib.request.pathname2url(path, add_scheme=True), expected_url) + @unittest.skipUnless(sys.platform == 'win32', 'test specific to Windows pathnames.') def test_pathname2url_win(self): @@ -1503,6 +1518,49 @@ def test_url2pathname(self): self.assertEqual(fn('//localhost/foo/bar'), f'{sep}foo{sep}bar') self.assertEqual(fn('///foo/bar'), f'{sep}foo{sep}bar') self.assertEqual(fn('////foo/bar'), f'{sep}{sep}foo{sep}bar') + 
self.assertEqual(fn('data:blah'), 'data:blah') + self.assertEqual(fn('data://blah'), f'data:{sep}{sep}blah') + + def test_url2pathname_require_scheme(self): + sep = os.path.sep + subtests = [ + ('file:', ''), + ('FILE:', ''), + ('FiLe:', ''), + ('file:/', f'{sep}'), + ('file:///', f'{sep}'), + ('file:////', f'{sep}{sep}'), + ('file:foo', 'foo'), + ('file:foo/bar', f'foo{sep}bar'), + ('file:/foo/bar', f'{sep}foo{sep}bar'), + ('file://localhost/foo/bar', f'{sep}foo{sep}bar'), + ('file:///foo/bar', f'{sep}foo{sep}bar'), + ('file:////foo/bar', f'{sep}{sep}foo{sep}bar'), + ('file:data:blah', 'data:blah'), + ('file:data://blah', f'data:{sep}{sep}blah'), + ] + for url, expected_path in subtests: + with self.subTest(url=url): + self.assertEqual( + urllib.request.url2pathname(url, require_scheme=True), + expected_path) + + error_subtests = [ + '', + ':', + 'foo', + 'http:foo', + 'localfile:foo', + 'data:foo', + 'data:file:foo', + 'data:file://foo', + ] + for url in error_subtests: + with self.subTest(url=url): + self.assertRaises( + urllib.error.URLError, + urllib.request.url2pathname, + url, require_scheme=True) @unittest.skipUnless(sys.platform == 'win32', 'test specific to Windows pathnames.') diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index 088ee4c4f90803..f44d324b3ab763 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -809,7 +809,7 @@ def test_file(self): TESTFN = os_helper.TESTFN towrite = b"hello, world\n" - canonurl = 'file:' + urllib.request.pathname2url(os.path.abspath(TESTFN)) + canonurl = urllib.request.pathname2url(os.path.abspath(TESTFN), add_scheme=True) parsed = urlsplit(canonurl) if parsed.netloc: raise unittest.SkipTest("non-local working directory") diff --git a/Lib/test/test_urllib2net.py b/Lib/test/test_urllib2net.py index b84290a7368c29..e6a18476908495 100644 --- a/Lib/test/test_urllib2net.py +++ b/Lib/test/test_urllib2net.py @@ -150,7 +150,7 @@ def test_file(self): f.write('hi there\n') f.close() urls = [ - 
'file:' + urllib.request.pathname2url(os.path.abspath(TESTFN)), + urllib.request.pathname2url(os.path.abspath(TESTFN), add_scheme=True), ('file:///nonsensename/etc/passwd', None, urllib.error.URLError), ] diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 84c075ec8b359f..2c9c7b6ca5394d 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1466,17 +1466,16 @@ def get_names(self): def open_local_file(self, req): import email.utils import mimetypes - filename = _splittype(req.full_url)[1] - localfile = url2pathname(filename) + localfile = url2pathname(req.full_url, require_scheme=True) try: stats = os.stat(localfile) size = stats.st_size modified = email.utils.formatdate(stats.st_mtime, usegmt=True) - mtype = mimetypes.guess_type(filename)[0] + mtype = mimetypes.guess_file_type(localfile)[0] headers = email.message_from_string( 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % (mtype or 'text/plain', size, modified)) - origurl = f'file:{pathname2url(localfile)}' + origurl = pathname2url(localfile, add_scheme=True) return addinfourl(open(localfile, 'rb'), headers, origurl) except OSError as exp: raise URLError(exp, exp.filename) @@ -1635,9 +1634,16 @@ def data_open(self, req): # Code move from the old urllib module -def url2pathname(url): - """OS-specific conversion from a relative URL of the 'file' scheme - to a file system path; not recommended for general use.""" +def url2pathname(url, *, require_scheme=False): + """Convert the given file URL to a local file system path. + + The 'file:' scheme prefix must be omitted unless *require_scheme* + is set to true. 
+ """ + if require_scheme: + scheme, url = _splittype(url) + if scheme != 'file': + raise URLError("URL is missing a 'file:' scheme") authority, url = _splithost(url) if os.name == 'nt': if not _is_local_authority(authority): @@ -1661,13 +1667,17 @@ def url2pathname(url): return unquote(url, encoding=encoding, errors=errors) -def pathname2url(pathname): - """OS-specific conversion from a file system path to a relative URL - of the 'file' scheme; not recommended for general use.""" +def pathname2url(pathname, *, add_scheme=False): + """Convert the given local file system path to a file URL. + + The 'file:' scheme prefix is omitted unless *add_scheme* + is set to true. + """ if os.name == 'nt': pathname = pathname.replace('\\', '/') encoding = sys.getfilesystemencoding() errors = sys.getfilesystemencodeerrors() + scheme = 'file:' if add_scheme else '' drive, root, tail = os.path.splitroot(pathname) if drive: # First, clean up some special forms. We are going to sacrifice the @@ -1689,7 +1699,7 @@ def pathname2url(pathname): # avoids interpreting the path as a URL authority. root = '//' + root tail = quote(tail, encoding=encoding, errors=errors) - return drive + root + tail + return scheme + drive + root + tail # Utility functions diff --git a/Misc/NEWS.d/next/Library/2025-04-10-21-43-04.gh-issue-125866.EZ9X8D.rst b/Misc/NEWS.d/next/Library/2025-04-10-21-43-04.gh-issue-125866.EZ9X8D.rst new file mode 100644 index 00000000000000..0d60a16a17753a --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-04-10-21-43-04.gh-issue-125866.EZ9X8D.rst @@ -0,0 +1,4 @@ +Add optional *add_scheme* argument to :func:`urllib.request.pathname2url`; when +set to true, a complete URL is returned. Likewise add optional *require_scheme* +argument to :func:`~urllib.request.url2pathname`; when set to true, a complete +URL is accepted. 
_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: arch...@mail-archive.com