https://github.com/python/cpython/commit/66cdb2bd8ab93a4691bead7f5d1e54e5ca6895b4 commit: 66cdb2bd8ab93a4691bead7f5d1e54e5ca6895b4 branch: main author: Barney Gale <barney.g...@gmail.com> committer: barneygale <barney.g...@gmail.com> date: 2025-04-10T19:58:04Z summary:
GH-123599: `url2pathname()`: handle authority section in file URL (#126844) In `urllib.request.url2pathname()`, if the authority resolves to the current host, discard it. If an authority is present but resolves somewhere else, then on Windows we return a UNC path (as before), and on other platforms we raise `URLError`. Affects `pathlib.Path.from_uri()` in the same way. Co-authored-by: Adam Turner <9087854+aa-tur...@users.noreply.github.com> Co-authored-by: Bénédikt Tran <10796600+picn...@users.noreply.github.com> files: A Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst M Doc/library/pathlib.rst M Doc/library/urllib.request.rst M Doc/whatsnew/3.14.rst M Lib/pathlib/__init__.py M Lib/test/test_pathlib/test_pathlib.py M Lib/test/test_urllib.py M Lib/urllib/request.py diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 708a16e6bc8c94..b82986902861b2 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -871,6 +871,12 @@ conforming to :rfc:`8089`. .. versionadded:: 3.13 + .. versionchanged:: next + If a URL authority (e.g. a hostname) is present and resolves to a local + address, it is discarded. If an authority is present and *doesn't* + resolve to a local address, then on Windows a UNC path is returned (as + before), and on other platforms a :exc:`ValueError` is raised. + .. method:: Path.as_uri() diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index 8b54e10713e782..edfc249eb43c78 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -158,16 +158,16 @@ The :mod:`urllib.request` module defines the following functions: >>> 'file:' + pathname2url(path) 'file:///C:/Program%20Files' - .. versionchanged:: 3.14 - Paths beginning with a slash are converted to URLs with authority - sections. For example, the path ``/etc/hosts`` is converted to - the URL ``///etc/hosts``. - .. versionchanged:: 3.14 Windows drive letters are no longer converted to uppercase, and ``:`` characters not following a drive letter no longer cause an :exc:`OSError` exception to be raised on Windows. + .. versionchanged:: 3.14 + Paths beginning with a slash are converted to URLs with authority + sections. For example, the path ``/etc/hosts`` is converted to + the URL ``///etc/hosts``. + .. function:: url2pathname(url) @@ -186,6 +186,13 @@ The :mod:`urllib.request` module defines the following functions: characters not following a drive letter no longer cause an :exc:`OSError` exception to be raised on Windows. + .. versionchanged:: next + This function calls :func:`socket.gethostbyname` if the URL authority + isn't empty or ``localhost``. If the authority resolves to a local IP + address then it is discarded; otherwise, on Windows a UNC path is + returned (as before), and on other platforms a + :exc:`~urllib.error.URLError` is raised. + .. function:: getproxies() diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 0f15a2a8a8f6af..f8cae78b909a00 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -1197,6 +1197,25 @@ urllib supporting SHA-256 digest authentication as specified in :rfc:`7616`. (Contributed by Calvin Bui in :gh:`128193`.) +* Improve standards compliance when parsing and emitting ``file:`` URLs. + + In :func:`urllib.request.url2pathname`: + + - Discard URL authorities that resolve to a local IP address. + - Raise :exc:`~urllib.error.URLError` if a URL authority doesn't resolve + to ``localhost``, except on Windows where we return a UNC path. + + In :func:`urllib.request.pathname2url`: + + - Include an empty URL authority when a path begins with a slash. For + example, the path ``/etc/hosts`` is converted to the URL ``///etc/hosts``. + + On Windows, drive letters are no longer converted to uppercase, and ``:`` + characters not following a drive letter no longer cause an :exc:`OSError` + exception to be raised. + + (Contributed by Barney Gale in :gh:`125866`.) + uuid ---- diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index cd28f62ce3baf5..43a5440e0132ff 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -1278,8 +1278,12 @@ def from_uri(cls, uri): """Return a new path from the given 'file' URI.""" if not uri.startswith('file:'): raise ValueError(f"URI does not start with 'file:': {uri!r}") + from urllib.error import URLError from urllib.request import url2pathname - path = cls(url2pathname(uri.removeprefix('file:'))) + try: + path = cls(url2pathname(uri.removeprefix('file:'))) + except URLError as exc: + raise ValueError(exc.reason) from None if not path.is_absolute(): raise ValueError(f"URI is not absolute: {uri!r}") return path diff --git a/Lib/test/test_pathlib/test_pathlib.py b/Lib/test/test_pathlib/test_pathlib.py index b1fcc5f6f0538e..00ec17e21e235f 100644 --- a/Lib/test/test_pathlib/test_pathlib.py +++ b/Lib/test/test_pathlib/test_pathlib.py @@ -3285,10 +3285,14 @@ def test_handling_bad_descriptor(self): def test_from_uri_posix(self): P = self.cls self.assertEqual(P.from_uri('file:/foo/bar'), P('/foo/bar')) - self.assertEqual(P.from_uri('file://foo/bar'), P('//foo/bar')) + self.assertRaises(ValueError, P.from_uri, 'file://foo/bar') self.assertEqual(P.from_uri('file:///foo/bar'), P('/foo/bar')) self.assertEqual(P.from_uri('file:////foo/bar'), P('//foo/bar')) self.assertEqual(P.from_uri('file://localhost/foo/bar'), P('/foo/bar')) + if not is_wasi: + self.assertEqual(P.from_uri('file://127.0.0.1/foo/bar'), P('/foo/bar')) + self.assertEqual(P.from_uri(f'file://{socket.gethostname()}/foo/bar'), + P('/foo/bar')) self.assertRaises(ValueError, P.from_uri, 'foo/bar') self.assertRaises(ValueError, P.from_uri, '/foo/bar') self.assertRaises(ValueError, P.from_uri, '//foo/bar') diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index ed23215c4d0ab7..ecf429e17811a4 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -11,6 +11,7 @@ from test.support import os_helper from test.support import socket_helper import os +import socket try: import ssl except ImportError: @@ -1424,6 +1425,17 @@ def test_quoting(self): "url2pathname() failed; %s != %s" % (expect, result)) + def test_pathname2url(self): + # Test cases common to Windows and POSIX. + fn = urllib.request.pathname2url + sep = os.path.sep + self.assertEqual(fn(''), '') + self.assertEqual(fn(sep), '///') + self.assertEqual(fn('a'), 'a') + self.assertEqual(fn(f'a{sep}b.c'), 'a/b.c') + self.assertEqual(fn(f'{sep}a{sep}b.c'), '///a/b.c') + self.assertEqual(fn(f'{sep}a{sep}b%#c'), '///a/b%25%23c') + @unittest.skipUnless(sys.platform == 'win32', 'test specific to Windows pathnames.') def test_pathname2url_win(self): @@ -1466,12 +1478,9 @@ def test_pathname2url_win(self): 'test specific to POSIX pathnames') def test_pathname2url_posix(self): fn = urllib.request.pathname2url - self.assertEqual(fn('/'), '///') - self.assertEqual(fn('/a/b.c'), '///a/b.c') self.assertEqual(fn('//a/b.c'), '////a/b.c') self.assertEqual(fn('///a/b.c'), '/////a/b.c') self.assertEqual(fn('////a/b.c'), '//////a/b.c') - self.assertEqual(fn('/a/b%#c'), '///a/b%25%23c') @unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII') def test_pathname2url_nonascii(self): @@ -1480,11 +1489,25 @@ def test_pathname2url_nonascii(self): url = urllib.parse.quote(os_helper.FS_NONASCII, encoding=encoding, errors=errors) self.assertEqual(urllib.request.pathname2url(os_helper.FS_NONASCII), url) + def test_url2pathname(self): + # Test cases common to Windows and POSIX. + fn = urllib.request.url2pathname + sep = os.path.sep + self.assertEqual(fn(''), '') + self.assertEqual(fn('/'), f'{sep}') + self.assertEqual(fn('///'), f'{sep}') + self.assertEqual(fn('////'), f'{sep}{sep}') + self.assertEqual(fn('foo'), 'foo') + self.assertEqual(fn('foo/bar'), f'foo{sep}bar') + self.assertEqual(fn('/foo/bar'), f'{sep}foo{sep}bar') + self.assertEqual(fn('//localhost/foo/bar'), f'{sep}foo{sep}bar') + self.assertEqual(fn('///foo/bar'), f'{sep}foo{sep}bar') + self.assertEqual(fn('////foo/bar'), f'{sep}{sep}foo{sep}bar') + @unittest.skipUnless(sys.platform == 'win32', 'test specific to Windows pathnames.') def test_url2pathname_win(self): fn = urllib.request.url2pathname - self.assertEqual(fn('/'), '\\') self.assertEqual(fn('/C:/'), 'C:\\') self.assertEqual(fn("///C|"), 'C:') self.assertEqual(fn("///C:"), 'C:') @@ -1530,11 +1553,13 @@ def test_url2pathname_win(self): 'test specific to POSIX pathnames') def test_url2pathname_posix(self): fn = urllib.request.url2pathname - self.assertEqual(fn('/foo/bar'), '/foo/bar') - self.assertEqual(fn('//foo/bar'), '//foo/bar') - self.assertEqual(fn('///foo/bar'), '/foo/bar') - self.assertEqual(fn('////foo/bar'), '//foo/bar') - self.assertEqual(fn('//localhost/foo/bar'), '/foo/bar') + self.assertRaises(urllib.error.URLError, fn, '//foo/bar') + self.assertRaises(urllib.error.URLError, fn, '//localhost:/foo/bar') + self.assertRaises(urllib.error.URLError, fn, '//:80/foo/bar') + self.assertRaises(urllib.error.URLError, fn, '//:/foo/bar') + self.assertRaises(urllib.error.URLError, fn, '//c:80/foo/bar') + self.assertEqual(fn('//127.0.0.1/foo/bar'), '/foo/bar') + self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar'), '/foo/bar') @unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII') def test_url2pathname_nonascii(self): diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index f22dc56af2f428..84c075ec8b359f 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1450,16 +1450,6 @@ def parse_http_list(s): return [part.strip() for part in res] class FileHandler(BaseHandler): - # Use local file or FTP depending on form of URL - def file_open(self, req): - url = req.selector - if url[:2] == '//' and url[2:3] != '/' and (req.host and - req.host != 'localhost'): - if not req.host in self.get_names(): - raise URLError("file:// scheme is supported only on localhost") - else: - return self.open_local_file(req) - # names for the localhost names = None def get_names(self): @@ -1476,8 +1466,7 @@ def get_names(self): def open_local_file(self, req): import email.utils import mimetypes - host = req.host - filename = req.selector + filename = _splittype(req.full_url)[1] localfile = url2pathname(filename) try: stats = os.stat(localfile) @@ -1487,21 +1476,21 @@ def open_local_file(self, req): headers = email.message_from_string( 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % (mtype or 'text/plain', size, modified)) - if host: - host, port = _splitport(host) - if not host or \ - (not port and _safe_gethostbyname(host) in self.get_names()): - origurl = 'file:' + pathname2url(localfile) - return addinfourl(open(localfile, 'rb'), headers, origurl) + origurl = f'file:{pathname2url(localfile)}' + return addinfourl(open(localfile, 'rb'), headers, origurl) except OSError as exp: raise URLError(exp, exp.filename) - raise URLError('file not on local host') -def _safe_gethostbyname(host): + file_open = open_local_file + +def _is_local_authority(authority): + if not authority or authority == 'localhost': + return True try: - return socket.gethostbyname(host) - except socket.gaierror: - return None + address = socket.gethostbyname(authority) + except (socket.gaierror, AttributeError): + return False + return address in FileHandler().get_names() class FTPHandler(BaseHandler): def ftp_open(self, req): @@ -1649,16 +1638,13 @@ def data_open(self, req): def url2pathname(url): """OS-specific conversion from a relative URL of the 'file' scheme to a file system path; not recommended for general use.""" - if url[:3] == '///': - # Empty authority section, so the path begins on the third character. - url = url[2:] - elif url[:12] == '//localhost/': - # Skip past 'localhost' authority. - url = url[11:] - + authority, url = _splithost(url) if os.name == 'nt': - if url[:3] == '///': - # Skip past extra slash before UNC drive in URL path. + if not _is_local_authority(authority): + # e.g. file://server/share/file.txt + url = '//' + authority + url + elif url[:3] == '///': + # e.g. file://///server/share/file.txt url = url[1:] else: if url[:1] == '/' and url[2:3] in (':', '|'): @@ -1668,6 +1654,8 @@ def url2pathname(url): # Older URLs use a pipe after a drive letter url = url[:1] + ':' + url[2:] url = url.replace('/', '\\') + elif not _is_local_authority(authority): + raise URLError("file:// scheme is supported only on localhost") encoding = sys.getfilesystemencoding() errors = sys.getfilesystemencodeerrors() return unquote(url, encoding=encoding, errors=errors) diff --git a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst new file mode 100644 index 00000000000000..857cc359229daa --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst @@ -0,0 +1,5 @@ +Fix issue where :func:`urllib.request.url2pathname` mishandled file URLs with +authorities. If an authority is present and resolves to ``localhost``, it is +now discarded. If an authority is present but *doesn't* resolve to +``localhost``, then on Windows a UNC path is returned (as before), and on +other platforms a :exc:`urllib.error.URLError` is now raised. _______________________________________________ Python-checkins mailing list -- python-checkins@python.org To unsubscribe send an email to python-checkins-le...@python.org https://mail.python.org/mailman3/lists/python-checkins.python.org/ Member address: arch...@mail-archive.com