https://github.com/python/cpython/commit/66cdb2bd8ab93a4691bead7f5d1e54e5ca6895b4
commit: 66cdb2bd8ab93a4691bead7f5d1e54e5ca6895b4
branch: main
author: Barney Gale <barney.g...@gmail.com>
committer: barneygale <barney.g...@gmail.com>
date: 2025-04-10T19:58:04Z
summary:

GH-123599: `url2pathname()`: handle authority section in file URL (#126844)

In `urllib.request.url2pathname()`, if the authority resolves to the
current host, discard it. If an authority is present but resolves somewhere
else, then on Windows we return a UNC path (as before), and on other
platforms we raise `URLError`.

Affects `pathlib.Path.from_uri()` in the same way.

Co-authored-by: Adam Turner <9087854+aa-tur...@users.noreply.github.com>
Co-authored-by: Bénédikt Tran <10796600+picn...@users.noreply.github.com>

files:
A Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst
M Doc/library/pathlib.rst
M Doc/library/urllib.request.rst
M Doc/whatsnew/3.14.rst
M Lib/pathlib/__init__.py
M Lib/test/test_pathlib/test_pathlib.py
M Lib/test/test_urllib.py
M Lib/urllib/request.py

diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst
index 708a16e6bc8c94..b82986902861b2 100644
--- a/Doc/library/pathlib.rst
+++ b/Doc/library/pathlib.rst
@@ -871,6 +871,12 @@ conforming to :rfc:`8089`.
 
    .. versionadded:: 3.13
 
+   .. versionchanged:: next
+      If a URL authority (e.g. a hostname) is present and resolves to a local
+      address, it is discarded. If an authority is present and *doesn't*
+      resolve to a local address, then on Windows a UNC path is returned (as
+      before), and on other platforms a :exc:`ValueError` is raised.
+
 
 .. method:: Path.as_uri()
 
diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst
index 8b54e10713e782..edfc249eb43c78 100644
--- a/Doc/library/urllib.request.rst
+++ b/Doc/library/urllib.request.rst
@@ -158,16 +158,16 @@ The :mod:`urllib.request` module defines the following 
functions:
       >>> 'file:' + pathname2url(path)
       'file:///C:/Program%20Files'
 
-   .. versionchanged:: 3.14
-      Paths beginning with a slash are converted to URLs with authority
-      sections. For example, the path ``/etc/hosts`` is converted to
-      the URL ``///etc/hosts``.
-
    .. versionchanged:: 3.14
       Windows drive letters are no longer converted to uppercase, and ``:``
       characters not following a drive letter no longer cause an
       :exc:`OSError` exception to be raised on Windows.
 
+   .. versionchanged:: 3.14
+      Paths beginning with a slash are converted to URLs with authority
+      sections. For example, the path ``/etc/hosts`` is converted to
+      the URL ``///etc/hosts``.
+
 
 .. function:: url2pathname(url)
 
@@ -186,6 +186,13 @@ The :mod:`urllib.request` module defines the following 
functions:
       characters not following a drive letter no longer cause an
       :exc:`OSError` exception to be raised on Windows.
 
+   .. versionchanged:: next
+      This function calls :func:`socket.gethostbyname` if the URL authority
+      isn't empty or ``localhost``. If the authority resolves to a local IP
+      address then it is discarded; otherwise, on Windows a UNC path is
+      returned (as before), and on other platforms a
+      :exc:`~urllib.error.URLError` is raised.
+
 
 .. function:: getproxies()
 
diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst
index 0f15a2a8a8f6af..f8cae78b909a00 100644
--- a/Doc/whatsnew/3.14.rst
+++ b/Doc/whatsnew/3.14.rst
@@ -1197,6 +1197,25 @@ urllib
   supporting SHA-256 digest authentication as specified in :rfc:`7616`.
   (Contributed by Calvin Bui in :gh:`128193`.)
 
+* Improve standards compliance when parsing and emitting ``file:`` URLs.
+
+  In :func:`urllib.request.url2pathname`:
+
+  - Discard URL authorities that resolve to a local IP address.
+  - Raise :exc:`~urllib.error.URLError` if a URL authority doesn't resolve
+    to ``localhost``, except on Windows where we return a UNC path.
+
+  In :func:`urllib.request.pathname2url`:
+
+  - Include an empty URL authority when a path begins with a slash. For
+    example, the path ``/etc/hosts`` is converted to the URL ``///etc/hosts``.
+
+  On Windows, drive letters are no longer converted to uppercase, and ``:``
+  characters not following a drive letter no longer cause an :exc:`OSError`
+  exception to be raised.
+
+  (Contributed by Barney Gale in :gh:`125866`.)
+
 
 uuid
 ----
diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py
index cd28f62ce3baf5..43a5440e0132ff 100644
--- a/Lib/pathlib/__init__.py
+++ b/Lib/pathlib/__init__.py
@@ -1278,8 +1278,12 @@ def from_uri(cls, uri):
         """Return a new path from the given 'file' URI."""
         if not uri.startswith('file:'):
             raise ValueError(f"URI does not start with 'file:': {uri!r}")
+        from urllib.error import URLError
         from urllib.request import url2pathname
-        path = cls(url2pathname(uri.removeprefix('file:')))
+        try:
+            path = cls(url2pathname(uri.removeprefix('file:')))
+        except URLError as exc:
+            raise ValueError(exc.reason) from None
         if not path.is_absolute():
             raise ValueError(f"URI is not absolute: {uri!r}")
         return path
diff --git a/Lib/test/test_pathlib/test_pathlib.py 
b/Lib/test/test_pathlib/test_pathlib.py
index b1fcc5f6f0538e..00ec17e21e235f 100644
--- a/Lib/test/test_pathlib/test_pathlib.py
+++ b/Lib/test/test_pathlib/test_pathlib.py
@@ -3285,10 +3285,14 @@ def test_handling_bad_descriptor(self):
     def test_from_uri_posix(self):
         P = self.cls
         self.assertEqual(P.from_uri('file:/foo/bar'), P('/foo/bar'))
-        self.assertEqual(P.from_uri('file://foo/bar'), P('//foo/bar'))
+        self.assertRaises(ValueError, P.from_uri, 'file://foo/bar')
         self.assertEqual(P.from_uri('file:///foo/bar'), P('/foo/bar'))
         self.assertEqual(P.from_uri('file:////foo/bar'), P('//foo/bar'))
         self.assertEqual(P.from_uri('file://localhost/foo/bar'), P('/foo/bar'))
+        if not is_wasi:
+            self.assertEqual(P.from_uri('file://127.0.0.1/foo/bar'), 
P('/foo/bar'))
+            
self.assertEqual(P.from_uri(f'file://{socket.gethostname()}/foo/bar'),
+                             P('/foo/bar'))
         self.assertRaises(ValueError, P.from_uri, 'foo/bar')
         self.assertRaises(ValueError, P.from_uri, '/foo/bar')
         self.assertRaises(ValueError, P.from_uri, '//foo/bar')
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
index ed23215c4d0ab7..ecf429e17811a4 100644
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -11,6 +11,7 @@
 from test.support import os_helper
 from test.support import socket_helper
 import os
+import socket
 try:
     import ssl
 except ImportError:
@@ -1424,6 +1425,17 @@ def test_quoting(self):
                          "url2pathname() failed; %s != %s" %
                          (expect, result))
 
+    def test_pathname2url(self):
+        # Test cases common to Windows and POSIX.
+        fn = urllib.request.pathname2url
+        sep = os.path.sep
+        self.assertEqual(fn(''), '')
+        self.assertEqual(fn(sep), '///')
+        self.assertEqual(fn('a'), 'a')
+        self.assertEqual(fn(f'a{sep}b.c'), 'a/b.c')
+        self.assertEqual(fn(f'{sep}a{sep}b.c'), '///a/b.c')
+        self.assertEqual(fn(f'{sep}a{sep}b%#c'), '///a/b%25%23c')
+
     @unittest.skipUnless(sys.platform == 'win32',
                          'test specific to Windows pathnames.')
     def test_pathname2url_win(self):
@@ -1466,12 +1478,9 @@ def test_pathname2url_win(self):
                      'test specific to POSIX pathnames')
     def test_pathname2url_posix(self):
         fn = urllib.request.pathname2url
-        self.assertEqual(fn('/'), '///')
-        self.assertEqual(fn('/a/b.c'), '///a/b.c')
         self.assertEqual(fn('//a/b.c'), '////a/b.c')
         self.assertEqual(fn('///a/b.c'), '/////a/b.c')
         self.assertEqual(fn('////a/b.c'), '//////a/b.c')
-        self.assertEqual(fn('/a/b%#c'), '///a/b%25%23c')
 
     @unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII')
     def test_pathname2url_nonascii(self):
@@ -1480,11 +1489,25 @@ def test_pathname2url_nonascii(self):
         url = urllib.parse.quote(os_helper.FS_NONASCII, encoding=encoding, 
errors=errors)
         self.assertEqual(urllib.request.pathname2url(os_helper.FS_NONASCII), 
url)
 
+    def test_url2pathname(self):
+        # Test cases common to Windows and POSIX.
+        fn = urllib.request.url2pathname
+        sep = os.path.sep
+        self.assertEqual(fn(''), '')
+        self.assertEqual(fn('/'), f'{sep}')
+        self.assertEqual(fn('///'), f'{sep}')
+        self.assertEqual(fn('////'), f'{sep}{sep}')
+        self.assertEqual(fn('foo'), 'foo')
+        self.assertEqual(fn('foo/bar'), f'foo{sep}bar')
+        self.assertEqual(fn('/foo/bar'), f'{sep}foo{sep}bar')
+        self.assertEqual(fn('//localhost/foo/bar'), f'{sep}foo{sep}bar')
+        self.assertEqual(fn('///foo/bar'), f'{sep}foo{sep}bar')
+        self.assertEqual(fn('////foo/bar'), f'{sep}{sep}foo{sep}bar')
+
     @unittest.skipUnless(sys.platform == 'win32',
                          'test specific to Windows pathnames.')
     def test_url2pathname_win(self):
         fn = urllib.request.url2pathname
-        self.assertEqual(fn('/'), '\\')
         self.assertEqual(fn('/C:/'), 'C:\\')
         self.assertEqual(fn("///C|"), 'C:')
         self.assertEqual(fn("///C:"), 'C:')
@@ -1530,11 +1553,13 @@ def test_url2pathname_win(self):
                      'test specific to POSIX pathnames')
     def test_url2pathname_posix(self):
         fn = urllib.request.url2pathname
-        self.assertEqual(fn('/foo/bar'), '/foo/bar')
-        self.assertEqual(fn('//foo/bar'), '//foo/bar')
-        self.assertEqual(fn('///foo/bar'), '/foo/bar')
-        self.assertEqual(fn('////foo/bar'), '//foo/bar')
-        self.assertEqual(fn('//localhost/foo/bar'), '/foo/bar')
+        self.assertRaises(urllib.error.URLError, fn, '//foo/bar')
+        self.assertRaises(urllib.error.URLError, fn, '//localhost:/foo/bar')
+        self.assertRaises(urllib.error.URLError, fn, '//:80/foo/bar')
+        self.assertRaises(urllib.error.URLError, fn, '//:/foo/bar')
+        self.assertRaises(urllib.error.URLError, fn, '//c:80/foo/bar')
+        self.assertEqual(fn('//127.0.0.1/foo/bar'), '/foo/bar')
+        self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar'), '/foo/bar')
 
     @unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII')
     def test_url2pathname_nonascii(self):
diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py
index f22dc56af2f428..84c075ec8b359f 100644
--- a/Lib/urllib/request.py
+++ b/Lib/urllib/request.py
@@ -1450,16 +1450,6 @@ def parse_http_list(s):
     return [part.strip() for part in res]
 
 class FileHandler(BaseHandler):
-    # Use local file or FTP depending on form of URL
-    def file_open(self, req):
-        url = req.selector
-        if url[:2] == '//' and url[2:3] != '/' and (req.host and
-                req.host != 'localhost'):
-            if not req.host in self.get_names():
-                raise URLError("file:// scheme is supported only on localhost")
-        else:
-            return self.open_local_file(req)
-
     # names for the localhost
     names = None
     def get_names(self):
@@ -1476,8 +1466,7 @@ def get_names(self):
     def open_local_file(self, req):
         import email.utils
         import mimetypes
-        host = req.host
-        filename = req.selector
+        filename = _splittype(req.full_url)[1]
         localfile = url2pathname(filename)
         try:
             stats = os.stat(localfile)
@@ -1487,21 +1476,21 @@ def open_local_file(self, req):
             headers = email.message_from_string(
                 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                 (mtype or 'text/plain', size, modified))
-            if host:
-                host, port = _splitport(host)
-            if not host or \
-                (not port and _safe_gethostbyname(host) in self.get_names()):
-                origurl = 'file:' + pathname2url(localfile)
-                return addinfourl(open(localfile, 'rb'), headers, origurl)
+            origurl = f'file:{pathname2url(localfile)}'
+            return addinfourl(open(localfile, 'rb'), headers, origurl)
         except OSError as exp:
             raise URLError(exp, exp.filename)
-        raise URLError('file not on local host')
 
-def _safe_gethostbyname(host):
+    file_open = open_local_file
+
+def _is_local_authority(authority):
+    if not authority or authority == 'localhost':
+        return True
     try:
-        return socket.gethostbyname(host)
-    except socket.gaierror:
-        return None
+        address = socket.gethostbyname(authority)
+    except (socket.gaierror, AttributeError):
+        return False
+    return address in FileHandler().get_names()
 
 class FTPHandler(BaseHandler):
     def ftp_open(self, req):
@@ -1649,16 +1638,13 @@ def data_open(self, req):
 def url2pathname(url):
     """OS-specific conversion from a relative URL of the 'file' scheme
     to a file system path; not recommended for general use."""
-    if url[:3] == '///':
-        # Empty authority section, so the path begins on the third character.
-        url = url[2:]
-    elif url[:12] == '//localhost/':
-        # Skip past 'localhost' authority.
-        url = url[11:]
-
+    authority, url = _splithost(url)
     if os.name == 'nt':
-        if url[:3] == '///':
-            # Skip past extra slash before UNC drive in URL path.
+        if not _is_local_authority(authority):
+            # e.g. file://server/share/file.txt
+            url = '//' + authority + url
+        elif url[:3] == '///':
+            # e.g. file://///server/share/file.txt
             url = url[1:]
         else:
             if url[:1] == '/' and url[2:3] in (':', '|'):
@@ -1668,6 +1654,8 @@ def url2pathname(url):
                 # Older URLs use a pipe after a drive letter
                 url = url[:1] + ':' + url[2:]
         url = url.replace('/', '\\')
+    elif not _is_local_authority(authority):
+        raise URLError("file:// scheme is supported only on localhost")
     encoding = sys.getfilesystemencoding()
     errors = sys.getfilesystemencodeerrors()
     return unquote(url, encoding=encoding, errors=errors)
diff --git 
a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst 
b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst
new file mode 100644
index 00000000000000..857cc359229daa
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst
@@ -0,0 +1,5 @@
+Fix issue where :func:`urllib.request.url2pathname` mishandled file URLs with
+authorities. If an authority is present and resolves to ``localhost``, it is
+now discarded. If an authority is present but *doesn't* resolve to
+``localhost``, then on Windows a UNC path is returned (as before), and on
+other platforms a :exc:`urllib.error.URLError` is now raised.

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: arch...@mail-archive.com

Reply via email to