https://github.com/python/cpython/commit/9654daf793b534b44a831c80f43505ab9e380f1f
commit: 9654daf793b534b44a831c80f43505ab9e380f1f
branch: main
author: Serhiy Storchaka <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2024-03-26T13:26:45+02:00
summary:

gh-66543: Fix mimetypes.guess_type() (GH-117217)

Fix parsing of the following corner cases:

* URLs with only a host name
* URLs containing a fragment
* URLs containing a query
* filenames with only a UNC sharepoint on Windows

Co-authored-by: Dong-hee Na <[email protected]>

files:
A Misc/NEWS.d/next/Library/2019-08-27-01-03-26.gh-issue-66543._TRpYr.rst
M Lib/mimetypes.py
M Lib/test/test_mimetypes.py
M Lib/test/test_urllib2.py

diff --git a/Lib/mimetypes.py b/Lib/mimetypes.py
index 51b99701c9d727..b33051f5331514 100644
--- a/Lib/mimetypes.py
+++ b/Lib/mimetypes.py
@@ -120,7 +120,13 @@ def guess_type(self, url, strict=True):
         but non-standard types.
         """
         url = os.fspath(url)
-        scheme, url = urllib.parse._splittype(url)
+        p = urllib.parse.urlparse(url)
+        if p.scheme and len(p.scheme) > 1:
+            scheme = p.scheme
+            url = p.path
+        else:
+            scheme = None
+            url = os.path.splitdrive(url)[1]
         if scheme == 'data':
             # syntax of data URLs:
             # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
diff --git a/Lib/test/test_mimetypes.py b/Lib/test/test_mimetypes.py
index 01bba0ac2eed5a..cc9bae893bb55a 100644
--- a/Lib/test/test_mimetypes.py
+++ b/Lib/test/test_mimetypes.py
@@ -1,5 +1,6 @@
 import io
 import mimetypes
+import os
 import pathlib
 import sys
 import unittest.mock
@@ -109,15 +110,40 @@ def test_filename_with_url_delimiters(self):
         # compared to when interpreted as filename because of the semicolon.
         eq = self.assertEqual
         gzip_expected = ('application/x-tar', 'gzip')
-        eq(self.db.guess_type(";1.tar.gz"), gzip_expected)
-        eq(self.db.guess_type("?1.tar.gz"), gzip_expected)
-        eq(self.db.guess_type("#1.tar.gz"), gzip_expected)
-        eq(self.db.guess_type("#1#.tar.gz"), gzip_expected)
-        eq(self.db.guess_type(";1#.tar.gz"), gzip_expected)
-        eq(self.db.guess_type(";&1=123;?.tar.gz"), gzip_expected)
-        eq(self.db.guess_type("?k1=v1&k2=v2.tar.gz"), gzip_expected)
+        for name in (
+                ';1.tar.gz',
+                '?1.tar.gz',
+                '#1.tar.gz',
+                '#1#.tar.gz',
+                ';1#.tar.gz',
+                ';&1=123;?.tar.gz',
+                '?k1=v1&k2=v2.tar.gz',
+            ):
+            for prefix in ('', '/', '\\',
+                           'c:', 'c:/', 'c:\\', 'c:/d/', 'c:\\d\\',
+                           '//share/server/', '\\\\share\\server\\'):
+                path = prefix + name
+                with self.subTest(path=path):
+                    eq(self.db.guess_type(path), gzip_expected)
+            expected = (None, None) if os.name == 'nt' else gzip_expected
+            for prefix in ('//', '\\\\', '//share/', '\\\\share\\'):
+                path = prefix + name
+                with self.subTest(path=path):
+                    eq(self.db.guess_type(path), expected)
         eq(self.db.guess_type(r" \"\`;b&b&c |.tar.gz"), gzip_expected)
 
+    def test_url(self):
+        result = self.db.guess_type('http://host.html')
+        msg = 'URL only has a host name, not a file'
+        self.assertSequenceEqual(result, (None, None), msg)
+        result = self.db.guess_type('http://example.com/host.html')
+        msg = 'Should be text/html'
+        self.assertSequenceEqual(result, ('text/html', None), msg)
+        result = self.db.guess_type('http://example.com/host.html#x.tar')
+        self.assertSequenceEqual(result, ('text/html', None))
+        result = self.db.guess_type('http://example.com/host.html?q=x.tar')
+        self.assertSequenceEqual(result, ('text/html', None))
+
     def test_guess_all_types(self):
         # First try strict.  Use a set here for testing the results because if
         # test_urllib2 is run before test_mimetypes, global state is modified
diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py
index 739c15df13de21..6febb491788b42 100644
--- a/Lib/test/test_urllib2.py
+++ b/Lib/test/test_urllib2.py
@@ -777,7 +777,7 @@ def connect_ftp(self, user, passwd, host, port, dirs,
              ["foo", "bar"], "", None),
             ("ftp://localhost/baz.gif;type=a",
              "localhost", ftplib.FTP_PORT, "", "", "A",
-             [], "baz.gif", None),  # XXX really this should guess image/gif
+             [], "baz.gif", "image/gif"),
             ]:
             req = Request(url)
             req.timeout = None
diff --git a/Misc/NEWS.d/next/Library/2019-08-27-01-03-26.gh-issue-66543._TRpYr.rst b/Misc/NEWS.d/next/Library/2019-08-27-01-03-26.gh-issue-66543._TRpYr.rst
new file mode 100644
index 00000000000000..62f7aa2490bb73
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-08-27-01-03-26.gh-issue-66543._TRpYr.rst
@@ -0,0 +1,4 @@
+Make :func:`mimetypes.guess_type` properly parse URLs with only a host
+name, URLs containing fragment or query, and filenames with only a UNC
+sharepoint on Windows.
+Based on patch by Dong-hee Na.

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]

Reply via email to