https://github.com/python/cpython/commit/dbb6e22cb1f533bba00a61a5b63ec68af9d48836
commit: dbb6e22cb1f533bba00a61a5b63ec68af9d48836
branch: main
author: Serhiy Storchaka <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2024-11-07T09:09:59+02:00
summary:

gh-125926: Fix urllib.parse.urljoin() for base URI with undefined authority (GH-125989)

Although this goes beyond the application of RFC 3986, urljoin()
should support relative base URIs for backward compatibility.

files:
A Misc/NEWS.d/next/Library/2024-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst
M Lib/test/test_urlparse.py
M Lib/urllib/parse.py

diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
index d49e4388696ab4..297fb4831c16bf 100644
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -623,6 +623,78 @@ def test_urljoins(self):
         self.checkJoin(RFC1808_BASE, 'https:;', 'https:;')
         self.checkJoin(RFC1808_BASE, 'https:;x', 'https:;x')
 
+    def test_urljoins_relative_base(self):
+        # According to RFC 3986, Section 5.1, a base URI must conform to
+        # the absolute-URI syntax rule (Section 4.3). But urljoin() lacks
+        # a context to establish missing components of the relative base URI.
+        # It still has to return a sensible result for backwards compatibility.
+        # The following tests are figments of the imagination and artifacts
+        # of the current implementation that are not based on any standard.
+        self.checkJoin('', '', '')
+        self.checkJoin('', '//', '//', relroundtrip=False)
+        self.checkJoin('', '//v', '//v')
+        self.checkJoin('', '//v/w', '//v/w')
+        self.checkJoin('', '/w', '/w')
+        self.checkJoin('', '///w', '///w', relroundtrip=False)
+        self.checkJoin('', 'w', 'w')
+
+        self.checkJoin('//', '', '//')
+        self.checkJoin('//', '//', '//')
+        self.checkJoin('//', '//v', '//v')
+        self.checkJoin('//', '//v/w', '//v/w')
+        self.checkJoin('//', '/w', '///w')
+        self.checkJoin('//', '///w', '///w')
+        self.checkJoin('//', 'w', '///w')
+
+        self.checkJoin('//a', '', '//a')
+        self.checkJoin('//a', '//', '//a')
+        self.checkJoin('//a', '//v', '//v')
+        self.checkJoin('//a', '//v/w', '//v/w')
+        self.checkJoin('//a', '/w', '//a/w')
+        self.checkJoin('//a', '///w', '//a/w')
+        self.checkJoin('//a', 'w', '//a/w')
+
+        for scheme in '', 'http:':
+            self.checkJoin('http:', scheme + '', 'http:')
+            self.checkJoin('http:', scheme + '//', 'http:')
+            self.checkJoin('http:', scheme + '//v', 'http://v')
+            self.checkJoin('http:', scheme + '//v/w', 'http://v/w')
+            self.checkJoin('http:', scheme + '/w', 'http:/w')
+            self.checkJoin('http:', scheme + '///w', 'http:/w')
+            self.checkJoin('http:', scheme + 'w', 'http:/w')
+
+            self.checkJoin('http://', scheme + '', 'http://')
+            self.checkJoin('http://', scheme + '//', 'http://')
+            self.checkJoin('http://', scheme + '//v', 'http://v')
+            self.checkJoin('http://', scheme + '//v/w', 'http://v/w')
+            self.checkJoin('http://', scheme + '/w', 'http:///w')
+            self.checkJoin('http://', scheme + '///w', 'http:///w')
+            self.checkJoin('http://', scheme + 'w', 'http:///w')
+
+            self.checkJoin('http://a', scheme + '', 'http://a')
+            self.checkJoin('http://a', scheme + '//', 'http://a')
+            self.checkJoin('http://a', scheme + '//v', 'http://v')
+            self.checkJoin('http://a', scheme + '//v/w', 'http://v/w')
+            self.checkJoin('http://a', scheme + '/w', 'http://a/w')
+            self.checkJoin('http://a', scheme + '///w', 'http://a/w')
+            self.checkJoin('http://a', scheme + 'w', 'http://a/w')
+
+        self.checkJoin('/b/c', '', '/b/c')
+        self.checkJoin('/b/c', '//', '/b/c')
+        self.checkJoin('/b/c', '//v', '//v')
+        self.checkJoin('/b/c', '//v/w', '//v/w')
+        self.checkJoin('/b/c', '/w', '/w')
+        self.checkJoin('/b/c', '///w', '/w')
+        self.checkJoin('/b/c', 'w', '/b/w')
+
+        self.checkJoin('///b/c', '', '///b/c')
+        self.checkJoin('///b/c', '//', '///b/c')
+        self.checkJoin('///b/c', '//v', '//v')
+        self.checkJoin('///b/c', '//v/w', '//v/w')
+        self.checkJoin('///b/c', '/w', '///w')
+        self.checkJoin('///b/c', '///w', '///w')
+        self.checkJoin('///b/c', 'w', '///b/w')
+
     def test_RFC2732(self):
         str_cases = [
             ('http://Test.python.org:5432/foo/', 'test.python.org', 5432),
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 5b00ab25c6b4ca..a721d777c82f82 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -577,9 +577,9 @@ def urljoin(base, url, allow_fragments=True):
 
     if scheme is None:
         scheme = bscheme
-    if scheme != bscheme or scheme not in uses_relative:
+    if scheme != bscheme or (scheme and scheme not in uses_relative):
         return _coerce_result(url)
-    if scheme in uses_netloc:
+    if not scheme or scheme in uses_netloc:
         if netloc:
             return _coerce_result(_urlunsplit(scheme, netloc, path,
                                               query, fragment))
diff --git a/Misc/NEWS.d/next/Library/2024-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst b/Misc/NEWS.d/next/Library/2024-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst
new file mode 100644
index 00000000000000..7f98bcdc38e566
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-10-25-20-52-15.gh-issue-125926.pp8rtZ.rst
@@ -0,0 +1,4 @@
+Fix :func:`urllib.parse.urljoin` for base URI with undefined authority.
+Although :rfc:`3986` only specifies reference resolution for absolute base
+URIs, :func:`!urljoin` should continue to return a sensible result for relative
+base URIs.

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]

Reply via email to