https://github.com/python/cpython/commit/cb7ef18d70a0bc1363799e2dfa475db457155e43
commit: cb7ef18d70a0bc1363799e2dfa475db457155e43
branch: main
author: Serhiy Storchaka <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2025-09-05T18:58:42+03:00
summary:
gh-88375, gh-111788: Fix parsing errors and normalization in robotparser (GH-138502)
* Don't fail trying to parse weird patterns.
* Don't fail trying to decode non-UTF-8 "robots.txt" files.
* No longer ignore trailing "?" in patterns and URLs.
* Distinguish raw special characters "?", "=" and "&" from the
percent-encoded ones.
* Remove tests that do nothing.
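
For illustration, a minimal sketch of the changed matching behavior (the agent name is arbitrary, and the snippet assumes a build with this patch applied):

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([
        'User-agent: *',
        'Disallow: /another/path?',
    ])

    agent = 'mybot'  # arbitrary agent name
    # Before this change the trailing "?" was dropped during normalization,
    # so the bare path "/another/path" was disallowed as well; now the rule
    # only matches URLs that contain a raw "?".
    print(parser.can_fetch(agent, '/another/path'))     # True
    print(parser.can_fetch(agent, '/another/path?'))    # False
    print(parser.can_fetch(agent, '/another/path%3F'))  # True: "%3F" is not a raw "?"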
files:
A Misc/NEWS.d/next/Library/2025-09-04-15-18-11.gh-issue-111788.tuTEM5.rst
A Misc/NEWS.d/next/Library/2025-09-05-15-35-59.gh-issue-88375.dC491a.rst
M Lib/test/test_robotparser.py
M Lib/urllib/robotparser.py
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 8d89e2a8224452..e33723cc70c877 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -16,6 +16,14 @@ class BaseRobotTest:
bad = []
site_maps = None
+ def __init_subclass__(cls):
+ super().__init_subclass__()
+ # Remove tests that do nothing.
+ if not cls.good:
+ cls.test_good_urls = None
+ if not cls.bad:
+ cls.test_bad_urls = None
+
def setUp(self):
lines = io.StringIO(self.robots_txt).readlines()
self.parser = urllib.robotparser.RobotFileParser()
@@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
+Disallow: /another/path?
+Disallow: /yet/one/path?name=value&more
"""
- good = ['/some/path']
- bad = ['/some/path?name=value']
+ good = ['/some/path', '/some/path?',
+ '/some/path%3Fname=value', '/some/path?name%3Dvalue',
+ '/another/path', '/another/path%3F',
+ '/yet/one/path?name=value%26more']
+ bad = ['/some/path?name=value',
+ '/another/path?', '/another/path?name=value',
+ '/yet/one/path?name=value&more']
class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
@@ -249,15 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
bad = ['/some/path']
-class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
- # normalize the URL first (#17403)
+class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
-Allow: /some/path?
-Disallow: /another/path?
- """
- good = ['/some/path?']
- bad = ['/another/path?']
+Disallow: /a1/Z-._~ # unreserved characters
+Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
+Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
+Disallow: /u2/%f0%9f%90%8d
+Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
+Disallow: /v1/%F0 # percent-encoded non-ASCII octet
+Disallow: /v2/%f0
+Disallow: /v3/\udcf0 # raw non-ASCII octet
+Disallow: /p1%xy # raw percent
+Disallow: /p2%
+Disallow: /p3%25xy # percent-encoded percent
+Disallow: /p4%2525xy # double percent-encoded percent
+Disallow: /john%20smith # space
+Disallow: /john doe
+Disallow: /trailingspace%20
+Disallow: /question%3Fq=v # not query
+Disallow: /hash%23f # not fragment
+Disallow: /dollar%24
+Disallow: /asterisk%2A
+Disallow: /sub/dir
+Disallow: /slash%2F
+Disallow: /query/question?q=%3F
+Disallow: /query/raw/question?q=?
+Disallow: /query/eq?q%3Dv
+Disallow: /query/amp?q=v%26a
+"""
+ good = [
+ '/u1/%F0', '/u1/%f0',
+ '/u2/%F0', '/u2/%f0',
+ '/u3/%F0', '/u3/%f0',
+ '/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
+ '/question?q=v',
+ '/dollar', '/asterisk',
+ '/query/eq?q=v',
+ '/query/amp?q=v&a',
+ ]
+ bad = [
+ '/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
+ '/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
+ '/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
+ '/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
+ '/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
+ '/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
+ '/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
+ '/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
+ '/p1%xy', '/p1%25xy',
+ '/p2%', '/p2%25', '/p2%2525', '/p2%xy',
+ '/p3%xy', '/p3%25xy',
+ '/p4%2525xy',
+ '/john%20smith', '/john smith',
+ '/john%20doe', '/john doe',
+ '/trailingspace%20', '/trailingspace ',
+ '/question%3Fq=v',
+ '/hash#f', '/hash%23f',
+ '/dollar$', '/dollar%24',
+ '/asterisk*', '/asterisk%2A',
+ '/sub/dir', '/sub%2Fdir',
+ '/slash%2F', '/slash/',
+ '/query/question?q=?', '/query/question?q=%3F',
+ '/query/raw/question?q=?', '/query/raw/question?q=%3F',
+ '/query/eq?q%3Dv',
+ '/query/amp?q=v%26a',
+ ]
+ # other reserved characters
+ for c in ":/#[]@!$&'()*+,;=":
+ robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
+ bad.append(f'/raw{c}')
+ bad.append(f'/raw%{ord(c):02X}')
+ bad.append(f'/pc{c}')
+ bad.append(f'/pc%{ord(c):02X}')
class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
@@ -299,26 +378,17 @@ def test_string_formatting(self):
self.assertEqual(str(self.parser), self.expected_output)
-class RobotHandler(BaseHTTPRequestHandler):
-
- def do_GET(self):
- self.send_error(403, "Forbidden access")
-
- def log_message(self, format, *args):
- pass
-
-
@unittest.skipUnless(
support.has_socket_support,
"Socket server requires working socket."
)
-class PasswordProtectedSiteTestCase(unittest.TestCase):
+class BaseLocalNetworkTestCase:
def setUp(self):
# clear _opener global variable
self.addCleanup(urllib.request.urlcleanup)
- self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
+ self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
self.t = threading.Thread(
name='HTTPServer serving',
@@ -335,6 +405,57 @@ def tearDown(self):
self.t.join()
self.server.server_close()
+
+SAMPLE_ROBOTS_TXT = b'''\
+User-agent: test_robotparser
+Disallow: /utf8/\xf0\x9f\x90\x8d
+Disallow: /non-utf8/\xf0
+Disallow: //[spam]/path
+'''
+
+
+class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+ class RobotHandler(BaseHTTPRequestHandler):
+
+ def do_GET(self):
+ self.send_response(200)
+ self.end_headers()
+ self.wfile.write(SAMPLE_ROBOTS_TXT)
+
+ def log_message(self, format, *args):
+ pass
+
+ @threading_helper.reap_threads
+ def testRead(self):
+ # Test that reading a weird robots.txt doesn't fail.
+ addr = self.server.server_address
+ url = f'http://{socket_helper.HOST}:{addr[1]}'
+ robots_url = url + '/robots.txt'
+ parser = urllib.robotparser.RobotFileParser()
+ parser.set_url(robots_url)
+ parser.read()
+ # And it can even interpret the weird paths in some reasonable way.
+ agent = 'test_robotparser'
+ self.assertTrue(parser.can_fetch(agent, robots_url))
+ self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
+ self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+ self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
+ self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+ self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
+ self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
+ self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
+ self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
+
+
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase,
+                                    unittest.TestCase):
+ class RobotHandler(BaseHTTPRequestHandler):
+
+ def do_GET(self):
+ self.send_error(403, "Forbidden access")
+
+ def log_message(self, format, *args):
+ pass
+
@threading_helper.reap_threads
def testPasswordProtectedSite(self):
addr = self.server.server_address
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 409f2b2e48de6e..4009fd6b58f594 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -11,6 +11,7 @@
"""
import collections
+import re
import urllib.error
import urllib.parse
import urllib.request
@@ -20,6 +21,19 @@
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
+def normalize(path):
+ unquoted = urllib.parse.unquote(path, errors='surrogateescape')
+ return urllib.parse.quote(unquoted, errors='surrogateescape')
+
+def normalize_path(path):
+ path, sep, query = path.partition('?')
+ path = normalize(path)
+ if sep:
+ query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
+ path += '?' + query
+ return path
+
+
class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.
@@ -55,7 +69,7 @@ def modified(self):
def set_url(self, url):
"""Sets the URL referring to a robots.txt file."""
self.url = url
- self.host, self.path = urllib.parse.urlparse(url)[1:3]
+ self.host, self.path = urllib.parse.urlsplit(url)[1:3]
def read(self):
"""Reads the robots.txt URL and feeds it to the parser."""
@@ -69,7 +83,7 @@ def read(self):
err.close()
else:
raw = f.read()
- self.parse(raw.decode("utf-8").splitlines())
+ self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
def _add_entry(self, entry):
if "*" in entry.useragents:
@@ -113,7 +127,7 @@ def parse(self, lines):
line = line.split(':', 1)
if len(line) == 2:
line[0] = line[0].strip().lower()
- line[1] = urllib.parse.unquote(line[1].strip())
+ line[1] = line[1].strip()
if line[0] == "user-agent":
if state == 2:
self._add_entry(entry)
@@ -167,10 +181,11 @@ def can_fetch(self, useragent, url):
return False
# search for given user agent matches
# the first match counts
- parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
- url = urllib.parse.urlunparse(('','',parsed_url.path,
- parsed_url.params,parsed_url.query, parsed_url.fragment))
- url = urllib.parse.quote(url)
+ # TODO: The private API is used in order to preserve an empty query.
+ # This is temporary until the public API starts supporting this feature.
+ parsed_url = urllib.parse._urlsplit(url, '')
+ url = urllib.parse._urlunsplit(None, None, *parsed_url[2:])
+ url = normalize_path(url)
if not url:
url = "/"
for entry in self.entries:
@@ -213,7 +228,6 @@ def __str__(self):
entries = entries + [self.default_entry]
return '\n\n'.join(map(str, entries))
-
class RuleLine:
"""A rule line is a single "Allow:" (allowance==True) or "Disallow:"
(allowance==False) followed by a path."""
@@ -221,8 +235,7 @@ def __init__(self, path, allowance):
if path == '' and not allowance:
# an empty value means allow all
allowance = True
- path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
- self.path = urllib.parse.quote(path)
+ self.path = normalize_path(path)
self.allowance = allowance
def applies_to(self, filename):
@@ -268,7 +281,7 @@ def applies_to(self, useragent):
def allowance(self, filename):
"""Preconditions:
- our agent applies to this entry
- - filename is URL decoded"""
+ - filename is URL encoded"""
for line in self.rulelines:
if line.applies_to(filename):
return line.allowance
diff --git a/Misc/NEWS.d/next/Library/2025-09-04-15-18-11.gh-issue-111788.tuTEM5.rst b/Misc/NEWS.d/next/Library/2025-09-04-15-18-11.gh-issue-111788.tuTEM5.rst
new file mode 100644
index 00000000000000..a86e71e052d9f1
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-04-15-18-11.gh-issue-111788.tuTEM5.rst
@@ -0,0 +1,3 @@
+Fix parsing errors in the :mod:`urllib.robotparser` module.
+Don't fail trying to parse weird paths.
+Don't fail trying to decode non-UTF-8 ``robots.txt`` files.
diff --git a/Misc/NEWS.d/next/Library/2025-09-05-15-35-59.gh-issue-88375.dC491a.rst b/Misc/NEWS.d/next/Library/2025-09-05-15-35-59.gh-issue-88375.dC491a.rst
new file mode 100644
index 00000000000000..1660f39ddb1d3c
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-05-15-35-59.gh-issue-88375.dC491a.rst
@@ -0,0 +1,4 @@
+Fix normalization of the ``robots.txt`` rules and URLs in the
+:mod:`urllib.robotparser` module. No longer ignore trailing ``?``.
+Distinguish raw special characters ``?``, ``=`` and ``&`` from the
+percent-encoded ones.
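
As a rough standalone sketch, the normalization described in this entry amounts to the following (it mirrors the new private helpers added to Lib/urllib/robotparser.py above; it is not a public API):

    import re
    import urllib.parse

    def normalize(part):
        # Round-trip through unquote/quote so equivalent encodings compare
        # equal; surrogateescape preserves undecodable octets.
        unquoted = urllib.parse.unquote(part, errors='surrogateescape')
        return urllib.parse.quote(unquoted, errors='surrogateescape')

    def normalize_path(path):
        # Split the query off at the first raw "?", so a raw "?" keeps its
        # meaning while a percent-encoded "%3F" stays part of the path.
        path, sep, query = path.partition('?')
        path = normalize(path)
        if sep:
            # Normalize each chunk between the raw "=" and "&" separators,
            # leaving the raw separators themselves untouched.
            query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
            path += '?' + query
        return path

    print(normalize_path('/some/path?'))              # /some/path? (trailing "?" kept)
    print(normalize_path('/some/path%3Fname=value'))  # /some/path%3Fname%3Dvalue
    print(normalize_path('/some/path?name%3Dvalue'))  # /some/path?name%3Dvalue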
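And for the decoding fix in the gh-111788 entry, a quick demonstration that an invalid UTF-8 octet in robots.txt no longer breaks parsing (hypothetical data; again assumes the patched module):

    import urllib.robotparser

    # 0xF0 on its own is not valid UTF-8; the old raw.decode("utf-8") in
    # read() raised UnicodeDecodeError on such input.
    raw = b'User-agent: *\nDisallow: /non-utf8/\xf0\n'
    parser = urllib.robotparser.RobotFileParser()
    # read() now decodes with errors="surrogateescape"; the same decoding is
    # reproduced here so parse() can be fed directly.
    parser.parse(raw.decode('utf-8', 'surrogateescape').splitlines())
    print(parser.can_fetch('mybot', '/non-utf8/%F0'))  # False: the rule still applies
    print(parser.can_fetch('mybot', '/non-utf8/'))     # True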