https://github.com/python/cpython/commit/509ea397fbfa9cf63c6e1e20f24ec5171cb897c2
commit: 509ea397fbfa9cf63c6e1e20f24ec5171cb897c2
branch: 3.13
author: Miss Islington (bot) <31488909+miss-isling...@users.noreply.github.com>
committer: serhiy-storchaka <storch...@gmail.com>
date: 2025-09-05T20:21:02Z
summary:

[3.13] gh-88375, gh-111788: Fix parsing errors and normalization in robotparser (GH-138502) (GH-138549)

* Don't fail trying to parse weird patterns.
* Don't fail trying to decode non-UTF-8 "robots.txt" files.
* No longer ignore trailing "?" in patterns and URLs.
* Distinguish raw special characters "?", "=" and "&" from the percent-encoded ones.
* Remove tests that do nothing.

(cherry picked from commit cb7ef18d70a0bc1363799e2dfa475db457155e43)

Co-authored-by: Serhiy Storchaka <storch...@gmail.com>
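In practice, the normalization bullets change can_fetch() results like this.
A minimal sketch against 3.13 with this fix applied (the agent name 'mybot'
is arbitrary; the rules mirror the updated DisallowQueryStringTest below):

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([
        'User-agent: *',
        'Disallow: /some/path?name=value',
        'Disallow: /another/path?',
    ])

    # A trailing "?" is no longer ignored: "/another/path" itself stays allowed.
    parser.can_fetch('mybot', '/another/path')   # True
    parser.can_fetch('mybot', '/another/path?')  # False

    # Raw "?" and "=" are now distinct from their percent-encoded forms.
    parser.can_fetch('mybot', '/some/path?name=value')    # False
    parser.can_fetch('mybot', '/some/path%3Fname=value')  # True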
files:
A Misc/NEWS.d/next/Library/2025-09-04-15-18-11.gh-issue-111788.tuTEM5.rst
A Misc/NEWS.d/next/Library/2025-09-05-15-35-59.gh-issue-88375.dC491a.rst
M Lib/test/test_robotparser.py
M Lib/urllib/robotparser.py

diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 8d89e2a8224452..cdcf0681b44cf2 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -16,6 +16,14 @@ class BaseRobotTest:
     bad = []
     site_maps = None

+    def __init_subclass__(cls):
+        super().__init_subclass__()
+        # Remove tests that do nothing.
+        if not cls.good:
+            cls.test_good_urls = None
+        if not cls.bad:
+            cls.test_bad_urls = None
+
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
         self.parser = urllib.robotparser.RobotFileParser()
@@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
 Disallow: /some/path?name=value
+Disallow: /another/path?
+Disallow: /yet/one/path?name=value&more
     """
-    good = ['/some/path']
-    bad = ['/some/path?name=value']
+    good = ['/some/path', '/some/path?',
+            '/some/path%3Fname=value', '/some/path?name%3Dvalue',
+            '/another/path', '/another/path%3F',
+            '/yet/one/path?name=value%26more']
+    bad = ['/some/path?name=value',
+           '/another/path?', '/another/path?name=value',
+           '/yet/one/path?name=value&more']


 class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
@@ -249,15 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     bad = ['/some/path']


-class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
-    # normalize the URL first (#17403)
+class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
-Allow: /some/path?
-Disallow: /another/path?
-    """
-    good = ['/some/path?']
-    bad = ['/another/path?']
+Disallow: /a1/Z-._~ # unreserved characters
+Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
+Disallow: /u1/%F0%9F%90%8D # percent-encoded non-ASCII Unicode character
+Disallow: /u2/%f0%9f%90%8d
+Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
+Disallow: /v1/%F0 # percent-encoded non-ASCII octet
+Disallow: /v2/%f0
+Disallow: /v3/\udcf0 # raw non-ASCII octet
+Disallow: /p1%xy # raw percent
+Disallow: /p2%
+Disallow: /p3%25xy # percent-encoded percent
+Disallow: /p4%2525xy # double percent-encoded percent
+Disallow: /john%20smith # space
+Disallow: /john doe
+Disallow: /trailingspace%20
+Disallow: /question%3Fq=v # not query
+Disallow: /hash%23f # not fragment
+Disallow: /dollar%24
+Disallow: /asterisk%2A
+Disallow: /sub/dir
+Disallow: /slash%2F
+Disallow: /query/question?q=%3F
+Disallow: /query/raw/question?q=?
+Disallow: /query/eq?q%3Dv
+Disallow: /query/amp?q=v%26a
+"""
+    good = [
+        '/u1/%F0', '/u1/%f0',
+        '/u2/%F0', '/u2/%f0',
+        '/u3/%F0', '/u3/%f0',
+        '/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
+        '/question?q=v',
+        '/dollar', '/asterisk',
+        '/query/eq?q=v',
+        '/query/amp?q=v&a',
+    ]
+    bad = [
+        '/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
+        '/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
+        '/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
+        '/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
+        '/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
+        '/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
+        '/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
+        '/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
+        '/p1%xy', '/p1%25xy',
+        '/p2%', '/p2%25', '/p2%2525', '/p2%xy',
+        '/p3%xy', '/p3%25xy',
+        '/p4%2525xy',
+        '/john%20smith', '/john smith',
+        '/john%20doe', '/john doe',
+        '/trailingspace%20', '/trailingspace ',
+        '/question%3Fq=v',
+        '/hash#f', '/hash%23f',
+        '/dollar$', '/dollar%24',
+        '/asterisk*', '/asterisk%2A',
+        '/sub/dir', '/sub%2Fdir',
+        '/slash%2F', '/slash/',
+        '/query/question?q=?', '/query/question?q=%3F',
+        '/query/raw/question?q=?', '/query/raw/question?q=%3F',
+        '/query/eq?q%3Dv',
+        '/query/amp?q=v%26a',
+    ]
+    # other reserved characters
+    for c in ":/[]@!$&'()*+,;=":
+        robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
+        bad.append(f'/raw{c}')
+        bad.append(f'/raw%{ord(c):02X}')
+        bad.append(f'/pc{c}')
+        bad.append(f'/pc%{ord(c):02X}')


 class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
@@ -299,26 +378,17 @@ def test_string_formatting(self):
         self.assertEqual(str(self.parser), self.expected_output)


-class RobotHandler(BaseHTTPRequestHandler):
-
-    def do_GET(self):
-        self.send_error(403, "Forbidden access")
-
-    def log_message(self, format, *args):
-        pass
-
-
 @unittest.skipUnless(
     support.has_socket_support,
     "Socket server requires working socket."
 )
-class PasswordProtectedSiteTestCase(unittest.TestCase):
+class BaseLocalNetworkTestCase:

     def setUp(self):
         # clear _opener global variable
         self.addCleanup(urllib.request.urlcleanup)

-        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
+        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)

         self.t = threading.Thread(
             name='HTTPServer serving',
@@ -335,6 +405,57 @@ def tearDown(self):
         self.t.join()
         self.server.server_close()

+
+SAMPLE_ROBOTS_TXT = b'''\
+User-agent: test_robotparser
+Disallow: /utf8/\xf0\x9f\x90\x8d
+Disallow: /non-utf8/\xf0
+Disallow: //[spam]/path
+'''
+
+
+class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(SAMPLE_ROBOTS_TXT)
+
+        def log_message(self, format, *args):
+            pass
+
+    @threading_helper.reap_threads
+    def testRead(self):
+        # Test that reading a weird robots.txt doesn't fail.
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + '/robots.txt'
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(robots_url)
+        parser.read()
+        # And it can even interpret the weird paths in some reasonable way.
+        agent = 'test_robotparser'
+        self.assertTrue(parser.can_fetch(agent, robots_url))
+        self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
+        self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
+
+
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_error(403, "Forbidden access")
+
+        def log_message(self, format, *args):
+            pass
+
     @threading_helper.reap_threads
     def testPasswordProtectedSite(self):
         addr = self.server.server_address

diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 409f2b2e48de6e..63689816f3018e 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -11,6 +11,7 @@
 """

 import collections
+import re
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -20,6 +21,19 @@
 RequestRate = collections.namedtuple("RequestRate", "requests seconds")


+def normalize(path):
+    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
+    return urllib.parse.quote(unquoted, errors='surrogateescape')
+
+def normalize_path(path):
+    path, sep, query = path.partition('?')
+    path = normalize(path)
+    if sep:
+        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
+        path += '?' + query
+    return path
+
+
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file.
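As a sketch of what these two helpers do: equivalent spellings of a path are
collapsed to one canonical form, while "?", "=" and "&" keep their raw vs.
percent-encoded distinction. normalize_path() is a module-level but
undocumented internal, so the direct import below is for illustration only:

    from urllib.robotparser import normalize_path

    # The path and each query token are unquoted and re-quoted,
    # so equivalent escapes converge on a single spelling.
    normalize_path('/u3/\U0001f40d')   # '/u3/%F0%9F%90%8D'
    normalize_path('/sub%2Fdir')       # '/sub/dir'

    # "?", "=" and "&" survive only where they were raw to begin with;
    # quote() re-encodes any that came out of a percent-escape.
    normalize_path('/another/path?')   # '/another/path?' (trailing "?" kept)
    normalize_path('/query/eq?q%3Dv')  # '/query/eq?q%3Dv', not '.../eq?q=v'
    normalize_path('/path%3Fq=v')      # '/path%3Fq%3Dv', not '/path?q=v'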
@@ -55,7 +69,7 @@ def modified(self):
     def set_url(self, url):
         """Sets the URL referring to a robots.txt file."""
         self.url = url
-        self.host, self.path = urllib.parse.urlparse(url)[1:3]
+        self.host, self.path = urllib.parse.urlsplit(url)[1:3]

     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
@@ -69,7 +83,7 @@ def read(self):
             err.close()
         else:
             raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+            self.parse(raw.decode("utf-8", "surrogateescape").splitlines())

     def _add_entry(self, entry):
         if "*" in entry.useragents:
@@ -113,7 +127,7 @@ def parse(self, lines):
             line = line.split(':', 1)
             if len(line) == 2:
                 line[0] = line[0].strip().lower()
-                line[1] = urllib.parse.unquote(line[1].strip())
+                line[1] = line[1].strip()
             if line[0] == "user-agent":
                 if state == 2:
                     self._add_entry(entry)
@@ -167,10 +181,9 @@ def can_fetch(self, useragent, url):
             return False
         # search for given user agent matches
         # the first match counts
-        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
-        url = urllib.parse.urlunparse(('','',parsed_url.path,
-            parsed_url.params,parsed_url.query, parsed_url.fragment))
-        url = urllib.parse.quote(url)
+        parsed_url = urllib.parse.urlsplit(url)
+        url = urllib.parse.urlunsplit(('', '', *parsed_url[2:]))
+        url = normalize_path(url)
         if not url:
             url = "/"
         for entry in self.entries:
@@ -213,7 +226,6 @@ def __str__(self):
             entries = entries + [self.default_entry]
         return '\n\n'.join(map(str, entries))

-
 class RuleLine:
     """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
     (allowance==False) followed by a path."""
@@ -221,8 +233,7 @@ def __init__(self, path, allowance):
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
-        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
-        self.path = urllib.parse.quote(path)
+        self.path = normalize_path(path)
         self.allowance = allowance

     def applies_to(self, filename):
@@ -268,7 +279,7 @@ def applies_to(self, useragent):
     def allowance(self, filename):
         """Preconditions:
         - our agent applies to this entry
-        - filename is URL decoded"""
+        - filename is URL encoded"""
         for line in self.rulelines:
             if line.applies_to(filename):
                 return line.allowance

diff --git a/Misc/NEWS.d/next/Library/2025-09-04-15-18-11.gh-issue-111788.tuTEM5.rst b/Misc/NEWS.d/next/Library/2025-09-04-15-18-11.gh-issue-111788.tuTEM5.rst
new file mode 100644
index 00000000000000..a86e71e052d9f1
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-04-15-18-11.gh-issue-111788.tuTEM5.rst
@@ -0,0 +1,3 @@
+Fix parsing errors in the :mod:`urllib.robotparser` module.
+Don't fail trying to parse weird paths.
+Don't fail trying to decode non-UTF-8 ``robots.txt`` files.

diff --git a/Misc/NEWS.d/next/Library/2025-09-05-15-35-59.gh-issue-88375.dC491a.rst b/Misc/NEWS.d/next/Library/2025-09-05-15-35-59.gh-issue-88375.dC491a.rst
new file mode 100644
index 00000000000000..1660f39ddb1d3c
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-05-15-35-59.gh-issue-88375.dC491a.rst
@@ -0,0 +1,4 @@
+Fix normalization of the ``robots.txt`` rules and URLs in the
+:mod:`urllib.robotparser` module. No longer ignore trailing ``?``.
+Distinguish raw special characters ``?``, ``=`` and ``&`` from the
+percent-encoded ones.
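The surrogateescape round-trips in read() and normalize() are what make the
non-UTF-8 case work end to end. A minimal sketch, feeding the bytes by hand
instead of over HTTP (the agent name 'anybot' is arbitrary; the Disallow rule
mirrors SAMPLE_ROBOTS_TXT above):

    import urllib.robotparser

    raw = b'User-agent: *\nDisallow: /non-utf8/\xf0\n'  # \xf0 is not valid UTF-8
    parser = urllib.robotparser.RobotFileParser()
    # read() now decodes like this instead of raising UnicodeDecodeError:
    parser.parse(raw.decode('utf-8', 'surrogateescape').splitlines())

    # The lone surrogate is re-encoded as %F0 during normalization,
    # so the rule still matches its percent-encoded equivalent.
    parser.can_fetch('anybot', '/non-utf8/%F0')  # False (disallowed)
    parser.can_fetch('anybot', '/non-utf8/')     # True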