https://github.com/python/cpython/commit/f0daba1652cbf2eb04feaf21f9c913023f286e7e
commit: f0daba1652cbf2eb04feaf21f9c913023f286e7e
branch: main
author: Hugo van Kemenade <[email protected]>
committer: hugovk <[email protected]>
date: 2026-05-07T23:39:08+03:00
summary:
gh-106693: Revert "Explicitly mark ob_sval as unsigned char to avoid UB
(#106826)" (#149514)
files:
D Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst
M Include/cpython/bytesobject.h
M Lib/test/test_robotparser.py
M Lib/urllib/robotparser.py
diff --git a/Include/cpython/bytesobject.h b/Include/cpython/bytesobject.h
index 550b5fcb952e68..85bc2b827df8fb 100644
--- a/Include/cpython/bytesobject.h
+++ b/Include/cpython/bytesobject.h
@@ -5,7 +5,7 @@
typedef struct {
PyObject_VAR_HEAD
Py_DEPRECATED(3.11) Py_hash_t ob_shash;
- unsigned char ob_sval[1];
+ char ob_sval[1];
/* Invariants:
* ob_sval contains space for 'ob_size+1' elements.
@@ -20,7 +20,7 @@ PyAPI_FUNC(int) _PyBytes_Resize(PyObject **, Py_ssize_t);
#define _PyBytes_CAST(op) \
(assert(PyBytes_Check(op)), _Py_CAST(PyBytesObject*, op))
-static inline unsigned char* PyBytes_AS_STRING(PyObject *op)
+static inline char* PyBytes_AS_STRING(PyObject *op)
{
return _PyBytes_CAST(op)->ob_sval;
}
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 65bfe815705e0a..3ea0ec66fbfbe9 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -646,23 +646,26 @@ def test_group_without_user_agent(self):
)
class BaseLocalNetworkTestCase:
- @classmethod
- def setUpClass(cls):
+ def setUp(self):
# clear _opener global variable
- cls.addClassCleanup(urllib.request.urlcleanup)
+ self.addCleanup(urllib.request.urlcleanup)
- cls.server = HTTPServer((socket_helper.HOST, 0), cls.RobotHandler)
- cls.addClassCleanup(cls.server.server_close)
+ self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
- t = threading.Thread(
+ self.t = threading.Thread(
name='HTTPServer serving',
- target=cls.server.serve_forever,
+ target=self.server.serve_forever,
# Short poll interval to make the test finish quickly.
# Time between requests is short enough that we won't wake
# up spuriously too many times.
kwargs={'poll_interval':0.01})
- cls.enterClassContext(threading_helper.start_threads([t]))
- cls.addClassCleanup(cls.server.shutdown)
+ self.t.daemon = True # In case this function raises.
+ self.t.start()
+
+ def tearDown(self):
+ self.server.shutdown()
+ self.t.join()
+ self.server.server_close()
SAMPLE_ROBOTS_TXT = b'''\
@@ -684,6 +687,7 @@ def do_GET(self):
def log_message(self, format, *args):
pass
+ @threading_helper.reap_threads
def testRead(self):
# Test that reading a weird robots.txt doesn't fail.
addr = self.server.server_address
@@ -705,21 +709,17 @@ def testRead(self):
self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
-class HttpErrorsTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase,
unittest.TestCase):
class RobotHandler(BaseHTTPRequestHandler):
def do_GET(self):
- self.send_error(self.server.return_code)
+ self.send_error(403, "Forbidden access")
def log_message(self, format, *args):
pass
- def setUp(self):
- # Make sure that a valid code is set in the test.
- self.server.return_code = None
-
+ @threading_helper.reap_threads
def testPasswordProtectedSite(self):
- self.server.return_code = 403
addr = self.server.server_address
url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
robots_url = url + "/robots.txt"
@@ -727,40 +727,6 @@ def testPasswordProtectedSite(self):
parser.set_url(url)
parser.read()
self.assertFalse(parser.can_fetch("*", robots_url))
- self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
-
- def testNotFound(self):
- self.server.return_code = 404
- addr = self.server.server_address
- url = f'http://{socket_helper.HOST}:{addr[1]}'
- robots_url = url + "/robots.txt"
- parser = urllib.robotparser.RobotFileParser()
- parser.set_url(url)
- parser.read()
- self.assertTrue(parser.can_fetch("*", robots_url))
- self.assertTrue(parser.can_fetch("*", url + '/path/file.html'))
-
- def testTeapot(self):
- self.server.return_code = 418
- addr = self.server.server_address
- url = f'http://{socket_helper.HOST}:{addr[1]}'
- robots_url = url + "/robots.txt"
- parser = urllib.robotparser.RobotFileParser()
- parser.set_url(url)
- parser.read()
- self.assertTrue(parser.can_fetch("*", robots_url))
- self.assertTrue(parser.can_fetch("*", url + '/pot-1?milk-type=Cream'))
-
- def testServiceUnavailable(self):
- self.server.return_code = 503
- addr = self.server.server_address
- url = f'http://{socket_helper.HOST}:{addr[1]}'
- robots_url = url + "/robots.txt"
- parser = urllib.robotparser.RobotFileParser()
- parser.set_url(url)
- parser.read()
- self.assertFalse(parser.can_fetch("*", robots_url))
- self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))
@support.requires_working_socket()
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 0c3e5d92890935..e70eae80036784 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -65,17 +65,9 @@ def read(self):
f = urllib.request.urlopen(self.url)
except urllib.error.HTTPError as err:
if err.code in (401, 403):
- # If access to robot.txt has the status Unauthorized/Forbidden,
- # then most likely this applies to the entire site.
self.disallow_all = True
- elif 400 <= err.code < 500:
- # RFC 9309, Section 2.3.1.3: the crawler MAY access any
- # resources on the server.
+ elif err.code >= 400 and err.code < 500:
self.allow_all = True
- elif 500 <= err.code < 600:
- # RFC 9309, Section 2.3.1.4: the crawler MUST assume
- # complete disallow.
- self.disallow_all = True
err.close()
else:
raw = f.read()
diff --git a/Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst b/Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst
deleted file mode 100644
index bd9fff0bc2e31b..00000000000000
--- a/Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Disallow all access in :mod:`urllib.robotparser` if the ``robots.txt`` file
-is unreachable due to server or network errors.
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]