https://github.com/python/cpython/commit/9ad57cf58b76d1abe4786fb2e4c77a02e100d047
commit: 9ad57cf58b76d1abe4786fb2e4c77a02e100d047
branch: main
author: Cody Maloney <cmalo...@users.noreply.github.com>
committer: hauntsaninja <12621235+hauntsani...@users.noreply.github.com>
date: 2024-11-02T20:37:21-07:00
summary:

gh-120754: Add a strace helper and test set of syscalls for open().read(), Take 
2 (#123413)

files:
A Lib/test/support/strace_helper.py
M Lib/test/test_fileio.py
M Lib/test/test_subprocess.py
M Misc/ACKS

diff --git a/Lib/test/support/strace_helper.py 
b/Lib/test/support/strace_helper.py
new file mode 100644
index 00000000000000..90d4b5bccb6fa3
--- /dev/null
+++ b/Lib/test/support/strace_helper.py
@@ -0,0 +1,170 @@
+import re
+import sys
+import textwrap
+import unittest
+from dataclasses import dataclass
+from functools import cache
+from test import support
+from test.support.script_helper import run_python_until_end
+
+_strace_binary = "/usr/bin/strace"
+_syscall_regex = re.compile(
+    r"(?P<syscall>[^(]*)\((?P<args>[^)]*)\)\s*[=]\s*(?P<returncode>.+)")
+_returncode_regex = re.compile(
+    br"\+\+\+ exited with (?P<returncode>\d+) \+\+\+")
+
+
+@dataclass
+class StraceEvent:
+    syscall: str
+    args: list[str]
+    returncode: str
+
+
+@dataclass
+class StraceResult:
+    strace_returncode: int
+    python_returncode: int
+
+    """The event messages generated by strace. This is very similar to the
+    stderr strace produces with returncode marker section removed."""
+    event_bytes: bytes
+    stdout: bytes
+    stderr: bytes
+
+    def events(self):
+        """Parse event_bytes data into system calls for easier processing.
+
+        This assumes the program under inspection doesn't print any non-utf8
+        strings which would mix into the strace output."""
+        decoded_events = self.event_bytes.decode('utf-8')
+        matches = [
+            _syscall_regex.match(event)
+            for event in decoded_events.splitlines()
+        ]
+        return [
+            StraceEvent(match["syscall"],
+                        [arg.strip() for arg in (match["args"].split(","))],
+                        match["returncode"]) for match in matches if match
+        ]
+
+    def sections(self):
+        """Find all "MARK <X>" writes and use them to make groups of events.
+
+        This is useful to avoid variable / overhead events, like those at
+        interpreter startup or when opening a file so a test can verify just
+        the small case under study."""
+        current_section = "__startup"
+        sections = {current_section: []}
+        for event in self.events():
+            if event.syscall == 'write' and len(
+                    event.args) > 2 and event.args[1].startswith("\"MARK "):
+                # Found a new section, don't include the write in the section
+                # but all events until next mark should be in that section
+                current_section = event.args[1].split(
+                    " ", 1)[1].removesuffix('\\n"')
+                if current_section not in sections:
+                    sections[current_section] = list()
+            else:
+                sections[current_section].append(event)
+
+        return sections
+
+
+@support.requires_subprocess()
+def strace_python(code, strace_flags, check=True):
+    """Run strace and return the trace.
+
+    Sets strace_returncode and python_returncode to `-1` on error."""
+    res = None
+
+    def _make_error(reason, details):
+        return StraceResult(
+            strace_returncode=-1,
+            python_returncode=-1,
+            event_bytes=f"error({reason},details={details}) = 
-1".encode('utf-8'),
+            stdout=res.out if res else b"",
+            stderr=res.err if res else b"")
+
+    # Run strace, and get out the raw text
+    try:
+        res, cmd_line = run_python_until_end(
+            "-c",
+            textwrap.dedent(code),
+            __run_using_command=[_strace_binary] + strace_flags)
+    except OSError as err:
+        return _make_error("Caught OSError", err)
+
+    if check and res.rc:
+        res.fail(cmd_line)
+
+    # Get out program returncode
+    stripped = res.err.strip()
+    output = stripped.rsplit(b"\n", 1)
+    if len(output) != 2:
+        return _make_error("Expected strace events and exit code line",
+                           stripped[-50:])
+
+    returncode_match = _returncode_regex.match(output[1])
+    if not returncode_match:
+        return _make_error("Expected to find returncode in last line.",
+                           output[1][:50])
+
+    python_returncode = int(returncode_match["returncode"])
+    if check and python_returncode:
+        res.fail(cmd_line)
+
+    return StraceResult(strace_returncode=res.rc,
+                        python_returncode=python_returncode,
+                        event_bytes=output[0],
+                        stdout=res.out,
+                        stderr=res.err)
+
+
+def get_events(code, strace_flags, prelude, cleanup):
+    # NOTE: The flush is currently required to prevent the prints from getting
+    # buffered and done all at once at exit
+    prelude = textwrap.dedent(prelude)
+    code = textwrap.dedent(code)
+    cleanup = textwrap.dedent(cleanup)
+    to_run = f"""
+print("MARK prelude", flush=True)
+{prelude}
+print("MARK code", flush=True)
+{code}
+print("MARK cleanup", flush=True)
+{cleanup}
+print("MARK __shutdown", flush=True)
+    """
+    trace = strace_python(to_run, strace_flags)
+    all_sections = trace.sections()
+    return all_sections['code']
+
+
+def get_syscalls(code, strace_flags, prelude="", cleanup=""):
+    """Get the syscalls which a given chunk of python code generates"""
+    events = get_events(code, strace_flags, prelude=prelude, cleanup=cleanup)
+    return [ev.syscall for ev in events]
+
+
+# Moderately expensive (spawns a subprocess), so share results when possible.
+@cache
+def _can_strace():
+    res = strace_python("import sys; sys.exit(0)", [], check=False)
+    assert res.events(), "Should have parsed multiple calls"
+
+    return res.strace_returncode == 0 and res.python_returncode == 0
+
+
+def requires_strace():
+    if sys.platform != "linux":
+        return unittest.skip("Linux only, requires strace.")
+
+    if support.check_sanitizer(address=True, memory=True):
+        return unittest.skip("LeakSanitizer does not work under ptrace 
(strace, gdb, etc)")
+
+    return unittest.skipUnless(_can_strace(), "Requires working strace")
+
+
+__all__ = ["get_events", "get_syscalls", "requires_strace", "strace_python",
+           "StraceEvent", "StraceResult"]
diff --git a/Lib/test/test_fileio.py b/Lib/test/test_fileio.py
index 0611d1749f41c1..d60aabcdf1ae22 100644
--- a/Lib/test/test_fileio.py
+++ b/Lib/test/test_fileio.py
@@ -11,7 +11,7 @@
 
 from test.support import (
     cpython_only, swap_attr, gc_collect, is_emscripten, is_wasi,
-    infinite_recursion,
+    infinite_recursion, strace_helper
 )
 from test.support.os_helper import (
     TESTFN, TESTFN_ASCII, TESTFN_UNICODE, make_bad_fd,
@@ -24,6 +24,9 @@
 import _pyio # Python implementation of io
 
 
+_strace_flags=["--trace=%file,%desc"]
+
+
 class AutoFileTests:
     # file tests for which a test file is automatically set up
 
@@ -359,6 +362,139 @@ def testErrnoOnClosedReadinto(self, f):
         a = array('b', b'x'*10)
         f.readinto(a)
 
+    @strace_helper.requires_strace()
+    def test_syscalls_read(self):
+        """Check that the set of system calls produced by the I/O stack is what
+        is expected for various read cases.
+
+        It's expected as bits of the I/O implementation change, this will need
+        to change. The goal is to catch changes that unintentionally add
+        additional systemcalls (ex. additional calls have been looked at in
+        bpo-21679 and gh-120754).
+        """
+        self.f.write(b"Hello, World!")
+        self.f.close()
+
+
+        def check_readall(name, code, prelude="", cleanup="",
+                          extra_checks=None):
+            with self.subTest(name=name):
+                syscalls = strace_helper.get_events(code, _strace_flags,
+                                                      prelude=prelude,
+                                                      cleanup=cleanup)
+
+                # The first call should be an open that returns a
+                # file descriptor (fd). Afer that calls may vary. Once the file
+                # is opened, check calls refer to it by fd as the filename
+                # could be removed from the filesystem, renamed, etc. See:
+                # Time-of-check time-of-use (TOCTOU) software bug class.
+                #
+                # There are a number of related but distinct open system calls
+                # so not checking precise name here.
+                self.assertGreater(
+                    len(syscalls),
+                    1,
+                    f"Should have had at least an open call|calls={syscalls}")
+                fd_str = syscalls[0].returncode
+
+                # All other calls should contain the fd in their argument set.
+                for ev in syscalls[1:]:
+                    self.assertIn(
+                        fd_str,
+                        ev.args,
+                        f"Looking for file descriptor in arguments|ev={ev}"
+                    )
+
+                # There are a number of related syscalls used to implement
+                # behaviors in a libc (ex. fstat, newfstatat, statx, open, 
openat).
+                # Allow any that use the same substring.
+                def count_similarname(name):
+                    return len([ev for ev in syscalls if name in ev.syscall])
+
+                checks = [
+                    # Should open and close the file exactly once
+                    ("open", 1),
+                    ("close", 1),
+                    # There should no longer be an isatty call (All files being
+                    # tested are block devices / not character devices).
+                    ('ioctl', 0),
+                    # Should only have one fstat (bpo-21679, gh-120754)
+                    # note: It's important this uses a fd rather than filename,
+                    # That is validated by the `fd` check above.
+                    # note: fstat, newfstatat, and statx have all been observed
+                    # here in the underlying C library implementations.
+                    ("stat", 1)
+                ]
+
+                if extra_checks:
+                    checks += extra_checks
+
+                for call, count in checks:
+                    self.assertEqual(
+                        count_similarname(call),
+                        count,
+                        msg=f"call={call}|count={count}|syscalls={syscalls}"
+                    )
+
+        # "open, read, close" file using different common patterns.
+        check_readall(
+            "open builtin with default options",
+            f"""
+            f = open('{TESTFN}')
+            f.read()
+            f.close()
+            """
+        )
+
+        check_readall(
+            "open in binary mode",
+            f"""
+            f = open('{TESTFN}', 'rb')
+            f.read()
+            f.close()
+            """
+        )
+
+        check_readall(
+            "open in text mode",
+            f"""
+            f = open('{TESTFN}', 'rt')
+            f.read()
+            f.close()
+            """,
+            # GH-122111: read_text uses BufferedIO which requires looking up
+            # position in file. `read_bytes` disables that buffering and avoids
+            # these calls which is tested the `pathlib read_bytes` case.
+            extra_checks=[("seek", 1)]
+        )
+
+        check_readall(
+            "pathlib read_bytes",
+            "p.read_bytes()",
+            prelude=f"""from pathlib import Path; p = Path("{TESTFN}")""",
+            # GH-122111: Buffering is disabled so these calls are avoided.
+            extra_checks=[("seek", 0)]
+        )
+
+        check_readall(
+            "pathlib read_text",
+            "p.read_text()",
+            prelude=f"""from pathlib import Path; p = Path("{TESTFN}")"""
+        )
+
+        # Focus on just `read()`.
+        calls = strace_helper.get_syscalls(
+            prelude=f"f = open('{TESTFN}')",
+            code="f.read()",
+            cleanup="f.close()",
+            strace_flags=_strace_flags
+        )
+        # One to read all the bytes
+        # One to read the EOF and get a size 0 return.
+        self.assertEqual(calls.count("read"), 2)
+
+
+
 class CAutoFileTests(AutoFileTests, unittest.TestCase):
     FileIO = _io.FileIO
     modulename = '_io'
diff --git a/Lib/test/test_subprocess.py b/Lib/test/test_subprocess.py
index f065b9c9bb1c2c..e45701dfe033a6 100644
--- a/Lib/test/test_subprocess.py
+++ b/Lib/test/test_subprocess.py
@@ -4,6 +4,7 @@
 from test.support import check_sanitizer
 from test.support import import_helper
 from test.support import os_helper
+from test.support import strace_helper
 from test.support import warnings_helper
 from test.support.script_helper import assert_python_ok
 import subprocess
@@ -3415,7 +3416,7 @@ def __del__(self):
 
     @unittest.skipIf(not sysconfig.get_config_var("HAVE_VFORK"),
                      "vfork() not enabled by configure.")
-    @unittest.skipIf(sys.platform != "linux", "Linux only, requires strace.")
+    @strace_helper.requires_strace()
     @mock.patch("subprocess._USE_POSIX_SPAWN", new=False)
     def test_vfork_used_when_expected(self):
         # This is a performance regression test to ensure we default to using
@@ -3423,36 +3424,25 @@ def test_vfork_used_when_expected(self):
         # Technically this test could pass when posix_spawn is used as well
         # because libc tends to implement that internally using vfork. But
         # that'd just be testing a libc+kernel implementation detail.
-        strace_binary = "/usr/bin/strace"
-        # The only system calls we are interested in.
-        strace_filter = 
"--trace=clone,clone2,clone3,fork,vfork,exit,exit_group"
-        true_binary = "/bin/true"
-        strace_command = [strace_binary, strace_filter]
 
-        try:
-            does_strace_work_process = subprocess.run(
-                    strace_command + [true_binary],
-                    stderr=subprocess.PIPE,
-                    stdout=subprocess.DEVNULL,
-            )
-            rc = does_strace_work_process.returncode
-            stderr = does_strace_work_process.stderr
-        except OSError:
-            rc = -1
-            stderr = ""
-        if rc or (b"+++ exited with 0 +++" not in stderr):
-            self.skipTest("strace not found or not working as expected.")
+        # Are intersted in the system calls:
+        # clone,clone2,clone3,fork,vfork,exit,exit_group
+        # Unfortunately using `--trace` with that list to strace fails because
+        # not all are supported on all platforms (ex. clone2 is ia64 only...)
+        # So instead use `%process` which is recommended by strace, and 
contains
+        # the above.
+        true_binary = "/bin/true"
+        strace_args = ["--trace=%process"]
 
         with self.subTest(name="default_is_vfork"):
-            vfork_result = assert_python_ok(
-                    "-c",
-                    textwrap.dedent(f"""\
-                    import subprocess
-                    subprocess.check_call([{true_binary!r}])"""),
-                    __run_using_command=strace_command,
+            vfork_result = strace_helper.strace_python(
+                f"""\
+                import subprocess
+                subprocess.check_call([{true_binary!r}])""",
+                strace_args
             )
             # Match both vfork() and clone(..., flags=...|CLONE_VFORK|...)
-            self.assertRegex(vfork_result.err, br"(?i)vfork")
+            self.assertRegex(vfork_result.event_bytes, br"(?i)vfork")
             # Do NOT check that fork() or other clones did not happen.
             # If the OS denys the vfork it'll fallback to plain fork().
 
@@ -3465,9 +3455,8 @@ def test_vfork_used_when_expected(self):
                 ("setgroups", "", "extra_groups=[]", True),
         ):
             with self.subTest(name=sub_name):
-                non_vfork_result = assert_python_ok(
-                    "-c",
-                    textwrap.dedent(f"""\
+                non_vfork_result = strace_helper.strace_python(
+                    f"""\
                     import subprocess
                     {preamble}
                     try:
@@ -3475,11 +3464,11 @@ def test_vfork_used_when_expected(self):
                                 [{true_binary!r}], **dict({sp_kwarg}))
                     except PermissionError:
                         if not {expect_permission_error}:
-                            raise"""),
-                    __run_using_command=strace_command,
+                            raise""",
+                    strace_args
                 )
                 # Ensure neither vfork() or clone(..., 
flags=...|CLONE_VFORK|...).
-                self.assertNotRegex(non_vfork_result.err, br"(?i)vfork")
+                self.assertNotRegex(non_vfork_result.event_bytes, 
br"(?i)vfork")
 
 
 @unittest.skipUnless(mswindows, "Windows specific tests")
diff --git a/Misc/ACKS b/Misc/ACKS
index a1769d9601a2ea..5e36eda554af0f 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -1164,6 +1164,7 @@ Grzegorz Makarewicz
 David Malcolm
 Greg Malcolm
 William Mallard
+Cody Maloney
 Ken Manheimer
 Vladimir Marangozov
 Colin Marc

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: arch...@mail-archive.com

Reply via email to