[PATCH] selftests/sched: add proxy execution mutex tests

soolaugust Sun, 08 Mar 2026 23:47:31 -0700

From: zhidao su <[email protected]>

Add basic selftests for the Proxy Execution (PE) feature
(CONFIG_SCHED_PROXY_EXEC). Three test cases exercise the single-CPU
PE path which is present in the current upstream kernel independently
of the donor migration series (v24).


TC-1: Single-level mutex blocking
  A SCHED_FIFO prio=80 thread blocks on a mutex held by a SCHED_OTHER
  thread doing CPU-intensive work. Verifies that the holder accumulates
  significant CPU time (>= 50ms out of 200ms hold period), confirming
  PE is running the holder as proxy for the high-priority waiter.

TC-2: blocked_on lifetime - no voluntary context switches
  While a high-priority thread is PE-blocked on a mutex, its
  voluntary_ctxt_switches count must not increase. PE keeps the donor
  on the runqueue rather than doing a voluntary sleep, so no voluntary
  switch should occur during the block period.

TC-3: Two-level mutex chain traversal
  A (prio=80) -> mutex1 -> B (prio=50) -> mutex2 -> C (SCHED_OTHER).
  Verifies PE traverses the full chain and runs C as proxy, confirmed
  by C accumulating >= 50ms CPU time while A and B are both blocked.

The test skips gracefully when:
  - CONFIG_SCHED_PROXY_EXEC is not compiled in
  - sched_proxy_exec=0 is set on the kernel command line
  - not running as root (SCHED_FIFO requires CAP_SYS_NICE)

These tests cover the single-CPU PE base functionality and are
orthogonal to the cross-CPU donor migration work (v24). They should
remain valid after donor migration lands, as the single-CPU path
is preserved.

Tested on Linux 7.0-rc2 with CONFIG_SCHED_PROXY_EXEC=y via virtme-ng
on Intel Core i7-10700 @ 2.90GHz:
  sched_proxy_exec=1: TC-1 PASS, TC-2 PASS, TC-3 PASS
  sched_proxy_exec=0: all SKIP

Signed-off-by: zhidao su <[email protected]>
---
 tools/testing/selftests/sched/Makefile        |   6 +-
 tools/testing/selftests/sched/pe_mutex_test.c | 508 ++++++++++++++++++
 2 files changed, 511 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/sched/pe_mutex_test.c

diff --git a/tools/testing/selftests/sched/Makefile 
b/tools/testing/selftests/sched/Makefile
index 099ee921355..5ecfa45a103 100644
--- a/tools/testing/selftests/sched/Makefile
+++ b/tools/testing/selftests/sched/Makefile
@@ -6,9 +6,9 @@ endif
 
 CFLAGS += -O2 -Wall -g -I./ $(KHDR_INCLUDES) -Wl,-rpath=./ \
          $(CLANG_FLAGS)
-LDLIBS += -lpthread
+LDLIBS += -lpthread -lrt
 
-TEST_GEN_FILES := cs_prctl_test
-TEST_PROGS := cs_prctl_test
+TEST_GEN_FILES := cs_prctl_test pe_mutex_test
+TEST_PROGS := cs_prctl_test pe_mutex_test
 
 include ../lib.mk
diff --git a/tools/testing/selftests/sched/pe_mutex_test.c 
b/tools/testing/selftests/sched/pe_mutex_test.c
new file mode 100644
index 00000000000..b3ff4852ddc
--- /dev/null
+++ b/tools/testing/selftests/sched/pe_mutex_test.c
@@ -0,0 +1,508 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Proxy Execution (PE) mutex selftest - TC-1 through TC-3
+ *
+ * Verifies basic PE behavior for mutex blocking:
+ *   TC-1: High-priority blocked task's CPU time increases via PE
+ *   TC-2: blocked_on lifetime - voluntary ctxt switches don't increase
+ *   TC-3: Two-level mutex chain traversal
+ *
+ * Requires CONFIG_SCHED_PROXY_EXEC=y and root privileges.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdatomic.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+
+#include "../kselftest.h"
+
+/* ---------- helpers ---------- */
+
+static pid_t gettid_compat(void)
+{
+       return (pid_t)syscall(SYS_gettid);
+}
+
+/*
+ * is_proxy_exec_enabled - check whether PE is active at runtime.
+ *
+ * PE has no sysctl; it is controlled by the "sched_proxy_exec" boot
+ * parameter.  DEFINE_STATIC_KEY_TRUE means it defaults ON unless
+ * "sched_proxy_exec=0" appears on the kernel command line.
+ */
+static bool is_proxy_exec_enabled(void)
+{
+       char line[4096];
+       FILE *f;
+
+       f = fopen("/proc/cmdline", "r");
+       if (!f)
+               return true; /* assume enabled if we cannot read cmdline */
+
+       if (!fgets(line, sizeof(line), f)) {
+               fclose(f);
+               return true;
+       }
+       fclose(f);
+
+       return !strstr(line, "sched_proxy_exec=0");
+}
+
+/* Return monotonic time in nanoseconds. */
+static long long now_ns(void)
+{
+       struct timespec ts;
+
+       clock_gettime(CLOCK_MONOTONIC, &ts);
+       return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+/* Return CPU time consumed by the calling thread in nanoseconds. */
+static long long cputime_ns(void)
+{
+       struct timespec ts;
+
+       clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+       return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+/*
+ * get_voluntary_ctxt_switches - read voluntary_ctxt_switches for @tid.
+ *
+ * Threads (tid != tgid) are only visible under
+ * /proc/<tgid>/task/<tid>/status, not /proc/<tid>/status directly.
+ * Try the task path first, fall back to the top-level pid path.
+ */
+static long get_voluntary_ctxt_switches(pid_t tid)
+{
+       char path[128];
+       char line[256];
+       FILE *f;
+       long val = -1;
+
+       /* Try /proc/<tgid>/task/<tid>/status (works for all threads) */
+       snprintf(path, sizeof(path), "/proc/%d/task/%d/status",
+                (int)getpid(), (int)tid);
+       f = fopen(path, "r");
+       if (!f) {
+               /* Fallback: /proc/<tid>/status (works only for tgid == tid) */
+               snprintf(path, sizeof(path), "/proc/%d/status", (int)tid);
+               f = fopen(path, "r");
+       }
+       if (!f)
+               return -1;
+
+       while (fgets(line, sizeof(line), f)) {
+               if (strncmp(line, "voluntary_ctxt_switches:", 24) == 0) {
+                       val = strtol(line + 24, NULL, 10);
+                       break;
+               }
+       }
+       fclose(f);
+       return val;
+}
+
+/* Set SCHED_FIFO priority for the calling thread. */
+static int set_fifo(int prio)
+{
+       struct sched_param sp = { .sched_priority = prio };
+
+       return sched_setscheduler(0, SCHED_FIFO, &sp);
+}
+
+/* Set SCHED_OTHER (normal) for the calling thread. */
+static int set_normal(void)
+{
+       struct sched_param sp = { .sched_priority = 0 };
+
+       return sched_setscheduler(0, SCHED_OTHER, &sp);
+}
+
+/* ---------- TC-1 ----------------------------------------------------------
+ *
+ * Single-level PE: high-priority waiter gets CPU via PE.
+ *
+ * Setup:
+ *   - LOW thread (SCHED_OTHER): holds mutex, burns CPU for ~200 ms,
+ *     then releases.
+ *   - HIGH thread (SCHED_FIFO prio=80): waits for mutex immediately.
+ *
+ * On a PE kernel the scheduler runs LOW as proxy for HIGH, so LOW
+ * should accumulate significant CPU time (measured via
+ * CLOCK_PROCESS_CPUTIME_ID inside the holder thread itself).
+ *
+ * Verification: CPU time consumed by the LOW thread during the hold
+ * period is >= 50 ms.  CLOCK_THREAD_CPUTIME_ID is used so that only
+ * LOW's own CPU consumption is measured, not that of other threads.
+ */
+
+#define TC1_HOLD_MS 200 /* ms LOW holds the mutex */
+#define TC1_CPU_THRESHOLD_MS 50 /* minimum CPU ms we expect */
+
+struct tc1_args {
+       pthread_mutex_t *mtx;
+       long long cpu_during_hold_ns; /* output: CPU ns consumed by LOW */
+       atomic_int ready;
+       atomic_int done;
+};
+
+static void *tc1_low_thread(void *arg)
+{
+       struct tc1_args *a = arg;
+       long long t0, t1, deadline;
+
+       /* Become the LOW thread */
+       set_normal();
+
+       pthread_mutex_lock(a->mtx);
+       a->ready = 1;
+
+       /* Spin for TC1_HOLD_MS real-time milliseconds while holding lock */
+       deadline = now_ns() + (long long)TC1_HOLD_MS * 1000000LL;
+       t0 = cputime_ns();
+       while (now_ns() < deadline)
+               ; /* busy wait */
+       t1 = cputime_ns();
+
+       a->cpu_during_hold_ns = t1 - t0;
+       pthread_mutex_unlock(a->mtx);
+       a->done = 1;
+       return NULL;
+}
+
+static void *tc1_high_thread(void *arg)
+{
+       struct tc1_args *a = arg;
+
+       /* Become HIGH priority */
+       set_fifo(80);
+
+       /* Wait until LOW has the lock */
+       while (!a->ready)
+               sched_yield();
+
+       /* Block on mutex - PE should now proxy-run LOW */
+       pthread_mutex_lock(a->mtx);
+       pthread_mutex_unlock(a->mtx);
+       return NULL;
+}
+
+static void run_tc1(void)
+{
+       pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+       struct tc1_args args = { .mtx = &mtx, .ready = 0, .done = 0 };
+       pthread_t low, high;
+       long long threshold_ns = (long long)TC1_CPU_THRESHOLD_MS * 1000000LL;
+
+       pthread_create(&low, NULL, tc1_low_thread, &args);
+
+       /* Wait for LOW to acquire the lock before creating HIGH */
+       while (!args.ready)
+               sched_yield();
+
+       pthread_create(&high, NULL, tc1_high_thread, &args);
+
+       pthread_join(high, NULL);
+       pthread_join(low, NULL);
+
+       pthread_mutex_destroy(&mtx);
+
+       if (args.cpu_during_hold_ns >= threshold_ns) {
+               ksft_test_result_pass(
+                       "TC-1: PE ran LOW as proxy (cpu_hold=%lld ms >= %d 
ms)\n",
+                       args.cpu_during_hold_ns / 1000000,
+                       TC1_CPU_THRESHOLD_MS);
+       } else {
+               ksft_test_result_fail(
+                       "TC-1: LOW did not get enough CPU time (cpu_hold=%lld 
ms < %d ms)\n",
+                       args.cpu_during_hold_ns / 1000000,
+                       TC1_CPU_THRESHOLD_MS);
+       }
+}
+
+/* ---------- TC-2 ----------------------------------------------------------
+ *
+ * blocked_on lifetime: voluntary context switches must NOT increase
+ * for the high-priority waiter while it is proxy-blocked.
+ *
+ * When PE is active the high-priority task stays on the runqueue
+ * (as donor) and is never voluntarily context-switched out.
+ *
+ * Verification:
+ *   Record voluntary_ctxt_switches for HIGH before and after the
+ *   blocking period; they should be equal.
+ */
+
+#define TC2_HOLD_MS 150
+
+struct tc2_args {
+       pthread_mutex_t *mtx;
+       pid_t high_tid;
+       atomic_int low_has_lock;  /* LOW signals it holds the mutex */
+       atomic_int high_blocking; /* HIGH signals it is about to block */
+       long ctxt_after;          /* HIGH records its own switches after 
unblock */
+};
+
+static void *tc2_low_thread(void *arg)
+{
+       struct tc2_args *a = arg;
+       long long deadline;
+
+       set_normal();
+       pthread_mutex_lock(a->mtx);
+       a->low_has_lock = 1;
+
+       deadline = now_ns() + (long long)TC2_HOLD_MS * 1000000LL;
+       while (now_ns() < deadline)
+               ; /* busy spin holding the lock */
+
+       pthread_mutex_unlock(a->mtx);
+       return NULL;
+}
+
+static void *tc2_high_thread(void *arg)
+{
+       struct tc2_args *a = arg;
+
+       set_fifo(80);
+       a->high_tid = gettid_compat();
+
+       /* Wait until LOW holds the lock */
+       while (!a->low_has_lock)
+               sched_yield();
+
+       /* Signal main that we are about to block, then immediately block */
+       a->high_blocking = 1;
+       pthread_mutex_lock(a->mtx);
+       pthread_mutex_unlock(a->mtx);
+       /* Record our own ctxt switches before exiting (proc entry still live) 
*/
+       a->ctxt_after = get_voluntary_ctxt_switches(gettid_compat());
+       return NULL;
+}
+
+static void run_tc2(void)
+{
+       pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+       struct tc2_args args = {
+               .mtx = &mtx,
+               .high_tid = 0,
+               .low_has_lock = 0,
+               .high_blocking = 0,
+               .ctxt_after = -1,
+       };
+       pthread_t low, high;
+       long before, after = -1;
+
+       /* Start LOW first so it grabs the lock */
+       pthread_create(&low, NULL, tc2_low_thread, &args);
+
+       while (!args.low_has_lock)
+               sched_yield();
+
+       pthread_create(&high, NULL, tc2_high_thread, &args);
+
+       /*
+        * Wait until HIGH has set high_tid AND signaled it is about to block.
+        * There is a tiny window between high_blocking=1 and the actual
+        * pthread_mutex_lock() call, but that is unavoidable in userspace.
+        * Sample "before" here; HIGH cannot have voluntarily yielded yet
+        * because it has not blocked yet.
+        */
+       while (!args.high_tid || !args.high_blocking)
+               sched_yield();
+
+       /* Sample voluntary switches while HIGH is (about to be) blocked */
+       before = get_voluntary_ctxt_switches(args.high_tid);
+
+       pthread_join(high, NULL);
+       pthread_join(low, NULL);
+
+       after = args.ctxt_after;
+
+       pthread_mutex_destroy(&mtx);
+
+       if (before < 0 || after < 0) {
+               ksft_test_result_skip(
+                       "TC-2: Could not read /proc task status\n");
+               return;
+       }
+
+       if (after == before) {
+               ksft_test_result_pass(
+                       "TC-2: HIGH voluntary_ctxt_switches unchanged (%ld) 
during PE block\n",
+                       before);
+       } else {
+               ksft_test_result_fail(
+                       "TC-2: HIGH voluntary_ctxt_switches changed: before=%ld 
after=%ld\n",
+                       before, after);
+       }
+}
+
+/* ---------- TC-3 ----------------------------------------------------------
+ *
+ * Two-level mutex chain:
+ *   A (SCHED_FIFO prio=80) -> blocked on mutex1 -> held by
+ *   B (SCHED_FIFO prio=50) -> blocked on mutex2 -> held by
+ *   C (SCHED_OTHER)                                  ^^ PE must traverse
+ *                                                      the chain and run C
+ *
+ * Verification: C's CPU time during the hold period is >= 50 ms,
+ * meaning PE reached the end of the chain and ran C as proxy.
+ */
+
+#define TC3_HOLD_MS 200
+#define TC3_CPU_THRESHOLD_MS 50
+
+struct tc3_args {
+       pthread_mutex_t *mtx1; /* A waits on this; B holds */
+       pthread_mutex_t *mtx2; /* B waits on this; C holds */
+
+       atomic_int b_has_mtx1; /* B has acquired mtx1 */
+       atomic_int c_has_mtx2; /* C has acquired mtx2 */
+
+       long long c_cpu_during_hold_ns;
+};
+
+static void *tc3_c_thread(void *arg)
+{
+       struct tc3_args *a = arg;
+       long long t0, t1, deadline;
+
+       set_normal();
+       pthread_mutex_lock(a->mtx2);
+       a->c_has_mtx2 = 1;
+
+       /* Spin holding mtx2 */
+       deadline = now_ns() + (long long)TC3_HOLD_MS * 1000000LL;
+       t0 = cputime_ns();
+       while (now_ns() < deadline)
+               ;
+       t1 = cputime_ns();
+
+       a->c_cpu_during_hold_ns = t1 - t0;
+       pthread_mutex_unlock(a->mtx2);
+       return NULL;
+}
+
+static void *tc3_b_thread(void *arg)
+{
+       struct tc3_args *a = arg;
+
+       set_fifo(50);
+
+       /* Acquire mtx1 first, so A will block on it */
+       pthread_mutex_lock(a->mtx1);
+       a->b_has_mtx1 = 1;
+
+       /* Wait until C holds mtx2 before blocking on it */
+       while (!a->c_has_mtx2)
+               sched_yield();
+
+       /* Now block on mtx2 - chain: A->mtx1->B->mtx2->C */
+       pthread_mutex_lock(a->mtx2);
+       pthread_mutex_unlock(a->mtx2);
+
+       pthread_mutex_unlock(a->mtx1);
+       return NULL;
+}
+
+static void *tc3_a_thread(void *arg)
+{
+       struct tc3_args *a = arg;
+
+       set_fifo(80);
+
+       /* Wait until the full chain is established */
+       while (!a->b_has_mtx1 || !a->c_has_mtx2)
+               sched_yield();
+
+       pthread_mutex_lock(a->mtx1);
+       pthread_mutex_unlock(a->mtx1);
+       return NULL;
+}
+
+static void run_tc3(void)
+{
+       pthread_mutex_t mtx1 = PTHREAD_MUTEX_INITIALIZER;
+       pthread_mutex_t mtx2 = PTHREAD_MUTEX_INITIALIZER;
+       struct tc3_args args = {
+               .mtx1 = &mtx1,
+               .mtx2 = &mtx2,
+               .b_has_mtx1 = 0,
+               .c_has_mtx2 = 0,
+       };
+       pthread_t ta, tb, tc;
+       long long threshold_ns = (long long)TC3_CPU_THRESHOLD_MS * 1000000LL;
+
+       /* Start C first so it grabs mtx2 */
+       pthread_create(&tc, NULL, tc3_c_thread, &args);
+
+       /* Wait for C to hold mtx2 */
+       while (!args.c_has_mtx2)
+               sched_yield();
+
+       /* Start B - it will grab mtx1 then block on mtx2 */
+       pthread_create(&tb, NULL, tc3_b_thread, &args);
+
+       /* Wait for B to hold mtx1 */
+       while (!args.b_has_mtx1)
+               sched_yield();
+
+       /* Start A - highest priority, blocks on mtx1 */
+       pthread_create(&ta, NULL, tc3_a_thread, &args);
+
+       pthread_join(ta, NULL);
+       pthread_join(tb, NULL);
+       pthread_join(tc, NULL);
+
+       pthread_mutex_destroy(&mtx1);
+       pthread_mutex_destroy(&mtx2);
+
+       if (args.c_cpu_during_hold_ns >= threshold_ns) {
+               ksft_test_result_pass(
+                       "TC-3: PE traversed 2-level chain, C got cpu_hold=%lld 
ms >= %d ms\n",
+                       args.c_cpu_during_hold_ns / 1000000,
+                       TC3_CPU_THRESHOLD_MS);
+       } else {
+               ksft_test_result_fail(
+                       "TC-3: C did not get enough CPU (chain traversal 
failed?): %lld ms < %d ms\n",
+                       args.c_cpu_during_hold_ns / 1000000,
+                       TC3_CPU_THRESHOLD_MS);
+       }
+}
+
+/* ---------- main ---------------------------------------------------------- 
*/
+
+int main(void)
+{
+       ksft_print_header();
+
+#ifndef CONFIG_SCHED_PROXY_EXEC
+       ksft_exit_skip("CONFIG_SCHED_PROXY_EXEC not enabled\n");
+#endif
+
+       if (getuid() != 0)
+               ksft_exit_skip("requires root (needed for SCHED_FIFO)\n");
+
+       if (!is_proxy_exec_enabled())
+               ksft_exit_skip("sched_proxy_exec=0 on kernel cmdline, PE 
disabled\n");
+
+       ksft_set_plan(3);
+
+       run_tc1();
+       run_tc2();
+       run_tc3();
+
+       ksft_finished();
+}
-- 
2.43.0

[PATCH] selftests/sched: add proxy execution mutex tests

Reply via email to